package net.sf.vietpad;

import java.io.*;
import org.unicode.Normalizer;

/**
 *  Conversion of text and HTML files in various Vietnamese encodings to Unicode
 *  Strings.
 *
 *@author     Quan Nguyen
 *@author     Gero Herrmann
 *@created    28 October 2003
 *@version    1.7, 14 April 2005
 *@see        http://unicodeconvert.sourceforge.net
 */
public class UnicodeConversion
{
    // Encoding of the text handed to convert(); selects the conversion routine in convertText()
    private VietEncodings sourceEncoding;
    // Working text: set by convert(), rewritten in place by the conversion routines, then returned
    private String str;
    // True when the text being converted is an HTML document
    private boolean html;

    // Fonts for HTML font tags
    private final static String SERIF = "Times New Roman";
    private final static String SANS_SERIF = "Arial";


    /**
     *  Creates a converter for the source encoding identified by name.
     *
     *@param  sourceEncoding  Name of the source encoding, resolved through
     *                        VietEncodings.valueOf(String). Names historically listed here:
     *                        "VISCII", "VPS", "VNI", "VIQR/Vietnet", "TCVN3 (ABC)", "Unicode",
     *                        "Unicode Composite", "UTF-8", "ISC", and "NCR".
     *                        NOTE(review): confirm which spellings valueOf actually accepts.
     */
    public UnicodeConversion(String sourceEncoding) {
        this.sourceEncoding = VietEncodings.valueOf(sourceEncoding);
    }
    /**
     *  Creates a converter for the given source encoding.
     *
     *@param  sourceEncoding  Source encoding constant. The constants handled by this class are
     *                        VietEncodings.VISCII, VPS, VNI, VIQR, Unicode, ISC, TCVN3, NCR,
     *                        UTF8 and Unicode_Composite; any other value makes convert() fail
     *                        with a RuntimeException.
     */
    public UnicodeConversion(VietEncodings sourceEncoding) {
        this.sourceEncoding = sourceEncoding;
    }
    
    /**
     *  Runs the conversion that belongs to the source encoding of this converter
     *  on the given text.
     *
     *@param  source    Text to be converted
     *@param  html      True if the text is an HTML document
     *@return           The converted text
     */
    public String convert(String source, boolean html) {
        // Stash the inputs in the instance; the conversion routines work on these fields
        this.html = html;
        this.str = source;

        convertText();

        final String converted = this.str;
        return converted;
    }
    
    /**
     *  Dispatches to the conversion routine that belongs to the source encoding.
     *  The routine reads the working text from <code>str</code> and stores its result there.
     *  <p>
     *  For the Unicode source encoding the text is first normalized to form C (only when it
     *  contains a combining diacritical mark) and then handed to VIQRtoUnicode("Unicode"),
     *  which performs the Unicode-to-VIQR direction.
     *
     *@throws RuntimeException  if the source encoding is none of the supported constants
     */
    private void convertText() {
        if (sourceEncoding == VietEncodings.VISCII) {
            VISCIItoUnicode();
        } else if (sourceEncoding == VietEncodings.VPS) {
            VPStoUnicode();
        } else if (sourceEncoding == VietEncodings.VNI) {
            VNItoUnicode();
        } else if (sourceEncoding == VietEncodings.VIQR) {
            VIQRtoUnicode("VIQR");
        } else if (sourceEncoding == VietEncodings.Unicode) {
            // If there exists any (stand-alone) combining diacritical mark,
            // i.e., compound Unicode, perform the C normalization. Skip, otherwise.
            // The (?s) flag lets '.' match line terminators as well: String.matches has to
            // match the entire text, so without it multi-line input would never be normalized.
            if (str.matches("(?s).*\\p{InCombiningDiacriticalMarks}+.*")) {
                Normalizer composer = new Normalizer(Normalizer.C, false);

                // Works around an obscure Unicode-to-VIQR conversion bug which
                // erroneously converts D with stroke and d with stroke to D and d
                // (instead of to DD and dd), respectively, on certain Windows systems,
                // by substituting them with U+00D0 and U+00F0, respectively,
                // prior to normalization and then reverting them in post-processing.
                str = composer.normalize(str.replace('\u0110', '\u00D0').replace('\u0111', '\u00F0'))
                        .replace('\u00D0', '\u0110').replace('\u00F0', '\u0111');
            }
            VIQRtoUnicode("Unicode");
        } else if (sourceEncoding == VietEncodings.ISC) {
            ISCtoUnicode();
        } else if (sourceEncoding == VietEncodings.TCVN3) {
            TCVNtoUnicode();
        } else if (sourceEncoding == VietEncodings.NCR) {
            NCRtoUnicode();
        } else if (sourceEncoding == VietEncodings.UTF8) {
            UTF8toUnicode();
        } else if (sourceEncoding == VietEncodings.Unicode_Composite) {
            CompositetoPrecomposed();
        } else {
            throw new RuntimeException("Unsupported encoding: " + sourceEncoding);
        }
    }


    /**
     *  Multiple String replacement. The patterns are applied one after another, in array
     *  order, each on the result of the previous one; every literal, non-overlapping
     *  occurrence of pattern[i] (scanning left to right) is replaced by replace[i].
     *
     *@param  text     Text to be performed on
     *@param  pattern  Find texts; empty entries are skipped
     *@param  replace  Replace texts, replace[i] belonging to pattern[i]
     *@return          Result text
     */
    private String replaceString(String text, final String[] pattern, final String[] replace) {
        StringBuffer result = new StringBuffer();

        for (int i = 0; i < pattern.length; i++) {
            final String find = pattern[i];

            // An empty pattern is found at every index without ever advancing the scan,
            // which would loop forever; there is nothing sensible to replace, so skip it.
            if (find.length() == 0) {
                continue;
            }

            int foundIndex = text.indexOf(find);
            if (foundIndex < 0) {
                // Pattern does not occur: keep the text as is instead of copying it
                continue;
            }

            int startIndex = 0;
            // Clear the buffer
            result.setLength(0);

            // Copy the text piecewise, swapping each occurrence for its replacement
            do {
                result.append(text.substring(startIndex, foundIndex));
                result.append(replace[i]);
                startIndex = foundIndex + find.length();
                foundIndex = text.indexOf(find, startIndex);
            } while (foundIndex >= 0);

            result.append(text.substring(startIndex));
            text = result.toString();
        }
        return text;
    }

    /**
     *  Rewrites the HTML head of the working text so that it declares UTF-8:
     *  removes a known legacy charset attribute, drops the meta tag left without
     *  a charset, and adds a fresh Content-Type meta tag after each head tag.
     */
    private void prepareMetaTag() {
        // Step 1: strip the existing charset attribute from the meta tag
        final String withoutCharset = str.replaceAll(
                "(?i)charset=(?:iso-8859-1|windows-1252|windows-1258|us-ascii|x-user-defined)", "");

        // Step 2: remove what remains of that meta tag (and a directly following newline)
        final String withoutMetaTag = withoutCharset.replaceAll(
                "(?i)<meta http-equiv=\"?Content-Type\"? content=\"text/html;\\s*\">\\n?", "");

        // Step 3: put a new meta tag with UTF-8 charset right behind <head>
        str = withoutMetaTag.replaceAll(
                "(?i)<head>", "<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">");
    }


    /**
     *  Replaces HTML character entity references in the working text with single
     *  characters. The named entities for U+00A0 through U+00FF map to the character of
     *  the same code; in addition, &amp;trade; maps to U+0099 and &amp;#8209; to U+2011.
     */
    private void HTMLtoANSI() {
        // Entity references to look for; entities[i] is replaced by characters[i]
        final String[] entities = {"&trade;", "&#8209;", "&nbsp;",
                "&iexcl;", "&cent;", "&pound;", "&curren;", "&yen;", "&brvbar;", "&sect;", "&uml;", "&copy;", "&ordf;",
                "&laquo;", "&not;", "&shy;", "&reg;", "&macr;", "&deg;", "&plusmn;", "&sup2;", "&sup3;",
                "&acute;", "&micro;", "&para;", "&middot;", "&cedil;", "&sup1;", "&ordm;", "&raquo;",
                "&frac14;", "&frac12;", "&frac34;", "&iquest;", "&Agrave;", "&Aacute;", "&Acirc;",
                "&Atilde;", "&Auml;", "&Aring;", "&AElig;", "&Ccedil;", "&Egrave;", "&Eacute;", "&Ecirc;",
                "&Euml;", "&Igrave;", "&Iacute;", "&Icirc;", "&Iuml;", "&ETH;", "&Ntilde;", "&Ograve;",
                "&Oacute;", "&Ocirc;", "&Otilde;", "&Ouml;", "&times;", "&Oslash;", "&Ugrave;", "&Uacute;",
                "&Ucirc;", "&Uuml;", "&Yacute;", "&THORN;", "&szlig;", "&agrave;", "&aacute;", "&acirc;",
                "&atilde;", "&auml;", "&aring;", "&aelig;", "&ccedil;", "&egrave;", "&eacute;", "&ecirc;",
                "&euml;", "&igrave;", "&iacute;", "&icirc;", "&iuml;", "&eth;", "&ntilde;", "&ograve;",
                "&oacute;", "&ocirc;", "&otilde;", "&ouml;", "&divide;", "&oslash;", "&ugrave;", "&uacute;",
                "&ucirc;", "&uuml;", "&yacute;", "&thorn;", "&yuml;"};

        // Replacement characters, in the same order as the entities above
        final String[] characters = {"\u0099", "\u2011", "\u00A0",
                "\u00A1", "\u00A2", "\u00A3", "\u00A4", "\u00A5", "\u00A6", "\u00A7", "\u00A8", "\u00A9",
                "\u00AA", "\u00AB", "\u00AC", "\u00AD", "\u00AE", "\u00AF", "\u00B0", "\u00B1", "\u00B2",
                "\u00B3", "\u00B4", "\u00B5", "\u00B6", "\u00B7", "\u00B8", "\u00B9", "\u00BA", "\u00BB",
                "\u00BC", "\u00BD", "\u00BE", "\u00BF", "\u00C0", "\u00C1", "\u00C2", "\u00C3", "\u00C4",
                "\u00C5", "\u00C6", "\u00C7", "\u00C8", "\u00C9", "\u00CA", "\u00CB", "\u00CC", "\u00CD",
                "\u00CE", "\u00CF", "\u00D0", "\u00D1", "\u00D2", "\u00D3", "\u00D4", "\u00D5", "\u00D6",
                "\u00D7", "\u00D8", "\u00D9", "\u00DA", "\u00DB", "\u00DC", "\u00DD", "\u00DE", "\u00DF",
                "\u00E0", "\u00E1", "\u00E2", "\u00E3", "\u00E4", "\u00E5", "\u00E6", "\u00E7", "\u00E8",
                "\u00E9", "\u00EA", "\u00EB", "\u00EC", "\u00ED", "\u00EE", "\u00EF", "\u00F0", "\u00F1",
                "\u00F2", "\u00F3", "\u00F4", "\u00F5", "\u00F6", "\u00F7", "\u00F8", "\u00F9", "\u00FA",
                "\u00FB", "\u00FC", "\u00FD", "\u00FE", "\u00FF"};

        final String withoutEntities = replaceString(str, entities, characters);
        str = withoutEntities;
    }
    
    /**
     * Maps the characters that Cp1252 places at 0x80-0x9F back to the plain code
     * values U+0080-U+009F in the working text.
     * VISCII and VPS need this step because they use code values in that range.
     */
    private void Cp1252toHex() {
        // Each row is {character as decoded by Cp1252, raw code value}; rows are applied in order
        final char[][] mapping = {
            {'\u20AC', '\u0080'}, {'\u201A', '\u0082'}, {'\u0192', '\u0083'}, {'\u201E', '\u0084'},
            {'\u2026', '\u0085'}, {'\u2020', '\u0086'}, {'\u2021', '\u0087'}, {'\u02C6', '\u0088'},
            {'\u2030', '\u0089'}, {'\u0160', '\u008A'}, {'\u2039', '\u008B'}, {'\u0152', '\u008C'},
            {'\u017D', '\u008E'}, {'\u2018', '\u0091'}, {'\u2019', '\u0092'}, {'\u201C', '\u0093'},
            {'\u201D', '\u0094'}, {'\u2022', '\u0095'}, {'\u2013', '\u0096'}, {'\u2014', '\u0097'},
            {'\u02DC', '\u0098'}, {'\u2122', '\u0099'}, {'\u0161', '\u009A'}, {'\u203A', '\u009B'},
            {'\u0153', '\u009C'}, {'\u017E', '\u009E'}, {'\u0178', '\u009F'}
        };

        String converted = str;
        for (int row = 0; row < mapping.length; row++) {
            final char decoded = mapping[row][0];
            final char rawCode = mapping[row][1];
            converted = converted.replace(decoded, rawCode);
        }
        str = converted;
    }

    /**
     *  VISCII-to-Unicode conversion of the working text.
     *  <p>
     *  For HTML input the text is first run through HTMLtoANSI(), convertNCR() and
     *  prepareMetaTag(), and VISCII font names are replaced by SERIF / SANS_SERIF.
     *  Then Cp1252toHex() is applied and every VISCII_char[i] is replaced by
     *  Unicode_char[i], one table entry after another.
     */
    private void VISCIItoUnicode() {
        if (html) {
            HTMLtoANSI();
            convertNCR();
            prepareMetaTag();

            // Replace fonts
            str = str.replaceAll("(?:VI Times|Heo May|HoangYen|MinhQu\\u00E2n|PhuongThao|ThaHuong|UHo\\u00E0i)H?(?: Hoa)?(?: 1\\.1)?", SERIF);
            str = str.replaceAll("VI Arial", SANS_SERIF);
        }
        Cp1252toHex();

        // Parallel tables: VISCII_char[i] is replaced by Unicode_char[i].
        // The replacements run sequentially, so the order of the entries matters: the last
        // entry produces U+00D5, which an earlier entry has already converted away.
        // NOTE(review): there is an entry for U+00F0 (to d with stroke) but none for
        // U+00D0 (presumably capital D with stroke in VISCII) - confirm whether intended.
        final char[] VISCII_char = {'\u2011', '\u00C5', '\u00E5', '\u00F0', '\u00CE', '\u00EE', '\u009D',
                '\u00FB', '\u00B4', '\u00BD', '\u00BF', '\u00DF', '\u0080', '\u00D5', '\u00C4', '\u00E4',
                '\u0084', '\u00A4', '\u0085', '\u00A5', '\u0086', '\u00A6', '\u0006', '\u00E7', '\u0087',
                '\u00A7', '\u0081', '\u00A1', '\u0082', '\u00A2', '\u0002', '\u00C6', '\u0005', '\u00C7',
                '\u0083', '\u00A3', '\u0089', '\u00A9', '\u00CB', '\u00EB', '\u0088', '\u00A8', '\u008A',
                '\u00AA', '\u008B', '\u00AB', '\u008C', '\u00AC', '\u008D', '\u00AD', '\u008E', '\u00AE',
                '\u009B', '\u00EF', '\u0098', '\u00B8', '\u009A', '\u00F7', '\u0099', '\u00F6', '\u008F',
                '\u00AF', '\u0090', '\u00B0', '\u0091', '\u00B1', '\u0092', '\u00B2', '\u0093', '\u00B5',
                '\u0095', '\u00BE', '\u0096', '\u00B6', '\u0097', '\u00B7', '\u00B3', '\u00DE', '\u0094',
                '\u00FE', '\u009E', '\u00F8', '\u009C', '\u00FC', '\u00BA', '\u00D1', '\u00BB', '\u00D7',
                '\u00BC', '\u00D8', '\u00FF', '\u00E6', '\u00B9', '\u00F1', '\u009F', '\u00CF', '\u001E',
                '\u00DC', '\u0014', '\u00D6', '\u0019', '\u00DB', '\u00A0'};
        final char[] Unicode_char = {'\u1EF4', '\u0102', '\u0103', '\u0111', '\u0128', '\u0129', '\u0168',
                '\u0169', '\u01A0', '\u01A1', '\u01AF', '\u01B0', '\u1EA0', '\u1EA1', '\u1EA2', '\u1EA3',
                '\u1EA4', '\u1EA5', '\u1EA6', '\u1EA7', '\u1EA8', '\u1EA9', '\u1EAA', '\u1EAB', '\u1EAC',
                '\u1EAD', '\u1EAE', '\u1EAF', '\u1EB0', '\u1EB1', '\u1EB2', '\u1EB3', '\u1EB4', '\u1EB5',
                '\u1EB6', '\u1EB7', '\u1EB8', '\u1EB9', '\u1EBA', '\u1EBB', '\u1EBC', '\u1EBD', '\u1EBE',
                '\u1EBF', '\u1EC0', '\u1EC1', '\u1EC2', '\u1EC3', '\u1EC4', '\u1EC5', '\u1EC6', '\u1EC7',
                '\u1EC8', '\u1EC9', '\u1ECA', '\u1ECB', '\u1ECC', '\u1ECD', '\u1ECE', '\u1ECF', '\u1ED0',
                '\u1ED1', '\u1ED2', '\u1ED3', '\u1ED4', '\u1ED5', '\u1ED6', '\u1ED7', '\u1ED8', '\u1ED9',
                '\u1EDA', '\u1EDB', '\u1EDC', '\u1EDD', '\u1EDE', '\u1EDF', '\u1EE0', '\u1EE1', '\u1EE2',
                '\u1EE3', '\u1EE4', '\u1EE5', '\u1EE6', '\u1EE7', '\u1EE8', '\u1EE9', '\u1EEA', '\u1EEB',
                '\u1EEC', '\u1EED', '\u1EEE', '\u1EEF', '\u1EF0', '\u1EF1', '\u1EF2', '\u1EF3', '\u1EF4',
                '\u1EF5', '\u1EF6', '\u1EF7', '\u1EF8', '\u1EF9', '\u00D5'};
        for (int i = 0; i < VISCII_char.length; i++) {
            str = str.replace(VISCII_char[i], Unicode_char[i]);
        }
    }


    /**
     *  VIQR-to-Unicode conversion of the working text; also hosts the reverse
     *  Unicode-to-VIQR conversion so that both directions share the same tables.
     *
     *@param  UnicodeVIQR  Direction selector: "Unicode" converts the text from Unicode to VIQR;
     *                     any other value (this class passes "VIQR") converts VIQR to Unicode
     */
    private void VIQRtoUnicode(String UnicodeVIQR) {
        // Parallel tables: VIQR_char[i] corresponds to Unicode_char[i].
        // Three-character sequences come before the two-character sequences they start with
        // (e.g. "u+." before "u+", "a(." before "a("), which matters because replaceString
        // applies the entries one after another.
        final String[] VIQR_char = {"y~", "Y~", "y?", "Y?", "y.", "Y.", "y`", "Y`", "u+.", "U+.", "u+~",
                "U+~", "u+?", "U+?", "u+`", "U+`", "u+'", "U+'", "u?", "U?", "u.", "U.", "o+.", "O+.",
                "o+~", "O+~", "o+?", "O+?", "o+`", "O+`", "o+'", "O+'", "o^.", "O^.", "o^~", "O^~", "o^?",
                "O^?", "o^`", "O^`", "o^'", "O^'", "o?", "O?", "o.", "O.", "i.", "I.", "i?", "I?", "e^.",
                "E^.", "e^~", "E^~", "e^?", "E^?", "e^`", "E^`", "e^'", "E^'", "e~", "E~", "e?", "E?", "e.",
                "E.", "a(.", "A(.", "a(~", "A(~", "a(?", "A(?", "a(`", "A(`", "a('", "A('", "a^.", "A^.",
                "a^~", "A^~", "a^?", "A^?", "a^`", "A^`", "a^'", "A^'", "a?", "A?", "a.", "A.", "u+", "U+",
                "o+", "O+", "u~", "U~", "i~", "I~", "dd", "a(", "A(", "y'", "u'", "u`", "o~", "o^", "o'",
                "o`", "i'", "i`", "e^", "e'", "e`", "a~", "a^", "a'", "a`", "Y'", "U'", "U`", "O~", "O^",
                "O'", "O`", "DD", "I'", "I`", "E^", "E'", "E`", "A~", "A^", "A'", "A`"};
        final String[] Unicode_char = {"\u1EF9", "\u1EF8", "\u1EF7", "\u1EF6", "\u1EF5", "\u1EF4",
                "\u1EF3", "\u1EF2", "\u1EF1", "\u1EF0", "\u1EEF", "\u1EEE", "\u1EED", "\u1EEC", "\u1EEB",
                "\u1EEA", "\u1EE9", "\u1EE8", "\u1EE7", "\u1EE6", "\u1EE5", "\u1EE4", "\u1EE3", "\u1EE2",
                "\u1EE1", "\u1EE0", "\u1EDF", "\u1EDE", "\u1EDD", "\u1EDC", "\u1EDB", "\u1EDA", "\u1ED9",
                "\u1ED8", "\u1ED7", "\u1ED6", "\u1ED5", "\u1ED4", "\u1ED3", "\u1ED2", "\u1ED1", "\u1ED0",
                "\u1ECF", "\u1ECE", "\u1ECD", "\u1ECC", "\u1ECB", "\u1ECA", "\u1EC9", "\u1EC8", "\u1EC7",
                "\u1EC6", "\u1EC5", "\u1EC4", "\u1EC3", "\u1EC2", "\u1EC1", "\u1EC0", "\u1EBF", "\u1EBE",
                "\u1EBD", "\u1EBC", "\u1EBB", "\u1EBA", "\u1EB9", "\u1EB8", "\u1EB7", "\u1EB6", "\u1EB5",
                "\u1EB4", "\u1EB3", "\u1EB2", "\u1EB1", "\u1EB0", "\u1EAF", "\u1EAE", "\u1EAD", "\u1EAC",
                "\u1EAB", "\u1EAA", "\u1EA9", "\u1EA8", "\u1EA7", "\u1EA6", "\u1EA5", "\u1EA4", "\u1EA3",
                "\u1EA2", "\u1EA1", "\u1EA0", "\u01B0", "\u01AF", "\u01A1", "\u01A0", "\u0169", "\u0168",
                "\u0129", "\u0128", "\u0111", "\u0103", "\u0102", "\u00FD", "\u00FA", "\u00F9", "\u00F5",
                "\u00F4", "\u00F3", "\u00F2", "\u00ED", "\u00EC", "\u00EA", "\u00E9", "\u00E8", "\u00E3",
                "\u00E2", "\u00E1", "\u00E0", "\u00DD", "\u00DA", "\u00D9", "\u00D5", "\u00D4", "\u00D3",
                "\u00D2", "\u0110", "\u00CD", "\u00CC", "\u00CA", "\u00C9", "\u00C8", "\u00C3", "\u00C2",
                "\u00C1", "\u00C0"};

        /*
        *  Unicode-to-VIQR conversion
        *  Placed inside this method to share the VIQR and Unicode data arrays
        */
        if (UnicodeVIQR.equals("Unicode")) {
            // insert escape character '\' where needed:
            // before every . ? ' and between two adjacent d's (either case)
            str = str.replaceAll("(?=[.?'])", "\\\\").replaceAll("(?i)(?<=d)(?=d)", "\\\\");
            // convert to VIQR
            str = replaceString(str, Unicode_char, VIQR_char);

            cleanupVIQR();
            return;
        }

        // From here on: VIQR-to-Unicode direction
        if (html) {
            HTMLtoANSI();
            convertNCR();
            prepareMetaTag();
        }

        // adjust irregular characters to VIQR standard:
        // '*' after u or o becomes '+'; a d followed by '-' or another d becomes that d doubled
        str = str.replaceAll("(?i)(?<=[uo])\\*", "+").replaceAll("(?i)(d)([-d])", "$1$1");

        str = str.replace('\u0092', '\''); // replace right single quotation mark (\u0092, or \u2019) with apostrophe

        // Attempt to fix the problem with . and ? punctuation marks becoming tone marks.
        // This, however, is commented out because it may interfere with correct conversions
        // when a proper name (hence capitalized) instead of a capital letter beginning a sentence is encountered.
//        str = str.replaceAll("(?=[?.]\\s+\\p{Upper})", "\\\\");

        // change tone marks to punctuation marks if ' or . or ? is before a whitespace
        // and after a vowel which in turn is after a vowel and any one or two marks `?~'.^(+ , or ae
        // (done by inserting an escaping '\' in front of the mark)
        str = str.replaceAll("(?i)(?<=(?:(?:[aeiouy][`?~'.^(+]{1,2})|[ae])[aeiouy])(?=[?'.](?:\\s|$|\\p{Punct}))", "\\\\");

        // convert to Unicode
        str = replaceString(str, VIQR_char, Unicode_char);

        // delete redundant '\' characters, i.e. those directly before - . ? ' d or another '\'
        str = str.replaceAll("(?i)\\\\(?=[-.?'d\\\\])", "");

        cleanupURL();
    }


    /**
     *  Post-processes the result of the Unicode-to-VIQR conversion: drops a leading
     *  byte order mark and removes unneeded '\' escape characters from the working text.
     *  Safe to call on empty text.
     */
    private void cleanupVIQR() {
        // delete BOM (U+FEFF), if any; startsWith copes with empty text,
        // whereas charAt(0) would throw StringIndexOutOfBoundsException
        if (str.startsWith("\uFEFF")) {
            str = str.substring(1);
        }

        // delete '\' characters after consonants, i.e. every '\' that is not
        // preceded by a vowel or one of the VIQR modifiers ^ ( +
        str = str.replaceAll("(?i)(?<![aeiouy^(+])\\\\", "");

        // delete '\' in URLs
        // NOTE(review): the lookbehind anchors each match at "://" or "mailto:", so only the
        // first '\' following such a prefix is removed, and [^\\]+ is not limited to the URL
        // itself - confirm whether that is intended.
        str = str.replaceAll("(?<=://|mailto:)([^\\\\]+)\\\\(?=[.?])", "$1");
    }


    /**
     *  Corrects invalid characters in URLs. The VIQR-to-Unicode conversion turns a vowel
     *  followed by '.' into a dot-below letter, which breaks host names and paths;
     *  within each URL (from "://" up to the next space, newline or end of text) such
     *  letters are turned back into the vowel plus '.'.
     */
    private void cleanupURL() {
        StringBuffer sb = new StringBuffer(str);
        int startIndex = 0;
        int foundIndex;

        // correct characters in URLs, they can't be non-ASCII
        while ((foundIndex = sb.indexOf("://", startIndex)) != -1) {
            startIndex = foundIndex + 3;

            // Walk the URL character by character. The explicit length check ends the scan
            // when the URL runs to the end of the text, instead of relying on a
            // StringIndexOutOfBoundsException from charAt.
            while (startIndex < sb.length()) {
                char ch = sb.charAt(startIndex);
                if (ch == ' ' || ch == '\n') {
                    break;
                }

                // Dot-below letters and the vowel + '.' they were made from
                String replace = null;
                switch (ch) {
                    case '\u1EA1': replace = "a."; break;
                    case '\u1EB9': replace = "e."; break;
                    case '\u1ECB': replace = "i."; break;
                    case '\u1ECD': replace = "o."; break;
                    case '\u1EE5': replace = "u."; break;
                    case '\u1EF5': replace = "y."; break;
                    case '\u1EA0': replace = "A."; break;
                    case '\u1EB8': replace = "E."; break;
                    case '\u1ECA': replace = "I."; break;
                    case '\u1ECC': replace = "O."; break;
                    case '\u1EE4': replace = "U."; break;
                    case '\u1EF4': replace = "Y."; break;
                    default: break;
                }
                if (replace != null) {
                    sb.replace(startIndex, startIndex + 1, replace);
                    // one character became two: step over the extra one as well
                    startIndex++;
                }
                startIndex++;
            }
        }
        str = sb.toString();
    }

    /**
     *  VNI-to-Unicode conversion process consisting of 3 parts. Part 1 converts
     *  single-byte (ASCII) characters except O+ and U+ because they are also used
     *  as base in some VNI double-byte characters. Part 2 converts VNI double-byte
     *  characters. Part 3 converts O+ and U+.
     *  <p>
     *  Works on the text in <code>str</code>. For HTML input the text is first run through
     *  HTMLtoANSI(), convertNCR() and prepareMetaTag(), and VNI font names are replaced by
     *  SERIF / SANS_SERIF.
     */
    private void VNItoUnicode() {
        if (html) {
            HTMLtoANSI();
            convertNCR();
            prepareMetaTag();

            // Replace fonts
            str = str.replaceAll("VNI[- ](?:Times|Couri|Centur|Brush)", SERIF);
            str = str.replaceAll("VNI[- ](?:Helve|Aptima)", SANS_SERIF);
        }

        // Part 1
        str = str.replace('\u00D1', '\u0110');// DD
        str = str.replace('\u00F1', '\u0111');// dd
        str = str.replace('\u00D3', '\u0128');// I~
        str = str.replace('\u00F3', '\u0129');// i~
        str = str.replace('\u00D2', '\u1ECA');// I.
        str = str.replace('\u00F2', '\u1ECB');// i.
        str = str.replace('\u00C6', '\u1EC8');// I?
        str = str.replace('\u00E6', '\u1EC9');// i?
        str = str.replace('\u00CE', '\u1EF4');// Y.
        str = str.replace('\u00EE', '\u1EF5');// y.

        // Part 2
        // Transform "O\u00C2" -> "\u00C6" to later transform back to "\u00D4" in Part 3
        // (the first two table entries produce these placeholders; Part 1 has already
        // converted the original occurrences of those two characters, and Part 3 would
        // otherwise turn the final O^ / o^ code points into O+ / o+).
        // VNI_char[i] is replaced by Unicode_char[i], one entry after another.
        final String[] VNI_char = {"O\u00C2", "o\u00E2", "y\u00F5", "Y\u00D5", "y\u00FB", "Y\u00DB",
                "y\u00F8", "Y\u00D8", "\u00F6\u00EF", "\u00D6\u00CF", "\u00F6\u00F5", "\u00D6\u00D5",
                "\u00F6\u00FB", "\u00D6\u00DB", "\u00F6\u00F8", "\u00D6\u00D8", "\u00F6\u00F9",
                "\u00D6\u00D9", "u\u00FB", "U\u00DB", "u\u00EF", "U\u00CF", "\u00F4\u00EF", "\u00D4\u00CF",
                "\u00F4\u00F5", "\u00D4\u00D5", "\u00F4\u00FB", "\u00D4\u00DB", "\u00F4\u00F8",
                "\u00D4\u00D8", "\u00F4\u00F9", "\u00D4\u00D9", "o\u00E4", "O\u00C4", "o\u00E3", "O\u00C3",
                "o\u00E5", "O\u00C5", "o\u00E0", "O\u00C0", "o\u00E1", "O\u00C1", "o\u00FB", "O\u00DB",
                "o\u00EF", "O\u00CF", "e\u00E4", "E\u00C4", "e\u00E3", "E\u00C3", "e\u00E5", "E\u00C5",
                "e\u00E0", "E\u00C0", "e\u00E1", "E\u00C1", "e\u00F5", "E\u00D5", "e\u00FB", "E\u00DB",
                "e\u00EF", "E\u00CF", "a\u00EB", "A\u00CB", "a\u00FC", "A\u00DC", "a\u00FA", "A\u00DA",
                "a\u00E8", "A\u00C8", "a\u00E9", "A\u00C9", "a\u00E4", "A\u00C4", "a\u00E3", "A\u00C3",
                "a\u00E5", "A\u00C5", "a\u00E0", "A\u00C0", "a\u00E1", "A\u00C1", "a\u00FB", "A\u00DB",
                "a\u00EF", "A\u00CF", "u\u00F5", "U\u00D5", "a\u00EA", "A\u00CA", "y\u00F9", "u\u00F9",
                "u\u00F8", "o\u00F5", "o\u00F9", "o\u00F8", "e\u00E2", "e\u00F9", "e\u00F8", "a\u00F5",
                "a\u00E2", "a\u00F9", "a\u00F8", "Y\u00D9", "U\u00D9", "U\u00D8", "O\u00D5", "O\u00D9",
                "O\u00D8", "E\u00C2", "E\u00D9", "E\u00D8", "A\u00D5", "A\u00C2", "A\u00D9", "A\u00D8"};
        final String[] Unicode_char = {"\u00C6", "\u00E6", "\u1EF9", "\u1EF8", "\u1EF7", "\u1EF6",
                "\u1EF3", "\u1EF2", "\u1EF1", "\u1EF0", "\u1EEF", "\u1EEE", "\u1EED", "\u1EEC", "\u1EEB",
                "\u1EEA", "\u1EE9", "\u1EE8", "\u1EE7", "\u1EE6", "\u1EE5", "\u1EE4", "\u1EE3", "\u1EE2",
                "\u1EE1", "\u1EE0", "\u1EDF", "\u1EDE", "\u1EDD", "\u1EDC", "\u1EDB", "\u1EDA", "\u1ED9",
                "\u1ED8", "\u1ED7", "\u1ED6", "\u1ED5", "\u1ED4", "\u1ED3", "\u1ED2", "\u1ED1", "\u1ED0",
                "\u1ECF", "\u1ECE", "\u1ECD", "\u1ECC", "\u1EC7", "\u1EC6", "\u1EC5", "\u1EC4", "\u1EC3",
                "\u1EC2", "\u1EC1", "\u1EC0", "\u1EBF", "\u1EBE", "\u1EBD", "\u1EBC", "\u1EBB", "\u1EBA",
                "\u1EB9", "\u1EB8", "\u1EB7", "\u1EB6", "\u1EB5", "\u1EB4", "\u1EB3", "\u1EB2", "\u1EB1",
                "\u1EB0", "\u1EAF", "\u1EAE", "\u1EAD", "\u1EAC", "\u1EAB", "\u1EAA", "\u1EA9", "\u1EA8",
                "\u1EA7", "\u1EA6", "\u1EA5", "\u1EA4", "\u1EA3", "\u1EA2", "\u1EA1", "\u1EA0", "\u0169",
                "\u0168", "\u0103", "\u0102", "\u00FD", "\u00FA", "\u00F9", "\u00F5", "\u00F3", "\u00F2",
                "\u00EA", "\u00E9", "\u00E8", "\u00E3", "\u00E2", "\u00E1", "\u00E0", "\u00DD", "\u00DA",
                "\u00D9", "\u00D5", "\u00D3", "\u00D2", "\u00CA", "\u00C9", "\u00C8", "\u00C3", "\u00C2",
                "\u00C1", "\u00C0"};

        str = replaceString(str, VNI_char, Unicode_char);

        // Part 3
        // The O+ / U+ code values left after Part 2 are stand-alone letters; the last two
        // lines turn the Part 2 placeholders into O^ / o^.
        str = str.replace('\u00D4', '\u01A0');// O+
        str = str.replace('\u00F4', '\u01A1');// o+
        str = str.replace('\u00D6', '\u01AF');// U+
        str = str.replace('\u00F6', '\u01B0');// u+
        str = str.replace('\u00C6', '\u00D4');// O^
        str = str.replace('\u00E6', '\u00F4');// o^
    }

    /**
     *  VPS-to-Unicode conversion of the working text.
     *  <p>
     *  For HTML input the text is first run through HTMLtoANSI(), convertNCR() and
     *  prepareMetaTag(), and VPS font names are replaced by SERIF / SANS_SERIF.
     *  Then Cp1252toHex() is applied and every VPS_char[i] is replaced by
     *  Unicode_char[i], one table entry after another.
     */
    private void VPStoUnicode() {
        if (html) {
            HTMLtoANSI();
            convertNCR();
            prepareMetaTag();

            // Replace fonts, longer names first!
            str = str.replaceAll("VPS (?:Times|Long An|Nam Dinh|Ninh Binh)(?: Hoa)?", SERIF);
            str = str.replaceAll("VPS Helv(?: Hoa)?", SANS_SERIF);
        }
        Cp1252toHex();

        // Parallel tables: VPS_char[i] is replaced by Unicode_char[i].
        // The replacements run sequentially, so the order of the entries matters: entries
        // near the end produce characters (e.g. U+00D5, U+00D9) that earlier entries
        // have already converted away.
        final char[] VPS_char = {'\u00CF', '\u00B3', '\u009B', '\u00FD', '\u009C', '\u0019', '\u00FF',
                '\u00B2', '\u00BF', '\u0015', '\u00BB', '\u001D', '\u00BA', '\u00B1', '\u00D8', '\u00AF',
                '\u00D9', '\u00AD', '\u00FB', '\u00D1', '\u00F8', '\u0014', '\u00AE', '\u0013', '\u00AB',
                '\u00A6', '\u00AA', '\u009F', '\u00A9', '\u009E', '\u00A7', '\u009D', '\u00B6', '\u0012',
                '\u0087', '\u0099', '\u00B0', '\u0098', '\u00D2', '\u0097', '\u00D3', '\u0096', '\u00D5',
                '\u00BD', '\u0086', '\u0011', '\u00CE', '\u0010', '\u00CC', '\u00B7', '\u008C', '\u0006',
                '\u00CD', '\u0095', '\u008B', '\u0094', '\u008A', '\u0093', '\u0089', '\u0090', '\u00EB',
                '\u00FE', '\u00C8', '\u00DE', '\u00CB', '\u0005', '\u00A5', '\u0004', '\u00A4', '\u00F0',
                '\u00A3', '\u008F', '\u00A2', '\u008E', '\u00A1', '\u008D', '\u00C6', '\u0003', '\u00C5',
                '\u001C', '\u00C4', '\u0085', '\u00C0', '\u0084', '\u00C3', '\u0083', '\u00E4', '\u0081',
                '\u00E5', '\u0002', '\u00DC', '\u00D0', '\u00D6', '\u00F7', '\u00DB', '\u00AC', '\u00EF',
                '\u00B8', '\u00C7', '\u00E6', '\u0088', '\u009A', '\u00A8', '\u00BE', '\u00B9', '\u00BC',
                '\u00F1', '\u00B4', '\u00B5', '\u00D7', '\u0082', '\u0080'};
        final char[] Unicode_char = {'\u1EF9', '\u1EF8', '\u1EF7', '\u1EF6', '\u1EF5', '\u1EF4', '\u1EF3',
                '\u1EF2', '\u1EF1', '\u1EF0', '\u1EEF', '\u1EEE', '\u1EED', '\u1EEC', '\u1EEB', '\u1EEA',
                '\u1EE9', '\u1EE8', '\u1EE7', '\u1EE6', '\u1EE5', '\u1EE4', '\u1EE3', '\u1EE2', '\u1EE1',
                '\u1EE0', '\u1EDF', '\u1EDE', '\u1EDD', '\u1EDC', '\u1EDB', '\u1EDA', '\u1ED9', '\u1ED8',
                '\u1ED7', '\u1ED6', '\u1ED5', '\u1ED4', '\u1ED3', '\u1ED2', '\u1ED1', '\u1ED0', '\u1ECF',
                '\u1ECE', '\u1ECD', '\u1ECC', '\u1ECB', '\u1ECA', '\u1EC9', '\u1EC8', '\u1EC7', '\u1EC6',
                '\u1EC5', '\u1EC4', '\u1EC3', '\u1EC2', '\u1EC1', '\u1EC0', '\u1EBF', '\u1EBE', '\u1EBD',
                '\u1EBC', '\u1EBB', '\u1EBA', '\u1EB9', '\u1EB8', '\u1EB7', '\u1EB6', '\u1EB5', '\u1EB4',
                '\u1EB3', '\u1EB2', '\u1EB1', '\u1EB0', '\u1EAF', '\u1EAE', '\u1EAD', '\u1EAC', '\u1EAB',
                '\u1EAA', '\u1EA9', '\u1EA8', '\u1EA7', '\u1EA6', '\u1EA5', '\u1EA4', '\u1EA3', '\u1EA2',
                '\u1EA1', '\u1EA0', '\u01B0', '\u01AF', '\u01A1', '\u01A0', '\u0169', '\u0168', '\u0129',
                '\u0128', '\u0111', '\u0103', '\u0102', '\u00FD', '\u00D9', '\u00D5', '\u00D3', '\u00D2',
                '\u0110', '\u00CD', '\u00CC', '\u00C8', '\u00C3', '\u00C0'};
        for (int i = 0; i < VPS_char.length; i++) {
            str = str.replace(VPS_char[i], Unicode_char[i]);
        }
    }


    /**
     *  ISC-to-Unicode conversion of the working text.
     *  <p>
     *  For HTML input the text is first run through HTMLtoANSI(), convertNCR() and
     *  prepareMetaTag(). Then every ISC_char[i] is replaced by Unicode_char[i],
     *  one table entry after another.
     */
    private void ISCtoUnicode() {
        if (html) {
            HTMLtoANSI();
            convertNCR();
            prepareMetaTag();
        }

        // Parallel tables: ISC_char[i] is replaced by Unicode_char[i].
        // The replacements run sequentially, so the order of the entries matters: several
        // characters appear both as a source (earlier) and as a result (later), and must
        // not be converted a second time.
        final char[] ISC_char     = {'\u2018', '\u2019', '\u201C', '\u201D', '\u0023', '\u00DA', '\\',     '\u005E', '\u0060', '\u007C', '\u007E', '\uFFFD', '\u00C0', '\u00C5', '\u00E8', '\u00C1', '\u00D1', '\u00D6', '\u00DC', '\u00C8', '\u00E0', '\u00E2', '\u00E9', '\u00E3', '\u00E5', '\u00CC', '\u00E4', '\u00C3', '\u00CD', '\u00EB', '\u00D2', '\u00EC', '\u00D5', '\u00D3', '\u00F1', '\u00F3', '\u00F2', '\u00F4', '\u00F6', '\u00F5', '\u00FA', '\u00D9', '\u00FB', '\u00FC', '\u2020', '\u00B0', '\u00C2', '\u00CA', '\u00F9', '\u2022', '\u00B6', '\u00DF', '\u00AE', '\u00A9', '\u00EA', '\u00B4', '\u00A8', '\u2260', '\u00C6', '\u00D8', '\u221E', '\u00B1', '\u2264', '\u2265', '\u00A5', '\u00B5', '\u2202', '\u2211', '\u00E1', '\u03C0', '\u222B', '\u00AA', '\u00BA', '\u03A9', '\u00E6', '\u00F8', '\u00BF', '\u00A1', '\u00AC', '\u221A', '\u0192', '\u2248', '\u2206', '\u00AB', '\u00BB', '\u2026', '\u00C4', '\u00C7', '\u00EE', '\u0152', '\u0153', '\u00C9', '\u2014', '\u0013', '\u0014', '\u0011', '\u0012', '\u00F7', '\u25CA', '\u00FF', '\u0178', '\u2044', '\u20AC', '\u2039', '\u00ED', '\uFB01', '\uFB02', '\u2021', '\u00B7', '\u201A', '\u201E', '\u2030', '\u00A2', '\u00A3', '\u2013', '\u00CB', '\u220F', '\u2122', '\u00CE', '\u00CF', '\u00E7', '\u00EF', '\u00D4', '\uF8FF', '\u203A', '\u0040', '\u00DB', '\u00A7', '\u0131', '\u02C6', '\u02DC', '\u00AF', '\u02D8', '\u02D9', '\u02DA', '\u00B8', '\u02DD', '\u02DB', '\u02C7'};
        final char[] Unicode_char = {'\u1EC5', '\u1EBF', '\u1EC1', '\u1EC3', '\u1EF0', '\u0169', '\u1EEA', '\u1EEC', '\u1EE8', '\u1EE4', '\u1EEE', '\u007F', '\u1EAD', '\u1EA2', '\u0128', '\u1ED7', '\u1EA0', '\u1EB6', '\u1EAC', '\u1ED9', '\u1EBA', '\u1EBC', '\u1EC8', '\u1EB9', '\u1EC6', '\u1EDB', '\u00E9', '\u00E8', '\u1EDD', '\u1ECA', '\u1EE7', '\u1ECE', '\u1EC2', '\u1EE3', '\u1ECC', '\u1ED8', '\u1EDC', '\u1EDE', '\u1EE0', '\u1EDA', '\u1EE2', '\u1EE5', '\u1EE6', '\u0168', '\u1EA5', '\u0102', '\u1ED3', '\u1ED5', '\u00D9', '\u01A0', '\u01AF', '\u0110', '\u0103', '\u00E2', '\u00CD', '\u00F4', '\u01A1', '\u01B0', '\u0111', '\u1EB0', '\u1EF2', '\u1EF6', '\u1EF8', '\u00DD', '\u1EF4', '\u00E0', '\u1EA3', '\u00E3', '\u00C8', '\u1EA1', '\u1EB2', '\u1EB1', '\u1EB3', '\u1EB5', '\u1EAF', '\u1EB4', '\u1EAE', '\u1EA6', '\u1EA8', '\u1EAA', '\u1EA4', '\u1EC0', '\u1EB7', '\u1EA7', '\u1EA9', '\u1EAB', '\u00C0', '\u00C3', '\u00D5', '\u1EBB', '\u1EBD', '\u00C1', '\u1EB8', '\u201C', '\u201D', '\u2018', '\u2019', '\u1EC7', '\u00EC', '\u1EC9', '\u1EC4', '\u1EBE', '\u1ED2', '\u0129', '\u00D2', '\u1ECB', '\u00F2', '\u1ED4', '\u1ECF', '\u00F5', '\u00F3', '\u1ECD', '\u00C2', '\u00CA', '\u00C9', '\u1ED1', '\u00E1', '\u00EA', '\u1EDF', '\u1EE1', '\u00CC', '\u00D3', '\u00F9', '\u1ED6', '\u00ED', '\u00DA', '\u00FA', '\u00D4', '\u1EEB', '\u1EED', '\u1EEF', '\u1EE9', '\u1EF1', '\u1EF3', '\u1EF7', '\u1EF9', '\u00FD', '\u1EF5', '\u1ED0'};
        for (int i = 0; i < ISC_char.length; i++) {
            str = str.replace(ISC_char[i], Unicode_char[i]);
        }
    }


    /**
     *  TCVN-to-Unicode conversion
     */
    private void TCVNtoUnicode() {
        // HTML input is first run through the shared HTML helpers, then the
        // .VnTime / .VnArial font names (with optional trailing "H") are
        // replaced by the SERIF / SANS_SERIF font names.
        if (html) {
            HTMLtoANSI();
            convertNCR();
            prepareMetaTag();

            str = str.replaceAll("\\.VnTimeH?", SERIF)
                     .replaceAll("\\.VnArialH?", SANS_SERIF);
        }

        // Single-character table: tcvnSingles[k] is rewritten to unicodeSingles[k].
        final char[] tcvnSingles = {'\u00FC', '\u00FB', '\u00FE', '\u00FA', '\u00F9', '\u00F7', '\u00F6',
                '\u00F5', '\u00F8', '\u00F1', '\u00F4', '\u00EE', '\u00EC', '\u00EB', '\u00EA', '\u00ED',
                '\u00E9', '\u00E7', '\u00E6', '\u00E5', '\u00E8', '\u00E1', '\u00E4', '\u00DE', '\u00D8',
                '\u00D6', '\u00D4', '\u00D3', '\u00D2', '\u00D5', '\u00CF', '\u00CE', '\u00D1', '\u00C6',
                '\u00BD', '\u00BC', '\u00AB', '\u00BE', '\u00CB', '\u00C9', '\u00C8', '\u00C7', '\u00CA',
                '\u00B6', '\u00B9', '\u00AD', '\u00A6', '\u00AC', '\u00A5', '\u00F2', '\u00DC', '\u00AE',
                '\u00A8', '\u00A1', '\u00F3', '\u00EF', '\u00E2', '\u00BB', '\u00E3', '\u00DF', '\u00DD',
                '\u00D7', '\u00AA', '\u00D0', '\u00CC', '\u00B7', '\u00A9', '\u00B8', '\u00B5', '\u00A4',
                '\u00A7', '\u00A3', '\u00A2'};
        final char[] unicodeSingles = {'\u1EF9', '\u1EF7', '\u1EF5', '\u1EF3', '\u1EF1', '\u1EEF', '\u1EED',
                '\u1EEB', '\u1EE9', '\u1EE7', '\u1EE5', '\u1EE3', '\u1EE1', '\u1EDF', '\u1EDD', '\u1EDB',
                '\u1ED9', '\u1ED7', '\u1ED5', '\u1ED3', '\u1ED1', '\u1ECF', '\u1ECD', '\u1ECB', '\u1EC9',
                '\u1EC7', '\u1EC5', '\u1EC3', '\u1EC1', '\u1EBF', '\u1EBD', '\u1EBB', '\u1EB9', '\u1EB7',
                '\u1EB5', '\u1EB3', '\u00F4', '\u1EAF', '\u1EAD', '\u1EAB', '\u1EA9', '\u1EA7', '\u1EA5',
                '\u1EA3', '\u1EA1', '\u01B0', '\u01AF', '\u01A1', '\u01A0', '\u0169', '\u0129', '\u0111',
                '\u0103', '\u0102', '\u00FA', '\u00F9', '\u00F5', '\u1EB1', '\u00F3', '\u00F2', '\u00ED',
                '\u00EC', '\u00EA', '\u00E9', '\u00E8', '\u00E3', '\u00E2', '\u00E1', '\u00E0', '\u00D4',
                '\u0110', '\u00CA', '\u00C2'};

        // The entries must be applied strictly in table order: several target
        // values (U+00FA, U+00F9, U+00F5, ...) also occur as source values
        // earlier in the table, and each source is consumed before the later
        // entry produces the same code as a target.
        int slot = 0;
        while (slot < tcvnSingles.length) {
            str = str.replace(tcvnSingles[slot], unicodeSingles[slot]);
            slot++;
        }

        // Two-character table: every source entry is an uppercase base letter
        // followed by a lowercase toned letter (for example 'A' + a-grave);
        // the entry at the same index in the second table is the single
        // uppercase toned letter (A-grave).
        final String[] tcvnCapitalPairs = {"\u0041\u00E0", "\u0041\u1EA3", "\u0041\u00E3", "\u0041\u00E1",
                "\u0041\u1EA1", "\u0045\u00E8", "\u0045\u1EBB", "\u0045\u1EBD", "\u0045\u00E9",
                "\u0045\u1EB9", "\u0049\u00EC", "\u0049\u1EC9", "\u0049\u0129", "\u0049\u00ED",
                "\u0049\u1ECB", "\u004F\u00F2", "\u004F\u1ECF", "\u004F\u00F5", "\u004F\u00F3",
                "\u004F\u1ECD", "\u0055\u00F9", "\u0055\u1EE7", "\u0055\u0169", "\u0055\u00FA",
                "\u0055\u1EE5", "\u0059\u1EF3", "\u0059\u1EF7", "\u0059\u1EF9", "\u0059\u00FD",
                "\u0059\u1EF5", "\u0102\u1EB1", "\u0102\u1EB3", "\u0102\u1EB5", "\u0102\u1EAF",
                "\u0102\u1EB7", "\u00C2\u1EA7", "\u00C2\u1EA9", "\u00C2\u1EAB", "\u00C2\u1EA5",
                "\u00C2\u1EAD", "\u00CA\u1EC1", "\u00CA\u1EC3", "\u00CA\u1EC5", "\u00CA\u1EBF",
                "\u00CA\u1EC7", "\u00D4\u1ED3", "\u00D4\u1ED5", "\u00D4\u1ED7", "\u00D4\u1ED1",
                "\u00D4\u1ED9", "\u01A0\u1EDD", "\u01A0\u1EDF", "\u01A0\u1EE1", "\u01A0\u1EDB",
                "\u01A0\u1EE3", "\u01AF\u1EEB", "\u01AF\u1EED", "\u01AF\u1EEF", "\u01AF\u1EE9",
                "\u01AF\u1EF1"};
        final String[] unicodeCapitals = {"\u00C0", "\u1EA2", "\u00C3", "\u00C1", "\u1EA0", "\u00C8",
                "\u1EBA", "\u1EBC", "\u00C9", "\u1EB8", "\u00CC", "\u1EC8", "\u0128", "\u00CD", "\u1ECA",
                "\u00D2", "\u1ECE", "\u00D5", "\u00D3", "\u1ECC", "\u00D9", "\u1EE6", "\u0168", "\u00DA",
                "\u1EE4", "\u1EF2", "\u1EF6", "\u1EF8", "\u00DD", "\u1EF4", "\u1EB0", "\u1EB2", "\u1EB4",
                "\u1EAE", "\u1EB6", "\u1EA6", "\u1EA8", "\u1EAA", "\u1EA4", "\u1EAC", "\u1EC0", "\u1EC2",
                "\u1EC4", "\u1EBE", "\u1EC6", "\u1ED2", "\u1ED4", "\u1ED6", "\u1ED0", "\u1ED8", "\u1EDC",
                "\u1EDE", "\u1EE0", "\u1EDA", "\u1EE2", "\u1EEA", "\u1EEC", "\u1EEE", "\u1EE8", "\u1EF0"};

        // replaceString is defined elsewhere in this class; presumably it
        // substitutes each pair with the capital at the same index -- verify
        // against its definition.
        str = replaceString(str, tcvnCapitalPairs, unicodeCapitals);
    }
                            

    /**
     *  UTF8-to-Unicode conversion
     */
    private void UTF8toUnicode() {
        // Re-encodes the current text to Cp1252 bytes, repairs no-break spaces
        // that were degraded to plain spaces, and decodes the bytes as UTF-8.
        // The result replaces the text held in the str field.
        // (Presumably the text reached this method as UTF-8 bytes that had been
        // decoded through Cp1252 -- confirm with the callers.)
        //
        // Throws RuntimeException, with the original exception as its cause,
        // if either charset name is not supported by the runtime.
        try {
            byte[] aBytes = str.getBytes("Cp1252");

            // UTF-8 byte strings are frequently corrupted during handling or transmission.
            // Specifically, no-break spaces (0xA0 or 160) usually become regular spaces (0x20).
            // In the Vietnamese Unicode set, there are only four characters whose UTF-8
            // representations contain NBSP.

            // UTF-8 byte values  Unicode name
            // ------------------------------
            // 195 160        a with grave
            // 225 186 160    A with dot below
            // 198 160        O with horn
            // 225 187 160    O with horn and tilde

            // Replace spaces with NBSP where applicable. The scan starts at
            // index 1 because every pattern needs at least one preceding byte.
            for (int i = 1; i < aBytes.length; i++) {
                if (aBytes[i] != 0x20) {
                    continue; // only plain spaces are candidates for repair
                }

                final byte prev = aBytes[i - 1];

                // 0xC3 0x20 or 0xC6 0x20: two-byte sequence missing its 0xA0
                final boolean afterTwoByteLead =
                        (prev == (byte) 0xC3) || (prev == (byte) 0xC6);

                // 0xE1 0xBA 0x20 or 0xE1 0xBB 0x20: three-byte sequence missing its 0xA0
                final boolean afterThreeBytePrefix = (i > 1)
                        && (aBytes[i - 2] == (byte) 0xE1)
                        && ((prev == (byte) 0xBA) || (prev == (byte) 0xBB));

                if (afterTwoByteLead || afterThreeBytePrefix) {
                    aBytes[i] = (byte) 0xA0; // NBSP
                }
            }

            str = new String(aBytes, "UTF-8");

        } catch (UnsupportedEncodingException exc) {
            // Keep the original exception as the cause so the failing charset
            // name and stack trace are not lost.
            throw new RuntimeException("Unsupported encoding.", exc);
        }
    }

    /**
     *  Numeric Character References-to-Unicode conversion
     *
     */
    private void NCRtoUnicode() {
        // HTML documents get the shared HTML preprocessing first. Both helpers
        // are defined elsewhere in this class; judging by their names they
        // handle HTML entities and the charset meta tag -- verify against
        // their definitions.
        if (html) {
            HTMLtoANSI();
            prepareMetaTag();
        }

        // Decode the numeric references / escape sequences in the text, then
        // compose the result (NFD -> NFC) so base letters followed by
        // combining marks become precomposed characters.
        convertNCR();
        CompositetoPrecomposed();
    }
    
    /**
     *  Converts Numeric Character References and Unicode escape sequences to Unicode
     */
    private void convertNCR() {
        // Rewrites the text in the str field, one pass per prefix, in this order:
        //   "&#x" hex digits ";"     "&#" decimal digits ";"
        //   backslash-u + 4 chars    "U+" + 4 chars
        //   "#x" hex digits ";"      "#" decimal digits ";"
        // Longer prefixes run before the shorter prefixes they contain, so
        // "&#x41;" is consumed by the "&#x" pass and never seen by "&#" or "#".
        // A token that does not parse as a number is copied through unchanged.
        //
        // NOTE(review): when a token fails to parse, everything up to the
        // delimiter (or the 4 following characters) is copied verbatim, so a
        // well-formed reference nested inside that span is not converted in
        // that pass.
        final StringBuilder result = new StringBuilder();
        final String[] prefixes = {"&#x", "&#", "\\u", "U+", "#x", "#"};

        for (int i = 0; i < prefixes.length; i++) {
            final String prefix = prefixes[i];
            final int prefixLength = prefix.length();

            // Compare by value; identity comparison only worked because the
            // literals happen to be interned.
            final boolean decimal = prefix.equals("&#") || prefix.equals("#");
            final boolean fixedWidth = prefix.equals("\\u") || prefix.equals("U+");
            final int radix = decimal ? 10 : 16;

            final int length = str.length();
            int startIndex = 0;

            while (startIndex < length) {
                final int foundIndex = str.indexOf(prefix, startIndex);

                if (foundIndex == -1) {
                    // No further occurrence: keep the remainder as is.
                    result.append(str.substring(startIndex));
                    break;
                }

                // Text between the previous match and this one is kept as is.
                result.append(str.substring(startIndex, foundIndex));

                // endIndex is the index just past the token (fixed-width forms)
                // or the index of the terminating ';' (delimited forms).
                int endIndex;
                if (fixedWidth) {
                    endIndex = foundIndex + 6; // 2-char prefix + 4-char token
                    if (endIndex > length) {
                        endIndex = -1; // truncated escape sequence
                    }
                } else {
                    endIndex = str.indexOf(";", foundIndex);
                }

                if (endIndex == -1) {
                    // Unterminated or truncated reference: keep the rest as is.
                    result.append(str.substring(foundIndex));
                    break;
                }

                final String tok = str.substring(foundIndex + prefixLength, endIndex);

                try {
                    final int value = Integer.parseInt(tok, radix);
                    if (Character.isSupplementaryCodePoint(value)) {
                        // Code points above U+FFFF need a surrogate pair; a
                        // plain char cast would silently truncate them.
                        result.append(Character.toChars(value));
                    } else {
                        result.append((char) value);
                    }
                } catch (NumberFormatException nfe) {
                    // Not a number: restore the original text, including the
                    // ';' that terminated a delimited reference.
                    result.append(prefix).append(tok);
                    if (!fixedWidth) {
                        result.append(';');
                    }
                }

                // Resume after the token; delimited forms also skip the ';'.
                startIndex = fixedWidth ? endIndex : endIndex + 1;
            }

            // The output of this pass is the input of the next one.
            str = result.toString();
            result.setLength(0);
        }
    }
    
    /**
     *  Unicode Composite-to-Unicode Precomposed conversion
     *  (NFD -> NFC)
     */    
    private void CompositetoPrecomposed() {
        // Composes the text in the str field to NFC using the bundled Normalizer.
        //
        // Original author's note: this is the same bug as in the Unicode-to-VIQR
        // conversion, traced to the Java interpreter's inability to properly
        // resolve the package name on certain Windows machines; some other
        // installed applications appear to have interfered with Java.
        //
        // NOTE(review): the reason D/d with stroke (U+0110/U+0111) are swapped
        // to eth (U+00D0/U+00F0) around the normalize call is not evident from
        // this method. A visible consequence is that any U+00D0/U+00F0 present
        // after normalization -- including ones that were already in the
        // input -- comes out as U+0110/U+0111.
        final Normalizer nfcComposer = new Normalizer(Normalizer.C, false);

        // Step 1: swap D/d with stroke for eth before normalizing.
        final String shielded = str.replace('\u0110', '\u00D0')
                                   .replace('\u0111', '\u00F0');

        // Step 2: perform Unicode NFC on the (NFD) string.
        final String composed = nfcComposer.normalize(shielded);

        // Step 3: swap eth back to D/d with stroke.
        str = composed.replace('\u00D0', '\u0110')
                      .replace('\u00F0', '\u0111');
    }
}
