java.lang.Character.UnicodeBlock# LATIN_EXTENDED_B 源码实例Demo

下面列出了java.lang.Character.UnicodeBlock# LATIN_EXTENDED_B 实例代码,或者点击链接到github查看源代码,也可以在右侧发表评论。

源代码1 项目: weslang   文件: NGram.java

/**
 * Character Normalization
 * @param ch
 * @return Normalized character
 */
static public char normalize(char ch) {
    Character.UnicodeBlock block = Character.UnicodeBlock.of(ch);
    if (block == UnicodeBlock.BASIC_LATIN) {
        if (ch<'A' || (ch<'a' && ch >'Z') || ch>'z') ch = ' ';
    } else if (block == UnicodeBlock.LATIN_1_SUPPLEMENT) {
        if (LATIN1_EXCLUDED.indexOf(ch)>=0) ch = ' ';
    } else if (block == UnicodeBlock.LATIN_EXTENDED_B) {
        // normalization for Romanian
        if (ch == '\u0219') ch = '\u015f';  // Small S with comma below => with cedilla
        if (ch == '\u021b') ch = '\u0163';  // Small T with comma below => with cedilla
    } else if (block == UnicodeBlock.GENERAL_PUNCTUATION) {
        ch = ' ';
    } else if (block == UnicodeBlock.ARABIC) {
        if (ch == '\u06cc') ch = '\u064a';  // Farsi yeh => Arabic yeh
    } else if (block == UnicodeBlock.LATIN_EXTENDED_ADDITIONAL) {
        if (ch >= '\u1ea0') ch = '\u1ec3';
    } else if (block == UnicodeBlock.HIRAGANA) {
        ch = '\u3042';
    } else if (block == UnicodeBlock.KATAKANA) {
        ch = '\u30a2';
    } else if (block == UnicodeBlock.BOPOMOFO || block == UnicodeBlock.BOPOMOFO_EXTENDED) {
        ch = '\u3105';
    } else if (block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
        if (cjk_map.containsKey(ch)) ch = cjk_map.get(ch);
    } else if (block == UnicodeBlock.HANGUL_SYLLABLES) {
        ch = '\uac00';
    }
    return ch;
}
 
源代码2 项目: language-detection   文件: NGram.java

/**
 * Character Normalization
 * @param ch
 * @return Normalized character
 */
static public char normalize(char ch) {
    Character.UnicodeBlock block = Character.UnicodeBlock.of(ch);
    if (block == UnicodeBlock.BASIC_LATIN) {
        if (ch<'A' || (ch<'a' && ch >'Z') || ch>'z') ch = ' ';
    } else if (block == UnicodeBlock.LATIN_1_SUPPLEMENT) {
        if (LATIN1_EXCLUDED.indexOf(ch)>=0) ch = ' ';
    } else if (block == UnicodeBlock.LATIN_EXTENDED_B) {
        // normalization for Romanian
        if (ch == '\u0219') ch = '\u015f';  // Small S with comma below => with cedilla
        if (ch == '\u021b') ch = '\u0163';  // Small T with comma below => with cedilla
    } else if (block == UnicodeBlock.GENERAL_PUNCTUATION) {
        ch = ' ';
    } else if (block == UnicodeBlock.ARABIC) {
        if (ch == '\u06cc') ch = '\u064a';  // Farsi yeh => Arabic yeh
    } else if (block == UnicodeBlock.LATIN_EXTENDED_ADDITIONAL) {
        if (ch >= '\u1ea0') ch = '\u1ec3';
    } else if (block == UnicodeBlock.HIRAGANA) {
        ch = '\u3042';
    } else if (block == UnicodeBlock.KATAKANA) {
        ch = '\u30a2';
    } else if (block == UnicodeBlock.BOPOMOFO || block == UnicodeBlock.BOPOMOFO_EXTENDED) {
        ch = '\u3105';
    } else if (block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
        if (cjk_map.containsKey(ch)) ch = cjk_map.get(ch);
    } else if (block == UnicodeBlock.HANGUL_SYLLABLES) {
        ch = '\uac00';
    }
    return ch;
}