Javaで中国語の文字を判定する

16803 ワード

// Taken from the HanLP project; source on GitHub: https://github.com/hankcs/HanLP/blob/master/src/main/java/com/hankcs/hanlp/utility/TextUtility.java
/**
 * Static text helpers: character-type classification and string content checks.
 */
public class TextUtility
{

    /**
     * Generic single-byte character. {@code charType(String)} returns it for a first
     * GBK byte below 128 that is not a control character, a space, one of the listed
     * ASCII delimiters, or an ASCII digit. It is also the base value the other
     * category codes are offset from.
     */
    public static final int CT_SINGLE = 5;// SINGLE byte

    /**
     * Delimiter / punctuation such as "!,.?()[]{}+= . {@code charType(String)} also
     * returns it for control characters (first byte below 32) and for GBK lead bytes
     * 161 and 163 that are not classified as a number or a letter.
     */
    public static final int CT_DELIMITER = CT_SINGLE + 1;// delimiter

    /**
     * Chinese character: {@code charType(String)} returns it when the GBK lead byte
     * is in the range 176..247.
     */
    public static final int CT_CHINESE = CT_SINGLE + 2;// Chinese Char

    /**
     * Letter: returned by {@code charType(String)} for GBK lead byte 163 with a second
     * byte in 193..218 or 225..250 (presumably the full-width Latin letters - confirm).
     * Plain ASCII letters are reported as {@link #CT_SINGLE}, not as this value.
     */
    public static final int CT_LETTER = CT_SINGLE + 3;// letter

    /**
     * Digit: returned by {@code charType(String)} for the ASCII digits 0-9 and for GBK
     * lead byte 163 with a second byte in 176..185.
     */
    public static final int CT_NUM = CT_SINGLE + 4;// number

    /**
     * Index / ordinal symbol: returned by {@code charType(String)} when the GBK lead
     * byte is 162.
     */
    public static final int CT_INDEX = CT_SINGLE + 5;// index symbol

    /**
     * Returned by {@code charType(String)} when the input is contained in a fixed
     * literal that is checked before any byte-level test. Judging by the name this is
     * the set of Chinese numerals; the literal's contents are not legible in this copy
     * of the file - verify against the upstream source.
     */
    public static final int CT_CNUM = CT_SINGLE + 6;

    /**
     * Fallback category: returned by {@code charType(String)} for null or empty input,
     * for the ASCII space, and for anything no other rule matches.
     */
    public static final int CT_OTHER = CT_SINGLE + 12;// Other

    /**
     * Classifies a single character by delegating to {@link #charType(String)}.
     *
     * @param c the character to classify
     * @return the CT_* code reported for the one-character string built from {@code c}
     */
    public static int charType(char c)
    {
        final String single = String.valueOf(c);
        return charType(single);
    }

    /**
     *       
     * @param str
     * @return
     */
    public static int charType(String str)
    {
        if (str != null && str.length() > 0)
        {
            if (" ○                             ".contains(str)) return CT_CNUM;
            byte[] b;
            try
            {
                b = str.getBytes("GBK");
            }
            catch (UnsupportedEncodingException e)
            {
                b = str.getBytes();
                e.printStackTrace();
            }
            byte b1 = b[0];
            byte b2 = b.length > 1 ? b[1] : 0;
            int ub1 = getUnsigned(b1);
            int ub2 = getUnsigned(b2);
            if (ub1 < 128)
            {
                if (ub1 < 32) return CT_DELIMITER; // NON PRINTABLE CHARACTERS
                if (' ' == b1) return CT_OTHER;
                if ('
' == b1) return CT_DELIMITER; if ("*\"!,.?()[]{}+=/\\;:|".indexOf((char) b1) != -1) return CT_DELIMITER; if ("0123456789".indexOf((char)b1) != -1) return CT_NUM; return CT_SINGLE; } else if (ub1 == 162) return CT_INDEX; else if (ub1 == 163 && ub2 > 175 && ub2 < 186) return CT_NUM; else if (ub1 == 163 && (ub2 >= 193 && ub2 <= 218 || ub2 >= 225 && ub2 <= 250)) return CT_LETTER; else if (ub1 == 161 || ub1 == 163) return CT_DELIMITER; else if (ub1 >= 176 && ub1 <= 247) return CT_CHINESE; } return CT_OTHER; } /** * * @param str * @return */ public static boolean isAllChinese(String str) { return str.matches("[\\u4E00-\\u9FA5]+"); } /** * * @param sString * @return */ public static boolean isAllNonChinese(byte[] sString) { int nLen = sString.length; int i = 0; while (i < nLen) { if (getUnsigned(sString[i]) < 248 && getUnsigned(sString[i]) > 175) return false; if (sString[i] < 0) i += 2; else i += 1; } return true; } /** * * @param str * @return */ public static boolean isAllSingleByte(String str) { assert str != null; for (int i = 0; i < str.length(); i++) { if (str.charAt(i) >128) { return false; } } return true; } /** * * * @param str * @return , 。 , -1。 */ public static int cint(String str) { if (str != null) try { int i = new Integer(str).intValue(); return i; } catch (NumberFormatException e) { } return -1; } /** * * @param str * @return */ public static boolean isAllNum(String str) { if (str == null) return false; int i = 0; /** +- */ if ("±+-+-—".indexOf(str.charAt(0)) != -1) i++; /** 0123456789 * */ while (i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1) i++; // Get middle delimiter such as . if (i > 0 && i < str.length()) { char ch = str.charAt(i); if ("·∶:,,..//".indexOf(ch) != -1) {// 98.1% i++; while (i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1) i++; } } if (i >= str.length()) return true; /** 0123456789 * */ while (i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1) i++; // Get middle delimiter such as . 
if (i > 0 && i < str.length()) { char ch = str.charAt(i); if (',' == ch || '.' == ch || '/' == ch || ':' == ch || "∶·,./".indexOf(ch) != -1) {// 98.1% i++; while (i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1) i++; } } if (i < str.length()) { if (" %%‰".indexOf(str.charAt(i)) != -1) i++; } if (i >= str.length()) return true; return false; } /** * * @param sString * @return */ public static boolean isAllIndex(byte[] sString) { int nLen = sString.length; int i = 0; while (i < nLen - 1 && getUnsigned(sString[i]) == 162) { i += 2; } if (i >= nLen) return true; while (i < nLen && (sString[i] > 'A' - 1 && sString[i] < 'Z' + 1) || (sString[i] > 'a' - 1 && sString[i] < 'z' + 1)) {// single // byte // number // char i += 1; } if (i < nLen) return false; return true; } /** * * * @param text * @return */ public static boolean isAllLetter(String text) { for (int i = 0; i < text.length(); ++i) { char c = text.charAt(i); if ((((c < 'a' || c > 'z')) && ((c < 'A' || c > 'Z')))) { return false; } } return true; } /** * * * @param text * @return */ public static boolean isAllLetterOrNum(String text) { for (int i = 0; i < text.length(); ++i) { char c = text.charAt(i); if ((((c < 'a' || c > 'z')) && ((c < 'A' || c > 'Z')) && ((c < '0' || c > '9')))) { return false; } } return true; } /** * * @param sString * @return */ public static boolean isAllDelimiter(byte[] sString) { int nLen = sString.length; int i = 0; while (i < nLen - 1 && (getUnsigned(sString[i]) == 161 || getUnsigned(sString[i]) == 163)) { i += 2; } if (i < nLen) return false; return true; } /** * * @param word * @return */ public static boolean isAllChineseNum(String word) {// String chineseNum = " ○ ∶·./ ";// String prefix = " "; String surfix = " "; boolean round = false; if (word == null) return false; char[] temp = word.toCharArray(); for (int i = 0; i < temp.length; i++) { if (word.startsWith(" ", i))// { i += 1; continue; } char tchar = temp[i]; if (i == 0 && prefix.indexOf(tchar) != -1) { round = 
true; } else if (i == temp.length-1 && !round && surfix.indexOf(tchar) != -1) { round = true; } else if (chineseNum.indexOf(tchar) == -1) return false; } return true; } /** * * * @param charSet * @param word * @return */ public static int getCharCount(String charSet, String word) { int nCount = 0; if (word != null) { String temp = word + " "; for (int i = 0; i < word.length(); i++) { String s = temp.substring(i, i + 1); if (charSet.indexOf(s) != -1) nCount++; } } return nCount; } /** * * * @param b * @return */ public static int getUnsigned(byte b) { if (b > 0) return (int) b; else return (b & 0x7F + 128); } /** * * * @param snum * @return */ public static boolean isYearTime(String snum) { if (snum != null) { int len = snum.length(); String first = snum.substring(0, 1); // 1992 , 98 ,06 if (isAllSingleByte(snum) && (len == 4 || len == 2 && (cint(first) > 4 || cint(first) == 0))) return true; if (isAllNum(snum) && (len >= 3 || len == 2 && "056789".indexOf(first) != -1)) return true; if (getCharCount(" ○ ", snum) == len && len >= 2) return true; if (len == 4 && getCharCount(" ○", snum) == 2)// return true; if (len == 1 && getCharCount(" ", snum) == 1) return true; if (len == 2 && getCharCount(" ", snum) == 1 && getCharCount(" ", snum.substring(1)) == 1) return true; } return false; } /** * * * @param aggr * @param str * @return */ public static boolean isInAggregate(String aggr, String str) { if (aggr != null && str != null) { str += "1"; for (int i = 0; i < str.length(); i++) { String s = str.substring(i, i + 1); if (aggr.indexOf(s) == -1) return false; } return true; } return false; } /** * * * @param str * @return */ public static boolean isDBCCase(String str) { if (str != null) { str += " "; for (int i = 0; i < str.length(); i++) { String s = str.substring(i, i + 1); int length = 0; try { length = s.getBytes("GBK").length; } catch (UnsupportedEncodingException e) { e.printStackTrace(); length = s.getBytes().length; } if (length != 1) return false; } return true; 
} return false; } /** * * * @param str * @return */ public static boolean isSBCCase(String str) { if (str != null) { str += " "; for (int i = 0; i < str.length(); i++) { String s = str.substring(i, i + 1); int length = 0; try { length = s.getBytes("GBK").length; } catch (UnsupportedEncodingException e) { e.printStackTrace(); length = s.getBytes().length; } if (length != 2) return false; } return true; } return false; } /** * ( ) * * @param str * @return */ public static boolean isDelimiter(String str) { if (str != null && ("-".equals(str) || "-".equals(str))) return true; else return false; } public static boolean isUnknownWord(String word) { if (word != null && word.indexOf(" ##") == 0) return true; else return false; } /** * 0 * * @param frequency * @return */ public static double nonZero(double frequency) { if (frequency == 0) return 1e-3; return frequency; } /** * long char * * @param x */ public static char[] long2char(long x) { char[] c = new char[4]; c[0] = (char) (x >> 48); c[1] = (char) (x >> 32); c[2] = (char) (x >> 16); c[3] = (char) (x); return c; } /** * long string * * @param x * @return */ public static String long2String(long x) { char[] cArray = long2char(x); StringBuilder sbResult = new StringBuilder(cArray.length); for (char c : cArray) { sbResult.append(c); } return sbResult.toString(); } /** * * * @param e * @return */ public static String exceptionToString(Exception e) { StringWriter sw = new StringWriter(); PrintWriter pw = new PrintWriter(sw); e.printStackTrace(pw); return sw.toString(); } /** * * * @param c * @return true, false */ public static boolean isChinese(char c) { String regex = "[\\u4e00-\\u9fa5]"; return String.valueOf(c).matches(regex); } /** * keyword srcText * * @param keyword * @param srcText * @return */ public static int count(String keyword, String srcText) { int count = 0; int leng = srcText.length(); int j = 0; for (int i = 0; i < leng; i++) { if (srcText.charAt(i) == keyword.charAt(j)) { j++; if (j == keyword.length()) 
{ count++; j = 0; } } else { i = i - j;// should rollback when not match j = 0; } } return count; } /** * String * * @param s * @param out * @throws IOException */ public static void writeString(String s, DataOutputStream out) throws IOException { out.writeInt(s.length()); for (char c : s.toCharArray()) { out.writeChar(c); } } /** * (null ) * * @param cs * @return */ public static boolean isBlank(CharSequence cs) { int strLen; if (cs == null || (strLen = cs.length()) == 0) { return true; } for (int i = 0; i < strLen; i++) { if (!Character.isWhitespace(cs.charAt(i))) { return false; } } return true; }