Java判断中国語文字
16803 ワード
// HanLP ,git :https://github.com/hankcs/HanLP/blob/master/src/main/java/com/hankcs/hanlp/utility/TextUtility.java
/**
*
*/
public class TextUtility
{
/**
 * Type code for a single-byte character: the fall-through result of
 * {@code charType(String)} when the first encoded byte is below 128 and is not a
 * control byte, a space, one of the listed punctuation marks, or a digit.
 */
public static final int CT_SINGLE = 5;// single-byte character
/**
 * Type code for a delimiter. {@code charType(String)} returns it for bytes below 32,
 * for the ASCII punctuation it lists (for example "!,.?()[]{}+=), and for two-byte
 * sequences whose first byte is 161 or 163 and that were not classified as a digit
 * or a letter.
 */
public static final int CT_DELIMITER = CT_SINGLE + 1;// delimiter
/**
 * Type code for a Chinese character: returned by {@code charType(String)} when the
 * first GBK byte lies in the range 176..247.
 */
public static final int CT_CHINESE = CT_SINGLE + 2;// Chinese character
/**
 * Type code for a letter: returned by {@code charType(String)} for a first byte of 163
 * followed by a second byte in 193..218 or 225..250.
 * NOTE(review): presumably the full-width Latin letters of the GBK table - confirm.
 */
public static final int CT_LETTER = CT_SINGLE + 3;// letter
/**
 * Type code for a digit: returned by {@code charType(String)} for the ASCII digits 0-9
 * and for a first byte of 163 followed by a second byte in 176..185.
 */
public static final int CT_NUM = CT_SINGLE + 4;// number
/**
 * Type code returned by {@code charType(String)} when the first encoded byte is 162.
 * NOTE(review): presumably index/ordinal symbols - confirm against the GBK table.
 */
public static final int CT_INDEX = CT_SINGLE + 5;// index symbol
/**
 * Type code returned by {@code charType(String)} when the input is contained in its
 * fixed numeral literal.
 * NOTE(review): the name suggests Chinese numerals - confirm against upstream.
 */
public static final int CT_CNUM = CT_SINGLE + 6;// numeral from the fixed literal
/**
 * Type code for everything else: returned by {@code charType(String)} for null or
 * empty input, for an ASCII space, and for any byte pattern no other rule matches.
 */
public static final int CT_OTHER = CT_SINGLE + 12;// Other
/**
 * Classifies a single character by delegating to {@link #charType(String)}.
 *
 * @param c the character to classify
 * @return the {@code CT_*} code computed for the one-character string made from {@code c}
 */
public static int charType(char c)
{
    final String single = Character.toString(c);
    return charType(single);
}
/**
*
* @param str
* @return
*/
public static int charType(String str)
{
if (str != null && str.length() > 0)
{
if (" ○ ".contains(str)) return CT_CNUM;
byte[] b;
try
{
b = str.getBytes("GBK");
}
catch (UnsupportedEncodingException e)
{
b = str.getBytes();
e.printStackTrace();
}
byte b1 = b[0];
byte b2 = b.length > 1 ? b[1] : 0;
int ub1 = getUnsigned(b1);
int ub2 = getUnsigned(b2);
if (ub1 < 128)
{
if (ub1 < 32) return CT_DELIMITER; // NON PRINTABLE CHARACTERS
if (' ' == b1) return CT_OTHER;
if ('
' == b1) return CT_DELIMITER;
if ("*\"!,.?()[]{}+=/\\;:|".indexOf((char) b1) != -1)
return CT_DELIMITER;
if ("0123456789".indexOf((char)b1) != -1)
return CT_NUM;
return CT_SINGLE;
}
else if (ub1 == 162)
return CT_INDEX;
else if (ub1 == 163 && ub2 > 175 && ub2 < 186)
return CT_NUM;
else if (ub1 == 163
&& (ub2 >= 193 && ub2 <= 218 || ub2 >= 225
&& ub2 <= 250))
return CT_LETTER;
else if (ub1 == 161 || ub1 == 163)
return CT_DELIMITER;
else if (ub1 >= 176 && ub1 <= 247)
return CT_CHINESE;
}
return CT_OTHER;
}
/**
 * Tells whether a string is non-empty and consists solely of characters in the
 * range U+4E00..U+9FA5.
 *
 * @param str the text to test; must not be null
 * @return true when every character lies in that range and there is at least one
 */
public static boolean isAllChinese(String str)
{
    final int length = str.length();
    if (length == 0)
    {
        return false;
    }
    for (int idx = 0; idx < length; idx++)
    {
        final char current = str.charAt(idx);
        if (current < 0x4E00 || current > 0x9FA5)
        {
            return false;
        }
    }
    return true;
}
/**
 * Scans an encoded byte sequence and reports whether no lead byte falls in the
 * range 176..247.
 * <p>
 * A negative byte is treated as the first half of a two-byte unit and the following
 * byte is skipped; a non-negative byte advances the scan by one.
 *
 * @param sString the encoded bytes to scan
 * @return false as soon as a lead byte in 176..247 is found, true otherwise
 */
public static boolean isAllNonChinese(byte[] sString)
{
    int pos = 0;
    while (pos < sString.length)
    {
        final int lead = getUnsigned(sString[pos]);
        if (lead > 175 && lead < 248)
        {
            return false;
        }
        pos += (sString[pos] < 0) ? 2 : 1;
    }
    return true;
}
/**
 * Tells whether no character of the string has a code above 128.
 *
 * @param str the text to test; asserted to be non-null
 * @return true when every character code is at most 128 (also for an empty string)
 */
public static boolean isAllSingleByte(String str)
{
    assert str != null;
    for (char ch : str.toCharArray())
    {
        if (ch > 128)
        {
            return false;
        }
    }
    return true;
}
/**
 * Parses a decimal integer, returning -1 instead of throwing on bad input.
 * <p>
 * Uses {@link Integer#parseInt(String)} in place of the deprecated
 * {@code new Integer(String)} constructor; both accept the same input.
 *
 * @param str the text to parse; may be null
 * @return the parsed value, or -1 when {@code str} is null or not a valid integer
 */
public static int cint(String str)
{
    if (str == null)
    {
        return -1;
    }
    try
    {
        return Integer.parseInt(str);
    }
    catch (NumberFormatException ignored)
    {
        // Deliberate: malformed input is reported through the -1 sentinel.
        return -1;
    }
}
/**
 * Tells whether a string looks like a number: an optional leading sign, ASCII digits,
 * up to two separator-plus-digits groups, and an optional trailing percent-style mark.
 * <p>
 * Fix: an empty string used to throw {@code StringIndexOutOfBoundsException} from
 * {@code charAt(0)}; it now returns false. A redundant digit-skipping pass that could
 * never advance was removed.
 *
 * @param str the text to test; may be null
 * @return true when the whole string is consumed by the pattern above, false for null
 *         or empty input or when any character is left over
 */
public static boolean isAllNum(String str)
{
    if (str == null || str.length() == 0)
        return false;
    final int length = str.length();
    int i = 0;
    // Optional sign.
    // NOTE(review): the repeated characters in these literals suggest full-width
    // variants were normalised away by an earlier text conversion - confirm upstream.
    if ("±+-+-—".indexOf(str.charAt(0)) != -1)
        i++;
    i = skipAsciiDigits(str, i);
    // First separator group, e.g. the ".1" of 98.1 (only after at least one character).
    if (i > 0 && i < length && "·∶:,,..//".indexOf(str.charAt(i)) != -1)
    {
        i = skipAsciiDigits(str, i + 1);
    }
    if (i >= length)
        return true;
    // Second separator group.
    if (i > 0 && i < length)
    {
        char ch = str.charAt(i);
        if (',' == ch || '.' == ch || '/' == ch || ':' == ch || "∶·,./".indexOf(ch) != -1)
        {
            i = skipAsciiDigits(str, i + 1);
        }
    }
    // Optional trailing mark.
    if (i < length && " %%‰".indexOf(str.charAt(i)) != -1)
        i++;
    return i >= length;
}

/** Returns the index of the first character at or after {@code from} that is not an ASCII digit. */
private static int skipAsciiDigits(String str, int from)
{
    int pos = from;
    while (pos < str.length() && "0123456789".indexOf(str.charAt(pos)) != -1)
        pos++;
    return pos;
}
/**
 * Tells whether a byte sequence consists of two-byte units whose lead byte is 162,
 * optionally followed by ASCII letters only.
 * <p>
 * Fix: the letter loop was written as {@code i < nLen && upper || lower}, which parses
 * as {@code (i < nLen && upper) || lower}. Once the scan reached the end of the array
 * the lower-case test still read {@code sString[nLen]} and threw
 * {@code ArrayIndexOutOfBoundsException}. The bounds check now guards both tests.
 *
 * @param sString the encoded bytes to scan
 * @return true when the whole array is consumed by the pattern above
 */
public static boolean isAllIndex(byte[] sString)
{
    final int nLen = sString.length;
    int i = 0;
    // Leading two-byte units whose first byte, read as unsigned, is 162.
    while (i < nLen - 1 && (sString[i] & 0xFF) == 162)
    {
        i += 2;
    }
    if (i >= nLen)
        return true;
    // Trailing single-byte ASCII letters.
    while (i < nLen
            && ((sString[i] >= 'A' && sString[i] <= 'Z')
            || (sString[i] >= 'a' && sString[i] <= 'z')))
    {
        i += 1;
    }
    return i >= nLen;
}
/**
 * Tells whether every character of the text is an ASCII letter (a-z or A-Z).
 *
 * @param text the text to test; must not be null
 * @return true when no other character occurs (also for an empty string)
 */
public static boolean isAllLetter(String text)
{
    for (char ch : text.toCharArray())
    {
        final boolean lower = ch >= 'a' && ch <= 'z';
        final boolean upper = ch >= 'A' && ch <= 'Z';
        if (!lower && !upper)
        {
            return false;
        }
    }
    return true;
}
/**
 * Tells whether every character of the text is an ASCII letter or an ASCII digit.
 *
 * @param text the text to test; must not be null
 * @return true when no other character occurs (also for an empty string)
 */
public static boolean isAllLetterOrNum(String text)
{
    for (char ch : text.toCharArray())
    {
        final boolean lower = ch >= 'a' && ch <= 'z';
        final boolean upper = ch >= 'A' && ch <= 'Z';
        final boolean digit = ch >= '0' && ch <= '9';
        if (!(lower || upper || digit))
        {
            return false;
        }
    }
    return true;
}
/**
 * Tells whether a byte sequence is made up entirely of two-byte units whose lead
 * byte is 161 or 163.
 *
 * @param sString the encoded bytes to scan
 * @return true when the scan consumes the whole array, false when a unit with another
 *         lead byte, or a trailing single byte, remains
 */
public static boolean isAllDelimiter(byte[] sString)
{
    final int total = sString.length;
    int pos = 0;
    for (; pos < total - 1; pos += 2)
    {
        final int lead = getUnsigned(sString[pos]);
        if (lead != 161 && lead != 163)
        {
            break;
        }
    }
    return pos >= total;
}
/**
 * Tells whether a word is built only from the characters of a fixed numeral set,
 * allowing one prefix character at the start or one suffix character at the end.
 * <p>
 * Wherever the two-position skip literal matches, that position and the next one are
 * skipped without further checks.
 * NOTE(review): the literals below consist mostly of blanks; their original non-ASCII
 * characters appear to have been stripped - restore them from upstream and confirm.
 *
 * @param word the word to test; may be null
 * @return false for null or when a character outside the allowed sets is found,
 *         true otherwise (also for an empty string)
 */
public static boolean isAllChineseNum(String word)
{
    final String numerals = " ○ ∶·./ ";
    final String leading = " ";
    final String trailing = " ";
    if (word == null)
    {
        return false;
    }
    final char[] chars = word.toCharArray();
    final int lastIndex = chars.length - 1;
    boolean affixSeen = false;
    int pos = 0;
    while (pos < chars.length)
    {
        if (word.startsWith(" ", pos))
        {
            // Skip the matched position and the one after it.
            pos += 2;
            continue;
        }
        final char current = chars[pos];
        if (pos == 0 && leading.indexOf(current) != -1)
        {
            affixSeen = true;
        }
        else if (pos == lastIndex && !affixSeen && trailing.indexOf(current) != -1)
        {
            affixSeen = true;
        }
        else if (numerals.indexOf(current) == -1)
        {
            return false;
        }
        pos++;
    }
    return true;
}
/**
 * Counts how many characters of a word occur in a character set.
 *
 * @param charSet the characters to look for
 * @param word    the word to scan; may be null
 * @return the number of positions in {@code word} whose character appears in
 *         {@code charSet}; 0 when {@code word} is null
 */
public static int getCharCount(String charSet, String word)
{
    if (word == null)
    {
        return 0;
    }
    int matches = 0;
    for (int idx = 0; idx < word.length(); idx++)
    {
        if (charSet.indexOf(word.charAt(idx)) >= 0)
        {
            matches++;
        }
    }
    return matches;
}
/**
 * Converts a signed byte to its unsigned value in the range 0..255.
 *
 * @param b the byte to convert
 * @return {@code b} itself when non-negative, otherwise {@code b + 256}
 */
public static int getUnsigned(byte b)
{
    // Masking keeps the low eight bits, which is the unsigned value for every byte.
    return b & 0xFF;
}
/**
 * Heuristically decides whether a string denotes a year.
 * <p>
 * The string is accepted when any of these holds:
 * <ul>
 * <li>it passes {@code isAllSingleByte} and has length 4, or length 2 with a first
 * character that parses to 0 or to more than 4 (e.g. 1992, 98, 06);</li>
 * <li>it passes {@code isAllNum} and has length 3 or more, or length 2 starting with
 * one of 0, 5, 6, 7, 8, 9;</li>
 * <li>one of the character-set rules based on {@code getCharCount} matches.</li>
 * </ul>
 * Fix: an empty string used to throw {@code StringIndexOutOfBoundsException} from
 * {@code substring(0, 1)}; it now returns false.
 * NOTE(review): the character-set literals consist mostly of blanks; their original
 * non-ASCII characters appear to have been stripped - restore from upstream.
 *
 * @param snum the text to test; may be null
 * @return true when one of the rules above matches, false otherwise
 */
public static boolean isYearTime(String snum)
{
    if (snum == null || snum.isEmpty())
    {
        return false;
    }
    int len = snum.length();
    String first = snum.substring(0, 1);
    // 1992, 98, 06
    if (isAllSingleByte(snum)
            && (len == 4 || (len == 2 && (cint(first) > 4 || cint(first) == 0))))
        return true;
    if (isAllNum(snum) && (len >= 3 || (len == 2 && "056789".indexOf(first) != -1)))
        return true;
    if (getCharCount(" ○ ", snum) == len && len >= 2)
        return true;
    if (len == 4 && getCharCount(" ○", snum) == 2)
        return true;
    if (len == 1 && getCharCount(" ", snum) == 1)
        return true;
    if (len == 2 && getCharCount(" ", snum) == 1
            && getCharCount(" ", snum.substring(1)) == 1)
        return true;
    return false;
}
/**
 * Tells whether every character of a string belongs to a character set.
 * <p>
 * Fix: "1" was appended to {@code str} as a sentinel, but the loop ran over the
 * lengthened string. The sentinel itself was looked up, so the method returned false
 * unless {@code aggr} happened to contain '1'. Only the caller's characters are
 * checked now.
 *
 * @param aggr the set of allowed characters; may be null
 * @param str  the text to test; may be null
 * @return true when both arguments are non-null and each character of {@code str}
 *         occurs in {@code aggr} (trivially true for an empty {@code str})
 */
public static boolean isInAggregate(String aggr, String str)
{
    if (aggr == null || str == null)
    {
        return false;
    }
    for (int i = 0; i < str.length(); i++)
    {
        if (aggr.indexOf(str.charAt(i)) == -1)
        {
            return false;
        }
    }
    return true;
}
/**
 * Tells whether every character of a string, plus one appended space, encodes to
 * exactly one byte in GBK.
 * <p>
 * If the GBK charset is unavailable the stack trace is printed and the platform
 * default encoding is used for that character.
 *
 * @param str the text to test; may be null
 * @return false for null or when some character encodes to another byte count,
 *         true otherwise
 */
public static boolean isDBCCase(String str)
{
    if (str == null)
    {
        return false;
    }
    final String padded = str + " ";
    for (int idx = 0; idx < padded.length(); idx++)
    {
        final String unit = String.valueOf(padded.charAt(idx));
        int encodedLength;
        try
        {
            encodedLength = unit.getBytes("GBK").length;
        }
        catch (UnsupportedEncodingException e)
        {
            e.printStackTrace();
            encodedLength = unit.getBytes().length;
        }
        if (encodedLength != 1)
        {
            return false;
        }
    }
    return true;
}
/**
 * Tells whether every character of a string encodes to exactly two bytes in GBK.
 * <p>
 * Fix: a space was appended to {@code str} and then scanned along with the input.
 * Because that space encodes to one byte, the method returned false for every
 * non-null argument. Only the caller's characters are examined now.
 * <p>
 * If the GBK charset is unavailable the stack trace is printed and the platform
 * default encoding is used for that character.
 *
 * @param str the text to test; may be null
 * @return false for null or when some character does not encode to two bytes,
 *         true otherwise (trivially true for an empty string)
 */
public static boolean isSBCCase(String str)
{
    if (str == null)
    {
        return false;
    }
    for (int i = 0; i < str.length(); i++)
    {
        String s = str.substring(i, i + 1);
        int length;
        try
        {
            length = s.getBytes("GBK").length;
        }
        catch (UnsupportedEncodingException e)
        {
            // Best-effort fallback, matching the original behaviour.
            e.printStackTrace();
            length = s.getBytes().length;
        }
        if (length != 2)
        {
            return false;
        }
    }
    return true;
}
/**
 * Tells whether a string is exactly the hyphen delimiter.
 * NOTE(review): the two comparisons are identical as written; one of them presumably
 * used a full-width dash before the text was normalised - confirm against upstream.
 *
 * @param str the text to test; may be null
 * @return true when {@code str} equals the delimiter, false otherwise (also for null)
 */
public static boolean isDelimiter(String str)
{
    if (str == null)
    {
        return false;
    }
    return "-".equals(str) || "-".equals(str);
}
/**
 * Tells whether a word starts with the unknown-word marker prefix.
 * NOTE(review): the prefix literal begins with a blank that presumably replaced a
 * stripped non-ASCII character - confirm against upstream.
 *
 * @param word the word to test; may be null
 * @return true when {@code word} is non-null and begins with the marker
 */
public static boolean isUnknownWord(String word)
{
    return word != null && word.startsWith(" ##");
}
/**
 * Replaces a zero frequency with a small positive value.
 *
 * @param frequency the value to guard
 * @return 1e-3 when {@code frequency} compares equal to zero, otherwise
 *         {@code frequency} unchanged
 */
public static double nonZero(double frequency)
{
    return frequency == 0 ? 1e-3 : frequency;
}
/**
 * Splits a long into four chars, most significant 16 bits first.
 *
 * @param x the value to split
 * @return a new four-element array holding bits 63..48, 47..32, 31..16 and 15..0
 */
public static char[] long2char(long x)
{
    final char[] units = new char[4];
    for (int k = 0; k < units.length; k++)
    {
        // Shift amounts are 48, 32, 16 and 0; the cast keeps the low 16 bits.
        units[k] = (char) (x >> (16 * (3 - k)));
    }
    return units;
}
/**
 * Converts a long into a four-character string using {@code long2char}.
 *
 * @param x the value to convert
 * @return a string made of the four chars produced by {@code long2char(x)}, in order
 */
public static String long2String(long x)
{
    return new String(long2char(x));
}
/**
 * Renders an exception's stack trace as a string.
 *
 * @param e the exception to render; must not be null
 * @return the text that {@code e.printStackTrace} writes
 */
public static String exceptionToString(Exception e)
{
    final StringWriter buffer = new StringWriter();
    e.printStackTrace(new PrintWriter(buffer));
    return buffer.toString();
}
/**
 * Tells whether a character lies in the range U+4E00..U+9FA5.
 *
 * @param c the character to test
 * @return true when {@code c} is inside that range, false otherwise
 */
public static boolean isChinese(char c)
{
    return c >= 0x4E00 && c <= 0x9FA5;
}
/**
 * Counts the non-overlapping occurrences of a keyword in a text, scanning left to
 * right.
 * <p>
 * Fix: an empty keyword used to throw {@code StringIndexOutOfBoundsException} from
 * {@code keyword.charAt(0)}; it now yields 0. The hand-written matcher is replaced by
 * {@link String#indexOf(String, int)}, which finds the same leftmost matches.
 *
 * @param keyword the text to look for
 * @param srcText the text to search in; must not be null
 * @return the number of non-overlapping matches; 0 when either string is empty
 */
public static int count(String keyword, String srcText)
{
    // An empty text has no matches whatever the keyword is, as before.
    if (srcText.length() == 0 || keyword.isEmpty())
    {
        return 0;
    }
    int count = 0;
    int from = srcText.indexOf(keyword);
    while (from != -1)
    {
        count++;
        // Resume after the match so that occurrences never overlap.
        from = srcText.indexOf(keyword, from + keyword.length());
    }
    return count;
}
/**
 * Writes a string to a stream as its length followed by its characters.
 * <p>
 * The length is written with {@code writeInt}; each character follows as two bytes,
 * high byte first.
 *
 * @param s   the string to write; must not be null
 * @param out the stream to write to
 * @throws IOException if the underlying stream fails
 */
public static void writeString(String s, DataOutputStream out) throws IOException
{
    final int charCount = s.length();
    out.writeInt(charCount);
    out.writeChars(s);
}
/**
 * Tells whether a character sequence is null, empty, or whitespace only, as judged
 * by {@link Character#isWhitespace(char)}.
 *
 * @param cs the sequence to test; may be null
 * @return true when there is no non-whitespace character
 */
public static boolean isBlank(CharSequence cs)
{
    if (cs == null)
    {
        return true;
    }
    final int length = cs.length();
    int idx = 0;
    while (idx < length && Character.isWhitespace(cs.charAt(idx)))
    {
        idx++;
    }
    return idx == length;
}