// 単純分詞アルゴリズム(二分法java実装)



package com.searchkiller;

/**
 * Simple bigram ("binary") word splitter.
 *
 * <p>The input is cut into maximal runs of Han characters (code units in the
 * range U+4E00..U+9FA5) and maximal runs of all other characters. A Han run
 * longer than two characters is emitted as overlapping two-character tokens;
 * a Han run of one or two characters is emitted as is. Every other run is
 * emitted unchanged as a single token.
 *
 * <p>Example: three Han characters followed by "ABC", two Han characters,
 * "DE" and two more Han characters produce six tokens: the two overlapping
 * bigrams of the first run, "ABC", the second run, "DE", and the third run.
 */
public class BinarySplit {

	/** Text to split; {@code null} is treated as having nothing to split. */
	String splitString;

	/**
	 * Creates a splitter for the given text.
	 *
	 * @param splitString the text to split
	 */
	public BinarySplit(String splitString) {
		super();
		this.splitString = splitString;
	}

	/**
	 * Splits the text and prints one token per line to {@code System.out},
	 * in the order the tokens occur in the text.
	 *
	 * <p>Every run is emitted, including a non-Han run at the very end of the
	 * text (or a text that contains no Han characters at all).
	 */
	public void splitIt() {
		if (splitString == null) {
			return;
		}
		final int length = splitString.length();
		int runStart = 0;
		while (runStart < length) {
			final boolean hanRun = isChinese(splitString.charAt(runStart));
			// Extend the run while the following characters are of the same kind.
			int runEnd = runStart + 1;
			while (runEnd < length
					&& isChinese(splitString.charAt(runEnd)) == hanRun) {
				runEnd++;
			}
			final String run = splitString.substring(runStart, runEnd);
			if (hanRun) {
				printBigrams(run);
			} else {
				System.out.println(run);
			}
			runStart = runEnd;
		}
	}

	/** Returns whether {@code c} lies in the range U+4E00..U+9FA5. */
	private static boolean isChinese(char c) {
		return c >= '\u4e00' && c <= '\u9fa5';
	}

	/**
	 * Prints a Han run: overlapping two-character windows when the run is
	 * longer than two characters, otherwise the run itself.
	 */
	private static void printBigrams(String run) {
		if (run.length() <= 2) {
			System.out.println(run);
			return;
		}
		for (int j = 0; j + 2 <= run.length(); j++) {
			System.out.println(run.substring(j, j + 2));
		}
	}

	/**
	 * Demo entry point: splits a fixed sample string and prints the tokens.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		// NOTE(review): this sample contains only spaces and Latin letters; the
		// Han characters it presumably held look lost in transit - confirm
		// against the original source and restore them.
		BinarySplit bs = new BinarySplit("   ABC  DE      ");
		bs.splitIt();
	}

}