禁止ワード(NGワード)の検出

5751 ワード

ミュートワードセンシングワード

UTF-8 でエンコードされたデータであれば、以下のコードをそのまま使用できます。
最も重要なステップは、文字列を1文字ずつに分解することです。UTF-8 では、1バイトの文字は最上位ビットが 0 になります。マルチバイトの文字では、先頭バイトの最上位ビットから連続する 1 の個数がその文字のバイト数を表し、後続の各バイトは 10 で始まります。
UTF-8 は(当初の仕様では)1文字に最大6バイトまで使用できます。なお、現行の RFC 3629 では最大4バイトに制限されています。
1バイト: 0xxxxxxx
2バイト: 110xxxxx 10xxxxxx
3バイト: 1110xxxx 10xxxxxx 10xxxxxx
4バイト: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
5バイト: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
6バイト: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
それ以外の処理は簡単です。
1. 禁止ワードの登録:禁止ワードを文字単位に分割し、先頭の文字をキーとして保存する
2. 検査対象文字列の処理
a) 文字単位に分割する
b) 大文字を小文字に、全角の英字とスペースを半角に変換し、余分なスペースを取り除く(英字の後のスペースは最大1つまで、中国語の文字の後にはスペースを置かない)
c) 文字列中のすべての文字を走査し、各文字をキーとする禁止ワード群のいずれかが検査対象文字列に含まれているかどうかを調べる

#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

/// Banned-word filter for UTF-8 text.
///
/// Banned words are registered with AddOneDisableWord() and bucketed by the
/// first piece SplitWord() produces for them. CheckStr() then reports whether
/// a text is free of every registered word.
class CDisableWord
{
    /// One registered banned word, stored exactly as it was passed in.
    struct SDisableWord
    {
        std::string str;
    };
    /// All banned words that share the same first piece.
    typedef std::vector<SDisableWord> VDW;

private:
    /// First piece of a banned word -> every banned word starting with it.
    std::map<std::string, VDW> m_mapDisableWord;
    /// Consulted by AddOneDisableWord() to skip words it has already seen.
    std::set<std::string> m_setAllDisableWord;

    /// Piece -> replacement string; looked up through GetSpecialWord().
    std::map<std::string, std::string> m_mapSpecialWord;

private:
    /// Splits the len bytes at pSrc into pieces appended to output.
    /// Callers treat a return value of 0 as "nothing was produced".
    size_t SplitWord(const char* pSrc, unsigned int len, std::vector<std::string>& output);

    /// Returns a pointer to the entry of m_mapSpecialWord stored under str,
    /// or NULL when str has no entry.
    const std::string* GetSpecialWord(const std::string& str);

public:
    CDisableWord();

    /// Registers one banned word.
    void AddOneDisableWord(const std::string& str);

    /// Returns true when the text contains no registered banned word
    /// (empty text is accepted), false otherwise.
    bool CheckStr(const char* pSrc, unsigned int len);
    bool CheckStr(const std::string& str);
};

// NOTE(review): this span holds the remains of three definitions -- the
// constructor, SplitWord and GetSpecialWord. Text has been lost inside the
// `for` headers (and in the template arguments), which fused the definitions
// together; the code below does not compile as it stands. Restore the missing
// text from the original source before changing any logic here.
CDisableWord::CDisableWord()
{
	// Full-width upper-case Latin letters.
	std::string qjdx[26] = {"Ａ","Ｂ","Ｃ","Ｄ","Ｅ","Ｆ","Ｇ","Ｈ","Ｉ","Ｊ","Ｋ","Ｌ","Ｍ","Ｎ","Ｏ","Ｐ","Ｑ","Ｒ","Ｓ","Ｔ","Ｕ","Ｖ","Ｗ","Ｘ","Ｙ","Ｚ"};
	// Full-width lower-case Latin letters.
	std::string qjxx[26] = {"ａ","ｂ","ｃ","ｄ","ｅ","ｆ","ｇ","ｈ","ｉ","ｊ","ｋ","ｌ","ｍ","ｎ","ｏ","ｐ","ｑ","ｒ","ｓ","ｔ","ｕ","ｖ","ｗ","ｘ","ｙ","ｚ"};
	// ASCII upper-case and lower-case alphabets.
	std::string dx = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
	std::string rst = "abcdefghijklmnopqrstuvwxyz";

    std::string str1 = "a";
    std::string str2 = "a";
	// NOTE(review): the rest of the constructor and the start of SplitWord's
	// signature are missing between `i` and `& output`.
	for(int i = 0; i & output)
{
    std::string ch;
    unsigned char byte;
    // NOTE(review): the loop condition, the increment and the assignment to
    // `byte` are missing between `i` and `= 0xFC`. The chain below picks the
    // sequence length `wlen` from thresholds on `byte` -- presumably the lead
    // byte of the current character; confirm against the original.
    for(unsigned int i = 0, wlen = 0; i = 0xFC)
            wlen = 6;  
        else if (byte >= 0xF8)
            wlen = 5;
        else if (byte >= 0xF0)
            wlen = 4;
        else if (byte >= 0xE0)
            wlen = 3;
        else if (byte >= 0xC0)
            wlen = 2;
        else
            // Everything below 0xC0 is taken as a single byte; that covers
            // ASCII as well as stray continuation bytes (0x80-0xBF).
            wlen = 1;

        // Stop when the sequence would run past the end of the buffer.
        if(i + wlen > len)
            break;

        ch.clear();
        // NOTE(review): the rest of SplitWord and the start of GetSpecialWord
        // are missing between `j` and `::iterator`.
        for(unsigned int j = 0; j ::iterator miter = m_mapSpecialWord.find(str);
    // No entry for this key: signal "not found" with a null pointer.
    if(miter == m_mapSpecialWord.end())
        return NULL;

    // Pointer to the mapped value stored in m_mapSpecialWord.
    return &(miter->second);
}

void CDisableWord::AddOneDisableWord(const std::string& str)
{
    if(m_setAllDisableWord.find(str) != m_setAllDisableWord.end())
        return;

    std::vector<:string> output;
    if(SplitWord(str.c_str(), str.size(), output) == 0 || output[0].size() == 0)
        return;

    std::map<:string>::iterator miter = m_mapDisableWord.find(output[0]);
    if(miter == m_mapDisableWord.end())
    {
        m_mapDisableWord[output[0]] = VDW();
        miter = m_mapDisableWord.find(output[0]);
    }

    if(miter == m_mapDisableWord.end())
        return;

    SDisableWord sdw;
    sdw.str = str;
    miter->second.push_back(sdw);
}

bool CDisableWord::CheckStr(const char* pSrc, unsigned int len)
{
    if(len == 0)
        return true;

    std::string str(pSrc, len);
    return CheckStr(str);
}

// Checks a string for banned words.
//
// Returns true for an empty string, false when SplitWord() yields nothing
// usable, false when a registered word is found in StrSrc or StrDelSpace,
// and true otherwise.
//
// NOTE(review): text has been lost inside the two `for` headers below (and in
// the template arguments). The declarations of StrSrc, StrDelSpace, sonly and
// bnoadd, and the code that fills StrSrc and sonly, are not visible; this
// function does not compile as it stands. Restore the missing text from the
// original source before changing any logic here.
bool CDisableWord::CheckStr(const std::string& str)
{
    if(str.size() == 0)
        return true;

    std::vector<:string> output;
    if(SplitWord(str.c_str(), str.size(), output) == 0 || output[0].size() == 0)
        return false;

    // NOTE(review): the body of the first loop and the start of the second
    // loop are missing here.
    for(size_t i = 0; i  sonly;
    for(size_t i = 0; i  0 && output[i] == " ")
        {
            // A space is skipped unless the backward scan below clears the flag.
            bnoadd = true;
            for(int j = int(i - 1); j >= 0; --j)
            {
                // Walk back over any run of spaces to the nearest other piece.
                if(output[j] == " ")
                    continue;

                // Nearest non-space piece is longer than one byte: keep the space.
                if(output[j].size() > 1)
                    bnoadd = false;
                else if(j + 1 == int(i)) // one-byte piece directly before this space: keep the space
					bnoadd = false;

                break;
            }
        }

        if(!bnoadd)
       		StrDelSpace += output[i];
    }
    // When both forms are equal, the second search below is skipped.
    bool bSame = (StrDelSpace == StrSrc);

    // For each entry of sonly, fetch the banned words filed under that key and
    // search for each of them as a substring.
    std::set<:string>::iterator siter = sonly.begin();
    for(; siter != sonly.end(); ++siter)
    {
        std::map<:string>::iterator miter = m_mapDisableWord.find(*siter);
        if(miter == m_mapDisableWord.end())
            continue;

        for(size_t j = 0; j second.size(); ++j)
        {
            SDisableWord& sdw = miter->second[j];
            if(StrSrc.find(sdw.str) != std::string::npos)
                return false;
            else if(!bSame && StrDelSpace.find(sdw.str) != std::string::npos)
                return false;
        }
    }

    return true;
}

int main()
{
    CDisableWord cdw;

	//  
    std::string strdw[] = {" ", " ", " ", "aabb", "   ", "cc dd"};
    for(int i = 0; i

Ubuntu のテキストエディタ gedit の文字化け問題を解決する

中国語の文字化け問題のまとめ