about charset op

2534 ワード

S60 WebKit で見つけたコードです。これまでのプロジェクトでもこの機能(テキストファイルがどの文字コードで書かれているかを判定する)が必要で、そのたびに自分で資料を探して書いていましたが、時間がかかるうえに結果も正確ではありませんでした。
/**
 * Decide whether a byte buffer looks like UTF-8 encoded text.
 *
 * The buffer is scanned byte by byte. ASCII bytes are always accepted; a
 * lead byte (110xxxxx, 1110xxxx or 11110xxx) must be followed by one, two
 * or three continuation bytes (10xxxxxx) respectively.
 *
 * Deliberate leniency: when exactly one continuation byte is still
 * expected and the next byte is not a continuation byte (e.g. the pair
 * 0xd8 0x73 seen on real sites), that byte is consumed and the broken
 * sequence is dropped without failing. Such a sequence is not counted in
 * validMultiByteChars.
 *
 * Not checked: overlong encodings, surrogate code points and the upper
 * code point limit. A multi-byte sequence that is still incomplete when
 * the buffer ends is not treated as an error.
 *
 * @param chs                  bytes to inspect; need not be NUL-terminated
 * @param len                  number of bytes in chs; values <= 0 mean "empty"
 * @param validMultiByteChars  out: number of complete, well-formed
 *                             multi-byte sequences seen before the scan
 *                             stopped (always reset to 0 first)
 * @return true if no violation was found, false otherwise
 */
bool validateUtf8(const char* chs, int len, int& validMultiByteChars)
{
    validMultiByteChars = 0;
    if (len <= 0)
        return true;

    const char* end = chs + len;
    int seqRem = 0; // continuation bytes still expected for the current sequence

    for (const char* c = chs; c < end; ++c) {
        // Work on an unsigned value so the bit tests do not depend on
        // whether plain char is signed on this platform.
        const unsigned char byte = (unsigned char)*c;

        if (seqRem > 0) {
            if ((byte & 0xc0) == 0x80) {
                // a byte inside a sequence must match 10xxxxxx
                if (--seqRem == 0)
                    validMultiByteChars++;
            } else if (seqRem == 1) {
                // Tolerate a single wrong byte where the last continuation
                // byte was expected: swallow it and start afresh.
                seqRem = 0;
            } else {
                // two or more continuation bytes missing: not UTF-8
                return false;
            }
        } else if ((byte & 0x80) == 0x00) {
            // single byte in ASCII range 0xxxxxxx: nothing to do
        } else if ((byte & 0xe0) == 0xc0) {
            seqRem = 1; // two byte sequence 110xxxxx 10xxxxxx
        } else if ((byte & 0xf0) == 0xe0) {
            seqRem = 2; // three byte sequence 1110xxxx 10xxxxxx 10xxxxxx
        } else if ((byte & 0xf8) == 0xf0) {
            seqRem = 3; // four byte sequence 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        } else {
            // Stray continuation byte (10xxxxxx) or 11111xxx where a
            // character must start. The previous code guarded this with a
            // condition that could never be true, so such bytes slipped
            // through as valid.
            return false;
        }
    }
    return true;
}

 
 
 
aaaaaaaaaaa
 
 
 
bbbbbbbbbbb
 
 
 
ccccccccccccccc