C#:入力バイト列(byte[])のエンコーディングがUTF-8かどうかを判定する(ネット上からの転載。出典のURLは失念)

5788 ワード

/// <summary>
/// Determines whether a byte buffer is well-formed UTF-8 that contains at least one
/// multi-byte (non-ASCII) sequence.
/// </summary>
/// <remarks>
/// Validation follows the well-formed byte sequences of RFC 3629: only 1- to 4-byte
/// sequences are allowed, and overlong encodings, UTF-16 surrogate code points
/// (U+D800..U+DFFF) and code points above U+10FFFF are rejected.
/// A buffer that is empty or consists solely of ASCII bytes (0x00-0x7F) yields
/// <c>false</c>: although such data is valid UTF-8, it is indistinguishable from the
/// default single-byte encoding, so it is deliberately not reported as UTF-8.
/// </remarks>
/// <param name="inputStream">The raw bytes to inspect.</param>
/// <returns>
/// <c>true</c> if every byte belongs to a well-formed UTF-8 sequence and at least one
/// multi-byte sequence is present; otherwise <c>false</c>.
/// </returns>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="inputStream"/> is <c>null</c>.
/// </exception>
public static bool IsTextUTF8(byte[] inputStream)
{
    if (inputStream == null)
    {
        throw new System.ArgumentNullException(nameof(inputStream));
    }

    // Number of continuation bytes still expected for the sequence in progress.
    int encodingBytesCount = 0;

    // Inclusive range permitted for the NEXT continuation byte. It is narrowed only for
    // the byte right after certain lead bytes (to exclude overlong forms, surrogates and
    // values above U+10FFFF) and is reset to the generic 0x80..0xBF afterwards.
    int continuationMin = 0x80;
    int continuationMax = 0xBF;

    bool sawMultiByteSequence = false;

    for (int i = 0; i < inputStream.Length; i++)
    {
        byte current = inputStream[i];

        if (encodingBytesCount > 0)
        {
            // Inside a multi-byte sequence: the byte must be a valid continuation byte.
            if (current < continuationMin || current > continuationMax)
            {
                return false;
            }

            encodingBytesCount--;
            continuationMin = 0x80;
            continuationMax = 0xBF;
            continue;
        }

        if (current <= 0x7F)
        {
            // ASCII chars, from 0x00-0x7F.
            continue;
        }

        if (current >= 0xC2 && current <= 0xDF)
        {
            // Two-byte sequence. 0xC0 and 0xC1 are excluded: they could only start an
            // overlong encoding of an ASCII character.
            encodingBytesCount = 1;
        }
        else if (current >= 0xE0 && current <= 0xEF)
        {
            // Three-byte sequence.
            encodingBytesCount = 2;

            if (current == 0xE0)
            {
                // Second byte below 0xA0 would be an overlong encoding.
                continuationMin = 0xA0;
            }
            else if (current == 0xED)
            {
                // Second byte above 0x9F would encode a surrogate (U+D800..U+DFFF).
                continuationMax = 0x9F;
            }
        }
        else if (current >= 0xF0 && current <= 0xF4)
        {
            // Four-byte sequence.
            encodingBytesCount = 3;

            if (current == 0xF0)
            {
                // Second byte below 0x90 would be an overlong encoding.
                continuationMin = 0x90;
            }
            else if (current == 0xF4)
            {
                // Second byte above 0x8F would exceed U+10FFFF.
                continuationMax = 0x8F;
            }
        }
        else
        {
            // Stray continuation byte (0x80-0xBF), overlong lead (0xC0, 0xC1) or a lead
            // byte that has no meaning in UTF-8 (0xF5-0xFF).
            return false;
        }

        sawMultiByteSequence = true;
    }

    if (encodingBytesCount != 0)
    {
        // The buffer ended in the middle of a multi-byte sequence.
        return false;
    }

    // Although UTF-8 can encode ASCII chars, an input whose contents are all ASCII is
    // regarded as the default encoding rather than UTF-8.
    return sawMultiByteSequence;
}