C#入力(バイトストリーム)BYTE[]から符号化フォーマットがUTF 8でネット上で回ったかどうかを判断し、アドレスを忘れた
5788 ワード
public static bool IsTextUTF8(byte[] inputStream)
{
int encodingBytesCount = 0;
bool allTextsAreASCIIChars = true;
for (int i = 0; i < inputStream.Length; i++)
{
byte current = inputStream[i];
if ((current & 0x80) == 0x80)
{
allTextsAreASCIIChars = false;
}
// First byte
if (encodingBytesCount == 0)
{
if ((current & 0x80) == 0)
{
// ASCII chars, from 0x00-0x7F
continue;
}
if ((current & 0xC0) == 0xC0)
{
encodingBytesCount = 1;
current <<= 2;
// More than two bytes used to encoding a unicode char.
// Calculate the real length.
while ((current & 0x80) == 0x80)
{
current <<= 1;
encodingBytesCount++;
}
}
else
{
// Invalid bits structure for UTF8 encoding rule.
return false;
}
}
else
{
// Following bytes, must start with 10.
if ((current & 0xC0) == 0x80)
{
encodingBytesCount--;
}
else
{
// Invalid bits structure for UTF8 encoding rule.
return false;
}
}
}
if (encodingBytesCount != 0)
{
// Invalid bits structure for UTF8 encoding rule.
// Wrong following bytes count.
return false;
}
// Although UTF8 supports encoding for ASCII chars, we regard as a input stream, whose contents are all ASCII as default encoding.
return !allTextsAreASCIIChars;
}