|
| 1 | +using System; |
| 2 | +using System.Buffers; |
| 3 | +using System.IO; |
| 4 | +using System.Text; |
| 5 | +using System.Threading.Tasks; |
| 6 | + |
| 7 | +#pragma warning disable IDE0130 |
namespace EncodingUtf8AndGBKDifferentiater
{
    /// <summary>
    /// Guesses the text encoding of a seekable stream. A byte-order mark is honoured when
    /// present; otherwise the stream is scanned and the number of bytes that form UTF-8
    /// sequences is compared with the number of bytes that form GBK double-byte pairs.
    /// </summary>
    /// <remarks>
    /// Copy from: https://github.com/dotnet-campus/EncodingNormalior/blob/1c7cee71b1626e340a40f783882148aa2ac29958/EncodingUtf8AndGBKDifferentiater/EncodingDifferentiater.cs
    /// <para>
    /// The instance does not dispose the stream it was given; <see cref="Dispose"/> only
    /// returns the pooled scan buffer.
    /// </para>
    /// </remarks>
    public class EncodingDifferentiater : IDisposable
    {
        /// <summary>Number of leading bytes inspected for a byte-order mark.</summary>
        private const int HeadAmount = 4;

        /// <summary>Minimum size of the pooled buffer used while scanning the stream.</summary>
        private const int ScanBufferSize = 1024;

        private readonly Stream _stream;

        private bool _disposed;

        /// <summary>
        /// Creates an inspector for <paramref name="stream"/>.
        /// </summary>
        /// <param name="stream">The stream to inspect. It must support seeking because it is rewound several times.</param>
        /// <exception cref="ArgumentNullException"><paramref name="stream"/> is <c>null</c>.</exception>
        /// <exception cref="ArgumentException"><paramref name="stream"/> does not support seeking.</exception>
        public EncodingDifferentiater(Stream stream)
        {
            if (stream is null)
            {
                throw new ArgumentNullException(nameof(stream));
            }

            if (!stream.CanSeek)
            {
                throw new ArgumentException("The stream must support seeking.", nameof(stream));
            }

            _stream = stream;
        }

        /// <summary>
        /// Scan buffer rented from <see cref="ArrayPool{T}.Shared"/> on first use and handed
        /// back in <see cref="Dispose"/>.
        /// </summary>
        private byte[]? CountBuffer { get; set; }

        /// <summary>
        /// Inspects the stream and reports the most likely encoding.
        /// </summary>
        /// <returns>
        /// <list type="bullet">
        /// <item>ASCII with confidence 0 when the stream is shorter than four bytes.</item>
        /// <item>The encoding named by the byte-order mark, with confidence 1, when one is found.</item>
        /// <item>ASCII with confidence 1 when neither scan matches any byte.</item>
        /// <item>Otherwise UTF-8 or GBK, with the winner's share of all matched bytes as confidence.
        /// GBK is chosen when the counts are equal.</item>
        /// </list>
        /// </returns>
        /// <exception cref="ObjectDisposedException">The instance has been disposed.</exception>
        public async ValueTask<InspectFileEncodingResult> InspectFileEncodingAsync()
        {
            if (_disposed)
            {
                throw new ObjectDisposedException(nameof(EncodingDifferentiater));
            }

            var stream = _stream;
            if (stream.Length < HeadAmount)
            {
                // Too short to look for a byte-order mark.
                return new InspectFileEncodingResult(Encoding.ASCII, 0);
            }

            var encoding = AutoEncoding(ReadFileHeadByte(stream, HeadAmount));
            if (!encoding.Equals(Encoding.ASCII))
            {
                // A byte-order mark settles the question.
                return new InspectFileEncodingResult(encoding, 1);
            }

            // No byte-order mark: the content may be ASCII, BOM-less UTF-8 or GBK.
            // Both counts are always taken. A UTF-8 count of zero alone does not mean
            // "ASCII": GBK text whose bytes all lie in 0xA1-0xBF never looks like UTF-8.
            var countUtf8 = await CountUtf8Async().ConfigureAwait(false);
            var countGbk = await CountGbkAsync().ConfigureAwait(false);

            // Leave the stream rewound, exactly as the byte-order-mark path does.
            stream.Position = 0;

            if (countUtf8 == 0 && countGbk == 0)
            {
                // Nothing matched either pattern, so the stream is reported as ASCII.
                return new InspectFileEncodingResult(Encoding.ASCII, 1);
            }

            var matchedBytes = (double) (countUtf8 + countGbk);
            if (countUtf8 > countGbk)
            {
                return new InspectFileEncodingResult(Encoding.UTF8, countUtf8 / matchedBytes);
            }

            // NOTE(review): on .NET Core / .NET 5+ "GBK" is only resolvable after
            // CodePagesEncodingProvider has been registered - presumably done by the host; confirm.
            return new InspectFileEncodingResult(Encoding.GetEncoding("GBK"), countGbk / matchedBytes);
        }

        /// <summary>
        /// Counts the bytes of the stream that belong to GBK double-byte pairs.
        /// </summary>
        /// <returns>The number of bytes that are part of a recognised pair.</returns>
        private ValueTask<long> CountGbkAsync() => CountBytesAsync(ScanGbk);

        /// <summary>
        /// Counts the bytes of the stream that belong to multi-byte UTF-8 sequences.
        /// </summary>
        /// <returns>The number of bytes that are part of a recognised sequence.</returns>
        private ValueTask<long> CountUtf8Async() => CountBytesAsync(ScanUtf8);

        /// <summary>
        /// Reads the whole stream from the start in chunks and sums what
        /// <paramref name="scan"/> reports for each chunk.
        /// </summary>
        /// <param name="scan">
        /// Scans <c>buffer[0..length)</c> and returns how many bytes it consumed and how many
        /// of them it counted. Unconsumed trailing bytes (a character cut off by the end of
        /// the chunk) are moved to the front of the buffer and offered again together with
        /// the next chunk. The last argument is <c>true</c> once the stream is exhausted.
        /// </param>
        /// <returns>The total number of counted bytes.</returns>
        private async ValueTask<long> CountBytesAsync(Func<byte[], int, bool, (int Consumed, int Counted)> scan)
        {
            var buffer = CountBuffer ??= ArrayPool<byte>.Shared.Rent(ScanBufferSize);
            var stream = _stream;
            stream.Position = 0;

            long total = 0;
            var pending = 0; // bytes carried over from the previous chunk
            while (true)
            {
                var read = await stream.ReadAsync(buffer, pending, buffer.Length - pending).ConfigureAwait(false);
                var isFinalChunk = read <= 0;
                var length = pending + Math.Max(read, 0);

                var (consumed, counted) = scan(buffer, length, isFinalChunk);
                total += counted;

                if (isFinalChunk)
                {
                    return total;
                }

                pending = length - consumed;
                if (pending > 0)
                {
                    // Array.Copy copes with the overlapping ranges.
                    Array.Copy(buffer, consumed, buffer, 0, pending);
                }
            }
        }

        /// <summary>
        /// Counts double-byte pairs whose lead byte is in 0xA1-0xF7 and whose trail byte is in
        /// 0xA1-0xFE. This is the GB2312 portion of GBK; other GBK byte ranges are not counted.
        /// </summary>
        /// <param name="buffer">The bytes to scan.</param>
        /// <param name="length">The number of valid bytes in <paramref name="buffer"/>.</param>
        /// <param name="isFinalChunk"><c>true</c> when no more bytes will follow.</param>
        /// <returns>The number of bytes consumed and the number of bytes counted.</returns>
        private static (int Consumed, int Counted) ScanGbk(byte[] buffer, int length, bool isFinalChunk)
        {
            var counted = 0;
            var index = 0;
            while (index < length)
            {
                var lead = buffer[index];
                if (lead < 0xA1 || lead > 0xF7)
                {
                    // ASCII, or a byte that cannot start a counted pair.
                    index++;
                    continue;
                }

                if (index + 1 >= length)
                {
                    if (!isFinalChunk)
                    {
                        // The trail byte is in the next chunk: leave the lead byte unconsumed.
                        break;
                    }

                    // Truncated at the very end of the stream: nothing to pair it with.
                    index++;
                    continue;
                }

                var trail = buffer[index + 1];
                if (trail >= 0xA1 && trail <= 0xFE)
                {
                    counted += 2;
                    index += 2;
                }
                else
                {
                    index++;
                }
            }

            return (index, counted);
        }

        /// <summary>
        /// Counts the bytes of two-, three- and four-byte UTF-8 sequences, i.e. a lead byte
        /// followed by the right number of <c>10xxxxxx</c> continuation bytes.
        /// Single-byte (ASCII) characters are not counted because they say nothing about
        /// UTF-8 versus GBK.
        /// </summary>
        /// <param name="buffer">The bytes to scan.</param>
        /// <param name="length">The number of valid bytes in <paramref name="buffer"/>.</param>
        /// <param name="isFinalChunk"><c>true</c> when no more bytes will follow.</param>
        /// <returns>The number of bytes consumed and the number of bytes counted.</returns>
        private static (int Consumed, int Counted) ScanUtf8(byte[] buffer, int length, bool isFinalChunk)
        {
            var counted = 0;
            var index = 0;
            while (index < length)
            {
                var sequenceLength = GetUtf8SequenceLength(buffer[index]);
                if (sequenceLength == 0)
                {
                    // ASCII, a stray continuation byte, or an invalid lead byte.
                    index++;
                    continue;
                }

                if (index + sequenceLength > length)
                {
                    if (!isFinalChunk)
                    {
                        // The rest of the sequence is in the next chunk: keep the tail for later.
                        break;
                    }

                    // Truncated at the very end of the stream: skip the lead byte only, so any
                    // complete shorter sequence behind it is still examined.
                    index++;
                    continue;
                }

                var isWellFormed = true;
                for (var offset = 1; offset < sequenceLength; offset++)
                {
                    if ((buffer[index + offset] & 0xC0) != 0x80)
                    {
                        isWellFormed = false;
                        break;
                    }
                }

                if (isWellFormed)
                {
                    counted += sequenceLength;
                    index += sequenceLength;
                }
                else
                {
                    index++;
                }
            }

            return (index, counted);
        }

        /// <summary>
        /// Returns the total length (2, 3 or 4) of the UTF-8 sequence introduced by
        /// <paramref name="lead"/>, or 0 when the byte does not start a multi-byte sequence.
        /// </summary>
        private static int GetUtf8SequenceLength(byte lead)
        {
            if ((lead & 0xE0) == 0xC0)
            {
                return 2; // 110xxxxx
            }

            if ((lead & 0xF0) == 0xE0)
            {
                return 3; // 1110xxxx
            }

            if ((lead & 0xF8) == 0xF0)
            {
                return 4; // 11110xxx
            }

            return 0;
        }

        /// <summary>
        /// Reads the first <paramref name="headAmount"/> bytes of the stream.
        /// </summary>
        /// <param name="stream">The stream to read; it is rewound before and after reading.</param>
        /// <param name="headAmount">The number of bytes to read.</param>
        /// <returns>A new array holding the leading bytes.</returns>
        /// <exception cref="ArgumentException">The stream ended before enough bytes were read.</exception>
        private static byte[] ReadFileHeadByte(Stream stream, int headAmount = 4)
        {
            var buffer = new byte[headAmount];

            // Always start from the beginning, wherever the caller left the stream.
            stream.Position = 0;

            var total = 0;
            while (total < headAmount)
            {
                // A single Read may legitimately return fewer bytes than requested.
                var read = stream.Read(buffer, total, headAmount - total);
                if (read <= 0)
                {
                    throw new ArgumentException("读取到的文件长度太小,实际读取长度" + total + ",需要的长度" + headAmount);
                }

                total += read;
            }

            stream.Position = 0;
            return buffer;
        }

        /// <summary>
        /// Maps a byte-order mark to its encoding.
        /// </summary>
        /// <param name="bom">Exactly four leading bytes of the stream.</param>
        /// <returns>
        /// The encoding identified by the byte-order mark, or <see cref="Encoding.ASCII"/> when
        /// there is none (the content may then be ASCII, BOM-less UTF-8 or GBK).
        /// </returns>
        /// <exception cref="ArgumentException"><paramref name="bom"/> is not four bytes long.</exception>
        private static Encoding AutoEncoding(byte[] bom)
        {
            if (bom.Length != 4)
            {
                throw new ArgumentException("EncodingScrutator.AutoEncoding 参数大小不等于4");
            }

            if (bom[0] == 0x2b && bom[1] == 0x2f && bom[2] == 0x76)
            {
#pragma warning disable SYSLIB0001 // UTF-7 is obsolete, but it is still reported when its signature is seen.
                return Encoding.UTF7;
#pragma warning restore SYSLIB0001
            }

            if (bom[0] == 0xef && bom[1] == 0xbb && bom[2] == 0xbf)
            {
                return Encoding.UTF8;
            }

            // UTF-32 LE (FF FE 00 00) must be tested before UTF-16 LE (FF FE), which is its prefix.
            if (bom[0] == 0xff && bom[1] == 0xfe && bom[2] == 0 && bom[3] == 0)
            {
                return Encoding.UTF32;
            }

            if (bom[0] == 0xff && bom[1] == 0xfe)
            {
                return Encoding.Unicode; // UTF-16 LE
            }

            if (bom[0] == 0xfe && bom[1] == 0xff)
            {
                return Encoding.BigEndianUnicode; // UTF-16 BE
            }

            if (bom[0] == 0 && bom[1] == 0 && bom[2] == 0xfe && bom[3] == 0xff)
            {
                // Encoding.UTF32 is little-endian; this signature is the big-endian form.
                return new UTF32Encoding(bigEndian: true, byteOrderMark: true);
            }

            return Encoding.ASCII;
        }

        /// <summary>
        /// Returns the pooled scan buffer. Calling this more than once is harmless.
        /// The inspected stream is left open.
        /// </summary>
        public void Dispose()
        {
            if (_disposed)
            {
                return;
            }

            _disposed = true;

            var buffer = CountBuffer;
            if (buffer is not null)
            {
                // Clear the reference first so the array can never be returned twice.
                CountBuffer = null;
                ArrayPool<byte>.Shared.Return(buffer);
            }

            GC.SuppressFinalize(this);
        }
    }

    /// <summary>
    /// The outcome of inspecting a stream's encoding.
    /// </summary>
    /// <param name="Encoding">The detected encoding.</param>
    /// <param name="ConfidenceCount">
    /// How reliable the detection is, from 0 to 1. It is 0 when the stream was too short to
    /// inspect (ASCII is reported in that case) and 1 when the result is taken as certain,
    /// e.g. a byte-order mark was found.
    /// </param>
    public readonly record struct InspectFileEncodingResult(Encoding Encoding, double ConfidenceCount);
}
0 commit comments