Skip to content

Commit 084469a

Browse files
committed
Merge commit '1571198d9af8fd6fbe03d0a3bde2d4545e37866f'
2 parents 9d1c075 + 1571198 commit 084469a

File tree

10 files changed

+449
-88
lines changed

10 files changed

+449
-88
lines changed

Blog/RallreakechuFeakenalldea/Docs/README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,8 @@
99

1010
## 待办功能
1111

12-
- [ ] 支持打开加密文档,弹出加密对话框
12+
- [ ] 支持打开加密文档,弹出加密对话框
13+
- [ ] 查找与替换的功能
14+
- [ ] 在代码片里面按下回车,应该跟随缩进
15+
- [ ] 滚动条跟随光标
16+
- [ ] 滚动条可以超过文档范围,滚动得更下一些

Blog/RallreakechuFeakenalldea/RallreakechuFeakenalldea.Desktop/Program.cs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1+
using Avalonia;
2+
13
using System;
24
using System.IO;
35
using System.Runtime.CompilerServices;
46
using System.Runtime.InteropServices;
5-
using Avalonia;
7+
using System.Text;
68

79
namespace RallreakechuFeakenalldea.Desktop;
810

@@ -14,6 +16,7 @@ class Program
1416
/// <summary>
/// Desktop entry point: registers the code-page encoding provider, then hands the
/// command-line arguments to <c>RunAvalonia</c>.
/// </summary>
/// <param name="args">Command-line arguments, forwarded unchanged.</param>
[STAThread]
public static void Main(string[] args)
{
    // Registered before anything else runs so that later Encoding.GetEncoding lookups
    // can resolve code-page encodings supplied by this provider.
    var codePagesProvider = CodePagesEncodingProvider.Instance;
    Encoding.RegisterProvider(codePagesProvider);

    RunAvalonia(args);
}
1922

Blog/RallreakechuFeakenalldea/RallreakechuFeakenalldea.Desktop/Properties/launchSettings.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"profiles": {
33
"RallreakechuFeakenalldea.Desktop": {
44
"commandName": "Project",
5-
"commandLineArgs": "C:\\lindexi\\TestFile.txt"
5+
"commandLineArgs": "C:\\lindexi\\TestFile_GBK.txt"
66
}
77
}
88
}

Blog/RallreakechuFeakenalldea/RallreakechuFeakenalldea.Desktop/RallreakechuFeakenalldea.Desktop.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
<ItemGroup>
1717
<PackageReference Include="Avalonia.Desktop" Version="$(AvaloniaVersion)" />
18+
<PackageReference Include="System.Text.Encoding.CodePages" Version="10.0.3" />
1819
</ItemGroup>
1920

2021
<ItemGroup>
Lines changed: 283 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,283 @@
1+
using System;
2+
using System.Buffers;
3+
using System.IO;
4+
using System.Text;
5+
using System.Threading.Tasks;
6+
7+
#pragma warning disable IDE0130
8+
namespace EncodingUtf8AndGBKDifferentiater
{
    /// <summary>
    /// Guesses the text encoding of a seekable stream. A byte-order mark is trusted when present;
    /// otherwise the content is scanned to tell BOM-less UTF-8 apart from GBK.
    /// </summary>
    /// <remarks>
    /// Adapted from: https://github.com/dotnet-campus/EncodingNormalior/blob/1c7cee71b1626e340a40f783882148aa2ac29958/EncodingUtf8AndGBKDifferentiater/EncodingDifferentiater.cs
    /// The stream handed to the constructor is never disposed by this class.
    /// </remarks>
    public class EncodingDifferentiater : IDisposable
    {
        /// <summary>Number of leading bytes inspected for a byte-order mark.</summary>
        private const int HeadAmount = 4;

        /// <summary>Requested size of the pooled buffer used while scanning the content.</summary>
        private const int ScanBufferSize = 4096;

        /// <summary>
        /// GBK lives in the code-pages provider, which is not registered by default on modern .NET.
        /// Registering it lazily here keeps detection working even when the host forgot to do so;
        /// registering the same provider more than once is harmless.
        /// </summary>
        private static readonly Lazy<Encoding> GbkEncoding = new Lazy<Encoding>(() =>
        {
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
            return Encoding.GetEncoding("GBK");
        });

        private readonly Stream _stream;

        /// <summary>
        /// Creates a detector over <paramref name="stream"/>.
        /// </summary>
        /// <param name="stream">The stream to inspect. It must support seeking because it is rewound and read from the start.</param>
        /// <exception cref="ArgumentNullException"><paramref name="stream"/> is <c>null</c>.</exception>
        /// <exception cref="ArgumentException"><paramref name="stream"/> does not support seeking.</exception>
        public EncodingDifferentiater(Stream stream)
        {
            if (stream is null)
            {
                throw new ArgumentNullException(nameof(stream));
            }

            if (!stream.CanSeek)
            {
                throw new ArgumentException("The stream must support seeking.", nameof(stream));
            }

            _stream = stream;
        }

        /// <summary>
        /// Inspects the stream and returns the most likely encoding together with a confidence value.
        /// </summary>
        /// <returns>
        /// <list type="bullet">
        /// <item>Fewer than 4 bytes: <see cref="Encoding.ASCII"/> with confidence 0 (too short to judge).</item>
        /// <item>A byte-order mark was found: the matching encoding with confidence 1.</item>
        /// <item>No multi-byte sequence of either kind was found: <see cref="Encoding.ASCII"/> with confidence 0.</item>
        /// <item>Otherwise UTF-8 or GBK, with confidence = winner bytes / (UTF-8 bytes + GBK bytes).</item>
        /// </list>
        /// Except for the "too short" case, the stream is left positioned at offset 0.
        /// </returns>
        public async ValueTask<InspectFileEncodingResult> InspectFileEncodingAsync()
        {
            var stream = _stream;

            if (stream.Length < HeadAmount)
            {
                // Too short to identify anything reliably.
                return new InspectFileEncodingResult(Encoding.ASCII, 0);
            }

            var head = await ReadFileHeadAsync(stream, HeadAmount).ConfigureAwait(false);
            var bomEncoding = DetectByteOrderMark(head);
            if (bomEncoding != null)
            {
                stream.Position = 0;
                return new InspectFileEncodingResult(bomEncoding, 1);
            }

            var (utf8Bytes, gbkBytes) = await CountMultiByteAsync(stream).ConfigureAwait(false);
            stream.Position = 0;

            if (utf8Bytes == 0 && gbkBytes == 0)
            {
                // Nothing that looks like UTF-8 or GBK: report ASCII, flagged as "undetermined".
                return new InspectFileEncodingResult(Encoding.ASCII, 0);
            }

            var total = (double) utf8Bytes + gbkBytes;

            // UTF-8 sequences are validated strictly, so real GBK text almost never forms them.
            // UTF-8 text, on the other hand, frequently happens to look like valid GBK pairs,
            // so a tie is resolved in favour of UTF-8.
            if (utf8Bytes >= gbkBytes)
            {
                return new InspectFileEncodingResult(Encoding.UTF8, utf8Bytes / total);
            }

            return new InspectFileEncodingResult(GbkEncoding.Value, gbkBytes / total);
        }

        /// <summary>
        /// Scans the whole stream once and counts the bytes that belong to well-formed UTF-8
        /// multi-byte sequences and the bytes that belong to GBK double-byte characters.
        /// </summary>
        /// <remarks>
        /// Both counters are small state machines fed one byte at a time, so a character that
        /// straddles two reads is still recognised. A sequence left incomplete at the end of the
        /// stream is not counted.
        /// </remarks>
        /// <param name="stream">The seekable stream; it is rewound to offset 0 before scanning.</param>
        /// <returns>The number of bytes attributed to UTF-8 and to GBK respectively.</returns>
        private static async ValueTask<(long Utf8Bytes, long GbkBytes)> CountMultiByteAsync(Stream stream)
        {
            stream.Position = 0;

            long utf8Bytes = 0;
            long gbkBytes = 0;

            var utf8SequenceLength = 0; // total length of the UTF-8 sequence currently being validated
            var utf8Remaining = 0; // continuation bytes still expected for that sequence
            var gbkLeadPending = false; // previous byte was a possible GBK lead byte

            var buffer = ArrayPool<byte>.Shared.Rent(ScanBufferSize);
            try
            {
                int read;
                while ((read = await stream.ReadAsync(buffer.AsMemory()).ConfigureAwait(false)) > 0)
                {
                    for (var i = 0; i < read; i++)
                    {
                        var current = buffer[i];

                        // ---- UTF-8 ----
                        if (utf8Remaining > 0 && (current & 0xC0) == 0x80)
                        {
                            // A proper continuation byte (10xxxxxx).
                            utf8Remaining--;
                            if (utf8Remaining == 0)
                            {
                                utf8Bytes += utf8SequenceLength;
                            }
                        }
                        else
                        {
                            // Either no sequence is open, or this byte just broke the open one.
                            // In both cases the byte itself may start a new sequence.
                            utf8SequenceLength = GetUtf8SequenceLength(current);
                            utf8Remaining = utf8SequenceLength == 0 ? 0 : utf8SequenceLength - 1;
                        }

                        // ---- GBK (double-byte area: lead 0xA1-0xF7, trail 0xA1-0xFE) ----
                        if (gbkLeadPending && current >= 0xA1 && current <= 0xFE)
                        {
                            gbkBytes += 2;
                            gbkLeadPending = false;
                        }
                        else
                        {
                            gbkLeadPending = current >= 0xA1 && current <= 0xF7;
                        }
                    }
                }
            }
            finally
            {
                ArrayPool<byte>.Shared.Return(buffer);
            }

            return (utf8Bytes, gbkBytes);
        }

        /// <summary>
        /// Returns the total length (2, 3 or 4) of the UTF-8 sequence introduced by
        /// <paramref name="lead"/>, or 0 when the byte cannot start a multi-byte sequence.
        /// </summary>
        /// <param name="lead">The candidate lead byte.</param>
        private static int GetUtf8SequenceLength(byte lead)
        {
            if (lead >= 0xC2 && lead <= 0xDF)
            {
                return 2; // 0xC0 and 0xC1 would only encode overlong forms, so they are rejected
            }

            if (lead >= 0xE0 && lead <= 0xEF)
            {
                return 3;
            }

            if (lead >= 0xF0 && lead <= 0xF4)
            {
                return 4; // anything above 0xF4 would exceed U+10FFFF
            }

            return 0;
        }

        /// <summary>
        /// Reads the first <paramref name="headAmount"/> bytes of the stream, starting at offset 0.
        /// </summary>
        /// <param name="stream">The stream to read from.</param>
        /// <param name="headAmount">How many bytes to read.</param>
        /// <returns>A new array holding exactly <paramref name="headAmount"/> bytes.</returns>
        /// <exception cref="ArgumentException">The stream ended before enough bytes were read.</exception>
        private static async ValueTask<byte[]> ReadFileHeadAsync(Stream stream, int headAmount)
        {
            stream.Position = 0;

            var head = new byte[headAmount];
            var filled = 0;

            // A single read may legally return fewer bytes than requested, so keep reading.
            while (filled < headAmount)
            {
                var read = await stream.ReadAsync(head.AsMemory(filled, headAmount - filled)).ConfigureAwait(false);
                if (read == 0)
                {
                    throw new ArgumentException("读取到的文件长度太小,实际读取长度" + filled + ",需要的长度" + headAmount);
                }

                filled += read;
            }

            return head;
        }

        /// <summary>
        /// Maps a byte-order mark at the start of <paramref name="head"/> to its encoding.
        /// </summary>
        /// <param name="head">The first four bytes of the file.</param>
        /// <returns>The encoding announced by the BOM, or <c>null</c> when there is none.</returns>
        /// <exception cref="ArgumentException"><paramref name="head"/> does not hold exactly four bytes.</exception>
        private static Encoding? DetectByteOrderMark(byte[] head)
        {
            if (head.Length != HeadAmount)
            {
                throw new ArgumentException("EncodingScrutator.AutoEncoding 参数大小不等于4");
            }

            if (head[0] == 0xEF && head[1] == 0xBB && head[2] == 0xBF)
            {
                return Encoding.UTF8;
            }

            // Must be tested before UTF-16 LE: the UTF-32 LE mark starts with the UTF-16 LE mark.
            if (head[0] == 0xFF && head[1] == 0xFE && head[2] == 0x00 && head[3] == 0x00)
            {
                return Encoding.UTF32; // UTF-32 LE
            }

            if (head[0] == 0xFF && head[1] == 0xFE)
            {
                return Encoding.Unicode; // UTF-16 LE
            }

            if (head[0] == 0xFE && head[1] == 0xFF)
            {
                return Encoding.BigEndianUnicode; // UTF-16 BE
            }

            if (head[0] == 0x00 && head[1] == 0x00 && head[2] == 0xFE && head[3] == 0xFF)
            {
                // Encoding.UTF32 is little-endian, so the big-endian variant is built explicitly.
                return new UTF32Encoding(bigEndian: true, byteOrderMark: true);
            }

            // UTF-7 mark: "+/v" followed by one of '8', '9', '+' or '/'.
            if (head[0] == 0x2B && head[1] == 0x2F && head[2] == 0x76 &&
                (head[3] == 0x38 || head[3] == 0x39 || head[3] == 0x2B || head[3] == 0x2F))
            {
#pragma warning disable SYSLIB0001 // Encoding.UTF7 is obsolete; it is only used to report what the file declares.
                return Encoding.UTF7;
#pragma warning restore SYSLIB0001
            }

            return null;
        }

        /// <summary>
        /// Kept so existing <c>using</c> call sites keep working. The scan buffer is rented and
        /// returned inside each inspection, so nothing is held between calls and calling this
        /// method repeatedly is safe. The underlying stream is not disposed.
        /// </summary>
        public void Dispose()
        {
            GC.SuppressFinalize(this);
        }
    }

    /// <summary>
    /// The outcome of inspecting a file's encoding.
    /// </summary>
    /// <param name="Encoding">The detected encoding.</param>
    /// <param name="ConfidenceCount">
    /// How trustworthy the detection is, from 0 to 1. It is 0 when nothing could be determined
    /// (ASCII-only or too-short content) and 1 when the result is certain.
    /// </param>
    public readonly record struct InspectFileEncodingResult(Encoding Encoding, double ConfidenceCount);
}
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.IO;
4+
using System.Linq;
5+
using System.Text;
6+
using System.Threading.Tasks;
7+
8+
using EncodingUtf8AndGBKDifferentiater;
9+
10+
using LightTextEditorPlus;
11+
12+
namespace SimpleWrite.Business.FileHandlers;
13+
14+
/// <summary>
/// Loads a text file into a <see cref="TextEditor"/>, detecting the file's encoding first.
/// </summary>
internal class TextFileReader
{
    /// <summary>
    /// Reads <paramref name="file"/> completely and assigns its content to
    /// <paramref name="textEditor"/>'s <c>Text</c>.
    /// </summary>
    /// <param name="file">The file to read. It is opened read-only and shared for reading and writing.</param>
    /// <param name="textEditor">The editor that receives the decoded text.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="file"/> or <paramref name="textEditor"/> is <c>null</c>.
    /// </exception>
    public async Task ReadToTextEditor(FileInfo file, TextEditor textEditor)
    {
        if (file is null)
        {
            throw new ArgumentNullException(nameof(file));
        }

        if (textEditor is null)
        {
            throw new ArgumentNullException(nameof(textEditor));
        }

        await using var fileStream = file.Open(FileMode.Open, FileAccess.Read, FileShare.ReadWrite);

        // Detect the encoding first.
        Encoding encoding;
        using (var encodingDifferentiater = new EncodingDifferentiater(fileStream))
        {
            var result = await encodingDifferentiater.InspectFileEncodingAsync();

            // An ASCII result means "could not tell" (it is also what very short files get).
            // Decoding as ASCII would turn every non-ASCII byte into '?', while UTF-8 decodes
            // pure ASCII identically, so UTF-8 is the safer choice in that case.
            encoding = result.Encoding.CodePage == Encoding.ASCII.CodePage
                ? Encoding.UTF8
                : result.Encoding;
        }

        fileStream.Position = 0;

        // leaveOpen: the file stream is owned (and disposed) by the declaration above.
        using var streamReader = new StreamReader(fileStream, encoding, detectEncodingFromByteOrderMarks: true, leaveOpen: true);
        var text = await streamReader.ReadToEndAsync();
        textEditor.Text = text;
    }
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
//using System;
2+
//using System.Collections.Generic;
3+
//using System.Linq;
4+
//using System.Text;
5+
//using System.Threading.Tasks;
6+
//using LightTextEditorPlus;
7+
8+
//namespace SimpleWrite.Business.TextEditors;
9+
10+
//public class TextEditorCreator(Func<TextEditor> editorCreator)
11+
//{
12+
// public TextEditor Create() => editorCreator();
13+
//}

Blog/RallreakechuFeakenalldea/SimpleWrite/SimpleWrite.csproj

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
<PackageReference Include="dotnetCampus.Configurations" Version="1.9.1" />
1818

1919
<PackageReference Include="Markdig" Version="1.0.0" />
20+
21+
<PackageReference Include="System.Text.Encoding.CodePages" Version="10.0.3" />
2022
</ItemGroup>
2123

2224
<ItemGroup>

0 commit comments

Comments
 (0)