Skip to content

Commit a8030de

Browse files
committed
Improve encodings handling
1 parent d1be19d commit a8030de

12 files changed

+379
-63
lines changed

src/DocSharp.Common/Encodings.cs

Lines changed: 266 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@
77

88
namespace DocSharp;
99

10+
// For reference:
11+
// https://learn.microsoft.com/en-us/windows/win32/intl/character-sets
12+
// https://learn.microsoft.com/en-us/windows/win32/intl/code-pages
13+
// https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
14+
1015
public static class Encodings
1116
{
1217

@@ -32,13 +37,270 @@ static Encodings()
3237

3338
public static Encoding UTF16BE => Encoding.BigEndianUnicode;
3439

35-
public static Encoding ISO8859_1 => Encoding.GetEncoding(28591); // Same as Latin1 (not available on .NET Framework)
40+
public static Encoding ISO8859_1 => Encoding.GetEncoding(28591); // Same as Latin1
3641

3742
public static Encoding ANSI => Encoding.GetEncoding(CultureInfo.CurrentCulture.TextInfo.ANSICodePage);
3843

39-
public static BinaryEncoding Binary => new BinaryEncoding();
44+
/// <summary>
/// Gets the ANSI code page number reported by the <see cref="TextInfo"/> of the current culture.
/// </summary>
public static int SystemCodePage
{
    get
    {
        TextInfo currentTextInfo = CultureInfo.CurrentCulture.TextInfo;
        return currentTextInfo.ANSICodePage;
    }
}
45+
46+
/// <summary>
/// Maps a character set identifier to the corresponding code page number.
/// </summary>
/// <param name="charSet">The character set identifier to translate.</param>
/// <returns>
/// The matching code page number, or 0 for the default character set,
/// for the entries marked "old" below, and for any identifier not listed here.
/// </returns>
public static int GetCodePage(int charSet) => charSet switch
{
    0 => 1252,    // ANSI
    1 => 0,       // Default
    2 => 42,      // Symbol
    77 => 10000,  // Mac Roman
    78 => 10001,  // Mac Shift Jis
    79 => 10003,  // Mac Hangul
    80 => 10008,  // Mac GB2312
    81 => 10002,  // Mac Big5
    82 => 0,      // Mac Johab (old)
    83 => 10005,  // Mac Hebrew
    84 => 10004,  // Mac Arabic
    85 => 10006,  // Mac Greek
    86 => 10081,  // Mac Turkish
    87 => 10021,  // Mac Thai
    88 => 10029,  // Mac East Europe
    89 => 10007,  // Mac Russian
    128 => 932,   // Shift JIS
    129 => 949,   // Hangul
    130 => 1361,  // Johab
    134 => 936,   // GB2312
    136 => 950,   // Big5
    161 => 1253,  // Greek
    162 => 1254,  // Turkish
    163 => 1258,  // Vietnamese
    177 => 1255,  // Hebrew
    178 => 1256,  // Arabic
    179 => 0,     // Arabic Traditional (old)
    180 => 0,     // Arabic user (old)
    181 => 0,     // Hebrew user (old)
    186 => 1257,  // Baltic
    204 => 1251,  // Russian
    222 => 874,   // Thai
    238 => 1250,  // Eastern European
    254 => 437,   // PC 437
    255 => 850,   // OEM
    _ => 0,       // Anything else is treated like the default character set
};
40123

41-
// For reference: https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
124+
// Lookup table from code page number to the encoding name that
// EncodingFromCodePage passes to Encoding.GetEncoding(string).
internal static readonly Dictionary<int, string> _codePages = new()
{
    [37] = "IBM037",
    [437] = "IBM437",
    [500] = "IBM500",
    [708] = "ASMO-708",
    [720] = "DOS-720",
    [737] = "ibm737",
    [775] = "ibm775",
    [819] = "ISO-8859-1",
    [850] = "ibm850",
    [852] = "ibm852",
    [855] = "IBM855",
    [857] = "ibm857",
    [858] = "IBM00858",
    [860] = "IBM860",
    [861] = "ibm861",
    [862] = "DOS-862",
    [863] = "IBM863",
    [864] = "IBM864",
    [865] = "IBM865",
    [866] = "cp866",
    [869] = "ibm869",
    [870] = "IBM870",
    [874] = "windows-874",
    [875] = "cp875",
    [932] = "shift_jis",
    [936] = "gb2312",
    [949] = "ks_c_5601-1987",
    [950] = "big5",
    [1026] = "IBM1026",
    [1047] = "IBM01047",
    [1140] = "IBM01140",
    [1141] = "IBM01141",
    [1142] = "IBM01142",
    [1143] = "IBM01143",
    [1144] = "IBM01144",
    [1145] = "IBM01145",
    [1146] = "IBM01146",
    [1147] = "IBM01147",
    [1148] = "IBM01148",
    [1149] = "IBM01149",
    [1200] = "utf-16",
    [1201] = "utf-16BE",
    [1250] = "windows-1250",
    [1251] = "windows-1251",
    [1252] = "windows-1252",
    [1253] = "windows-1253",
    [1254] = "windows-1254",
    [1255] = "windows-1255",
    [1256] = "windows-1256",
    [1257] = "windows-1257",
    [1258] = "windows-1258",
    [1361] = "Johab",
    [10000] = "macintosh",
    [10001] = "x-mac-japanese",
    [10002] = "x-mac-chinesetrad",
    [10003] = "x-mac-korean",
    [10004] = "x-mac-arabic",
    [10005] = "x-mac-hebrew",
    [10006] = "x-mac-greek",
    [10007] = "x-mac-cyrillic",
    [10008] = "x-mac-chinesesimp",
    [10010] = "x-mac-romanian",
    [10017] = "x-mac-ukrainian",
    [10021] = "x-mac-thai",
    [10029] = "x-mac-ce",
    [10079] = "x-mac-icelandic",
    [10081] = "x-mac-turkish",
    [10082] = "x-mac-croatian",
    [12000] = "utf-32",
    [12001] = "utf-32BE",
    [20000] = "x-Chinese-CNS",
    [20001] = "x-cp20001",
    [20002] = "x-Chinese-Eten",
    [20003] = "x-cp20003",
    [20004] = "x-cp20004",
    [20005] = "x-cp20005",
    [20105] = "x-IA5",
    [20106] = "x-IA5-German",
    [20107] = "x-IA5-Swedish",
    [20108] = "x-IA5-Norwegian",
    [20127] = "us-ascii",
    [20261] = "x-cp20261",
    [20269] = "x-cp20269",
    [20273] = "IBM273",
    [20277] = "IBM277",
    [20278] = "IBM278",
    [20280] = "IBM280",
    [20284] = "IBM284",
    [20285] = "IBM285",
    [20290] = "IBM290",
    [20297] = "IBM297",
    [20420] = "IBM420",
    [20423] = "IBM423",
    [20424] = "IBM424",
    [20833] = "x-EBCDIC-KoreanExtended",
    [20838] = "IBM-Thai",
    [20866] = "koi8-r",
    [20871] = "IBM871",
    [20880] = "IBM880",
    [20905] = "IBM905",
    [20924] = "IBM00924",
    [20932] = "EUC-JP",
    [20936] = "x-cp20936",
    [20949] = "x-cp20949",
    [21025] = "cp1025",
    [21866] = "koi8-u",
    [28591] = "iso-8859-1",
    [28592] = "iso-8859-2",
    [28593] = "iso-8859-3",
    [28594] = "iso-8859-4",
    [28595] = "iso-8859-5",
    [28596] = "iso-8859-6",
    [28597] = "iso-8859-7",
    [28598] = "iso-8859-8",
    [28599] = "iso-8859-9",
    [28603] = "iso-8859-13",
    [28605] = "iso-8859-15",
    [29001] = "x-Europa",
    [38598] = "iso-8859-8-i",
    [50220] = "iso-2022-jp",
    [50221] = "csISO2022JP",
    [50222] = "iso-2022-jp",
    [50225] = "iso-2022-kr",
    [50227] = "x-cp50227",
    [51932] = "euc-jp",
    [51936] = "EUC-CN",
    [51949] = "euc-kr",
    [52936] = "hz-gb-2312",
    [54936] = "GB18030",
    [57002] = "x-iscii-de",
    [57003] = "x-iscii-be",
    [57004] = "x-iscii-ta",
    [57005] = "x-iscii-te",
    [57006] = "x-iscii-as",
    [57007] = "x-iscii-or",
    [57008] = "x-iscii-ka",
    [57009] = "x-iscii-ma",
    [57010] = "x-iscii-gu",
    [57011] = "x-iscii-pa",
    [65000] = "utf-7",
    [65001] = "utf-8",
};
268+
269+
// Web names of the encodings that IsEastAsian reports as East Asian.
// The set is case-insensitive so that a match does not depend on the casing
// a given runtime uses for Encoding.WebName (for example "EUC-JP" vs "euc-jp").
private static readonly HashSet<string> _eastAsianEncodings =
    new HashSet<string>(System.StringComparer.OrdinalIgnoreCase)
    {
        "windows-874",
        "shift_jis",
        "gb2312",
        "ks_c_5601-1987",
        "big5",
        "euc-jp",
        "euc-cn",
        "euc-kr",
    };

/// <summary>
/// Determines whether the given encoding is one of the encodings listed in
/// <c>_eastAsianEncodings</c>, matched by its <see cref="Encoding.WebName"/>.
/// </summary>
/// <param name="encoding">The encoding to check; may be <c>null</c>.</param>
/// <returns>
/// <c>true</c> if the encoding's web name is in the set (ignoring case);
/// <c>false</c> otherwise, including when <paramref name="encoding"/> is <c>null</c>.
/// </returns>
public static bool IsEastAsian(Encoding encoding)
{
    if (encoding == null)
    {
        return false;
    }

    return _eastAsianEncodings.Contains(encoding.WebName);
}
286+
287+
/// <summary>
/// Resolves the encoding for a character set identifier by first translating it
/// to a code page with <see cref="GetCodePage"/> and then resolving that code page
/// with <see cref="EncodingFromCodePage"/>.
/// </summary>
/// <param name="charSet">The character set identifier.</param>
/// <returns>The encoding returned by <see cref="EncodingFromCodePage"/> for the mapped code page.</returns>
public static Encoding EncodingFromCharSet(int charSet)
{
    int mappedCodePage = GetCodePage(charSet);
    return EncodingFromCodePage(mappedCodePage);
}
291+
292+
/// <summary>
/// Resolves an <see cref="Encoding"/> for the given code page number.
/// </summary>
/// <param name="codePage">
/// The code page number. 0 (default) and 42 (the "symbol" pseudo code page)
/// resolve to <see cref="ANSI"/>.
/// </param>
/// <returns>
/// The encoding registered under the name stored in <c>_codePages</c> for this code page.
/// If the code page is not in the table, the runtime is asked for it by number.
/// If neither lookup succeeds, <see cref="ANSI"/> is returned instead of throwing.
/// </returns>
public static Encoding EncodingFromCodePage(int codePage)
{
    // 0 = default, 42 = "symbol fake" code page: neither identifies a real encoding.
    if (codePage == 0 || codePage == 42)
    {
        return Encodings.ANSI;
    }

    try
    {
        // Prefer the table so known code pages keep resolving exactly as before;
        // for code pages missing from the table, try the numeric lookup
        // rather than failing with KeyNotFoundException.
        return _codePages.TryGetValue(codePage, out var encodingName)
            ? Encoding.GetEncoding(encodingName)
            : Encoding.GetEncoding(codePage);
    }
    catch (System.Exception ex) when (ex is System.ArgumentException || ex is System.NotSupportedException)
    {
        // The runtime does not know this code page or name (or the number is out of range):
        // fall back to the same encoding used for the default code page.
        return Encodings.ANSI;
    }
}
42304
}
43305

44306
public class BinaryEncoding : Encoding
@@ -77,4 +339,4 @@ public override int GetMaxCharCount(int byteCount)
77339
{
78340
return byteCount;
79341
}
80-
}
342+
}

src/DocSharp.Common/Writers/RtfStringWriter.cs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,9 @@ public void WriteRtfUnicodeChar(int charCode)
6363
Write(RtfHelpers.ConvertUnicodeChar(charCode));
6464
}
6565

66-
public void WriteRtfHeader()
66+
public void WriteRtfHeader(int codePage)
6767
{
68-
Write(@"{\rtf1\ansi\deff0\nouicompat");
68+
Write(@$"{{\rtf1\ansi\ansicpg{codePage}\deff0\nouicompat");
6969
}
7070

7171
public void Write(System.Drawing.Color color)
@@ -197,12 +197,12 @@ public void WriteRtfDate(DateTime value)
197197
Date/time references use the following bit field structure (DTTM):
198198
| Bit numbers | Information | Range |
199199
| ----------- | ------------ | ----------------|
200-
| 05 | Minute | 059 |
201-
| 610 | Hour | 023 |
202-
| 1115 | Day of month | 131 |
203-
| 1619 | Month | 112 |
204-
| 2028 | Year | = Year 1900 |
205-
| 2931 | Day of week | 0 (Sun)6 (Sat) |
200+
| 0-5 | Minute | 0-59 |
201+
| 6-10 | Hour | 0-23 |
202+
| 11-15 | Day of month | 1-31 |
203+
| 16-19 | Month | 1-12 |
204+
| 20-28 | Year | = Year - 1900 |
205+
| 29-31 | Day of week | 0 (Sun)-6 (Sat) |
206206
*/
207207

208208
long nDT = (uint)value.DayOfWeek;

src/DocSharp.Docx/DocxToHtml/DocxToHtmlConverter.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ namespace DocSharp.Docx;
2020
/// </summary>
2121
public partial class DocxToHtmlConverter : DocxToXmlWriterBase<HtmlTextWriter>
2222
{
23+
public override Encoding DefaultEncoding => Encodings.UTF8NoBOM;
24+
2325
public override HtmlTextWriter CreateXmlWriter(TextWriter writer) => new HtmlTextWriter(writer);
2426

2527
/// <summary>

src/DocSharp.Docx/DocxToMarkdownConverter.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ namespace DocSharp.Docx;
2424
/// </summary>
2525
public class DocxToMarkdownConverter : DocxToStringWriterBase<MarkdownStringWriter>
2626
{
27+
public override Encoding DefaultEncoding => Encodings.UTF8NoBOM;
28+
2729
/// <summary>
2830
/// If this property is set to a directory, images will be exported to that folder
2931
/// and a reference will be added in Markdown syntax,

src/DocSharp.Docx/DocxToRtf/DocxToRtfConverter.cs

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,26 @@ namespace DocSharp.Docx;
2222
/// </summary>
2323
public partial class DocxToRtfConverter : DocxToStringWriterBase<RtfStringWriter>
2424
{
25+
/// <summary>
26+
/// This property should not be changed.
27+
/// According to the specification, RTF writer should use ASCII (chars 0-127)
28+
/// and escape other chars as '\xx (using code pages for chars 128-255)
29+
/// or as \uc1\u{X} (using Unicode).
30+
/// If an old reader does not support Unicode it will skip these characters rather than corrupting the document.
31+
/// </summary>
32+
public override Encoding DefaultEncoding => Encoding.ASCII;
33+
34+
/// <summary>
35+
/// Specifies the default code page number, that will be written in the RTF header and used to encode special characters.
36+
/// For example, "à" is encoded as \'e0 when the encoding is Windows-1252.
37+
/// For characters not available in the code page, Unicode will be used (e.g. \uc1\u915).
38+
/// By default, the code page of the system region is used and changing it is not recommended.
39+
/// A full list of code pages can be found at https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers,
40+
/// but note that not all code pages are supported in RTF (notably, UTF-8 is not supported).
41+
/// The RTF specification can be found in the repository.
42+
/// </summary>
43+
public int DefaultCodePage { get; set; } = Encodings.SystemCodePage;
44+
2545
/// <summary>
2646
/// Gets or sets the default font and paragraph properties used in (rare) cases where
2747
/// they are not specified in the document body, styles, or default style.
@@ -59,7 +79,7 @@ public override void Append(WordprocessingDocument inputDocument, string outputF
5979

6080
internal override void ProcessDocument(Document document, RtfStringWriter sb)
6181
{
62-
sb.WriteRtfHeader();
82+
sb.WriteRtfHeader(DefaultCodePage);
6383

6484
if (document.MainDocumentPart?.StyleDefinitionsPart?.Styles is Styles styles)
6585
{

0 commit comments

Comments
 (0)