77
88namespace DocSharp ;
99
10+ // For reference:
11+ // https://learn.microsoft.com/en-us/windows/win32/intl/character-sets
12+ // https://learn.microsoft.com/en-us/windows/win32/intl/code-pages
13+ // https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
14+
1015public static class Encodings
1116{
1217
@@ -32,13 +37,270 @@ static Encodings()
3237
/// <summary>
/// UTF-16 encoding with big-endian byte order (<see cref="Encoding.BigEndianUnicode"/>).
/// </summary>
public static Encoding UTF16BE
{
    get { return Encoding.BigEndianUnicode; }
}
3439
35- public static Encoding ISO8859_1 => Encoding . GetEncoding ( 28591 ) ; // Same as Latin1 (not available on .NET Framework)
/// <summary>
/// ISO-8859-1 encoding, resolved through code page 28591 (same as Latin1).
/// </summary>
public static Encoding ISO8859_1
{
    get { return Encoding.GetEncoding(28591); }
}
3641
/// <summary>
/// Encoding for the ANSI code page reported by the current culture
/// (<c>CultureInfo.CurrentCulture.TextInfo.ANSICodePage</c>).
/// </summary>
public static Encoding ANSI
{
    get
    {
        int ansiCodePage = CultureInfo.CurrentCulture.TextInfo.ANSICodePage;
        return Encoding.GetEncoding(ansiCodePage);
    }
}
3843
39- public static BinaryEncoding Binary => new BinaryEncoding ( ) ;
/// <summary>
/// ANSI code page identifier reported by the current culture's <see cref="TextInfo"/>.
/// </summary>
public static int SystemCodePage
{
    get
    {
        TextInfo currentTextInfo = CultureInfo.CurrentCulture.TextInfo;
        return currentTextInfo.ANSICodePage;
    }
}
45+
/// <summary>
/// Translates a character set identifier into a code page identifier.
/// </summary>
/// <param name="charSet">
/// Character set identifier. NOTE(review): the values look like the Windows character-set
/// constants described by the reference links at the top of this file - confirm.
/// </param>
/// <returns>
/// The mapped code page identifier, or 0 when the character set is the "default" one,
/// one of the old sets without a mapping, or not listed at all.
/// </returns>
public static int GetCodePage(int charSet) => charSet switch
{
    0 => 1252,    // ANSI
    1 => 0,       // Default
    2 => 42,      // Symbol

    // Macintosh character sets
    77 => 10000,  // Mac Roman
    78 => 10001,  // Mac Shift Jis
    79 => 10003,  // Mac Hangul
    80 => 10008,  // Mac GB2312
    81 => 10002,  // Mac Big5
    82 => 0,      // Mac Johab (old)
    83 => 10005,  // Mac Hebrew
    84 => 10004,  // Mac Arabic
    85 => 10006,  // Mac Greek
    86 => 10081,  // Mac Turkish
    87 => 10021,  // Mac Thai
    88 => 10029,  // Mac East Europe
    89 => 10007,  // Mac Russian

    // East Asian character sets
    128 => 932,   // Shift JIS
    129 => 949,   // Hangul
    130 => 1361,  // Johab
    134 => 936,   // GB2312
    136 => 950,   // Big5

    // Windows character sets
    161 => 1253,  // Greek
    162 => 1254,  // Turkish
    163 => 1258,  // Vietnamese
    177 => 1255,  // Hebrew
    178 => 1256,  // Arabic
    179 => 0,     // Arabic Traditional (old)
    180 => 0,     // Arabic user (old)
    181 => 0,     // Hebrew user (old)
    186 => 1257,  // Baltic
    204 => 1251,  // Russian
    222 => 874,   // Thai
    238 => 1250,  // Eastern European

    // OEM character sets
    254 => 437,   // PC 437
    255 => 850,   // OEM

    // Anything not listed above has no mapping.
    _ => 0,
};
40123
41- // For reference: https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
/// <summary>
/// Lookup table from code page identifier to the encoding name that is handed to
/// <c>Encoding.GetEncoding(string)</c> when resolving a code page.
/// Note that a few identifiers share a name (e.g. 50220 and 50222 are both "iso-2022-jp").
/// </summary>
internal static readonly Dictionary<int, string> _codePages = new Dictionary<int, string>
{
    // IBM / DOS (OEM) code pages
    [37] = "IBM037",
    [437] = "IBM437",
    [500] = "IBM500",
    [708] = "ASMO-708",
    [720] = "DOS-720",
    [737] = "ibm737",
    [775] = "ibm775",
    [819] = "ISO-8859-1",
    [850] = "ibm850",
    [852] = "ibm852",
    [855] = "IBM855",
    [857] = "ibm857",
    [858] = "IBM00858",
    [860] = "IBM860",
    [861] = "ibm861",
    [862] = "DOS-862",
    [863] = "IBM863",
    [864] = "IBM864",
    [865] = "IBM865",
    [866] = "cp866",
    [869] = "ibm869",
    [870] = "IBM870",
    [874] = "windows-874",
    [875] = "cp875",
    [932] = "shift_jis",
    [936] = "gb2312",
    [949] = "ks_c_5601-1987",
    [950] = "big5",
    [1026] = "IBM1026",
    [1047] = "IBM01047",
    [1140] = "IBM01140",
    [1141] = "IBM01141",
    [1142] = "IBM01142",
    [1143] = "IBM01143",
    [1144] = "IBM01144",
    [1145] = "IBM01145",
    [1146] = "IBM01146",
    [1147] = "IBM01147",
    [1148] = "IBM01148",
    [1149] = "IBM01149",

    // UTF-16 and Windows code pages
    [1200] = "utf-16",
    [1201] = "utf-16BE",
    [1250] = "windows-1250",
    [1251] = "windows-1251",
    [1252] = "windows-1252",
    [1253] = "windows-1253",
    [1254] = "windows-1254",
    [1255] = "windows-1255",
    [1256] = "windows-1256",
    [1257] = "windows-1257",
    [1258] = "windows-1258",
    [1361] = "Johab",

    // Macintosh code pages
    [10000] = "macintosh",
    [10001] = "x-mac-japanese",
    [10002] = "x-mac-chinesetrad",
    [10003] = "x-mac-korean",
    [10004] = "x-mac-arabic",
    [10005] = "x-mac-hebrew",
    [10006] = "x-mac-greek",
    [10007] = "x-mac-cyrillic",
    [10008] = "x-mac-chinesesimp",
    [10010] = "x-mac-romanian",
    [10017] = "x-mac-ukrainian",
    [10021] = "x-mac-thai",
    [10029] = "x-mac-ce",
    [10079] = "x-mac-icelandic",
    [10081] = "x-mac-turkish",
    [10082] = "x-mac-croatian",

    // UTF-32
    [12000] = "utf-32",
    [12001] = "utf-32BE",

    // 20000-series code pages
    [20000] = "x-Chinese-CNS",
    [20001] = "x-cp20001",
    [20002] = "x-Chinese-Eten",
    [20003] = "x-cp20003",
    [20004] = "x-cp20004",
    [20005] = "x-cp20005",
    [20105] = "x-IA5",
    [20106] = "x-IA5-German",
    [20107] = "x-IA5-Swedish",
    [20108] = "x-IA5-Norwegian",
    [20127] = "us-ascii",
    [20261] = "x-cp20261",
    [20269] = "x-cp20269",
    [20273] = "IBM273",
    [20277] = "IBM277",
    [20278] = "IBM278",
    [20280] = "IBM280",
    [20284] = "IBM284",
    [20285] = "IBM285",
    [20290] = "IBM290",
    [20297] = "IBM297",
    [20420] = "IBM420",
    [20423] = "IBM423",
    [20424] = "IBM424",
    [20833] = "x-EBCDIC-KoreanExtended",
    [20838] = "IBM-Thai",
    [20866] = "koi8-r",
    [20871] = "IBM871",
    [20880] = "IBM880",
    [20905] = "IBM905",
    [20924] = "IBM00924",
    [20932] = "EUC-JP",
    [20936] = "x-cp20936",
    [20949] = "x-cp20949",
    [21025] = "cp1025",
    [21866] = "koi8-u",

    // ISO 8859 family
    [28591] = "iso-8859-1",
    [28592] = "iso-8859-2",
    [28593] = "iso-8859-3",
    [28594] = "iso-8859-4",
    [28595] = "iso-8859-5",
    [28596] = "iso-8859-6",
    [28597] = "iso-8859-7",
    [28598] = "iso-8859-8",
    [28599] = "iso-8859-9",
    [28603] = "iso-8859-13",
    [28605] = "iso-8859-15",
    [29001] = "x-Europa",
    [38598] = "iso-8859-8-i",

    // ISO 2022, EUC and other East Asian encodings
    [50220] = "iso-2022-jp",
    [50221] = "csISO2022JP",
    [50222] = "iso-2022-jp",
    [50225] = "iso-2022-kr",
    [50227] = "x-cp50227",
    [51932] = "euc-jp",
    [51936] = "EUC-CN",
    [51949] = "euc-kr",
    [52936] = "hz-gb-2312",
    [54936] = "GB18030",

    // ISCII
    [57002] = "x-iscii-de",
    [57003] = "x-iscii-be",
    [57004] = "x-iscii-ta",
    [57005] = "x-iscii-te",
    [57006] = "x-iscii-as",
    [57007] = "x-iscii-or",
    [57008] = "x-iscii-ka",
    [57009] = "x-iscii-ma",
    [57010] = "x-iscii-gu",
    [57011] = "x-iscii-pa",

    // UTF-7 / UTF-8
    [65000] = "utf-7",
    [65001] = "utf-8",
};
268+
/// <summary>
/// Encoding names (compared against <c>Encoding.WebName</c>) that are treated as East Asian.
/// </summary>
/// <remarks>
/// The set is read-only and case-insensitive: encoding names are not case-significant, and
/// the casing of the reported name is not uniform (the code page table in this class lists
/// both "EUC-JP" and "euc-jp"), so an ordinal, case-sensitive match could miss an encoding.
/// </remarks>
private static readonly HashSet<string> _eastAsianEncodings = new HashSet<string>(System.StringComparer.OrdinalIgnoreCase)
{
    "windows-874",
    "shift_jis",
    "gb2312",
    "ks_c_5601-1987",
    "big5",
    "EUC-JP", // also matches "euc-jp" now that the comparison ignores case
    "EUC-CN",
    "euc-kr",
};
281+
/// <summary>
/// Tells whether the given encoding is one of the encodings this class lists as East Asian.
/// </summary>
/// <param name="encoding">Encoding to check; may be null.</param>
/// <returns>
/// True when the encoding's <c>WebName</c> is in the East Asian name set;
/// false otherwise, including when <paramref name="encoding"/> or its name is null.
/// </returns>
public static bool IsEastAsian(Encoding encoding)
{
    string webName = encoding == null ? null : encoding.WebName;
    if (webName == null)
    {
        // A missing encoding or name is looked up as the empty string, like any other name.
        webName = "";
    }

    return _eastAsianEncodings.Contains(webName);
}
286+
/// <summary>
/// Resolves the encoding for a character set identifier by first mapping it to a code page
/// (<c>GetCodePage</c>) and then resolving that code page (<c>EncodingFromCodePage</c>).
/// </summary>
/// <param name="charSet">Character set identifier.</param>
/// <returns>The encoding resolved for the mapped code page.</returns>
public static Encoding EncodingFromCharSet(int charSet)
{
    int codePage = GetCodePage(charSet);
    return EncodingFromCodePage(codePage);
}
291+
/// <summary>
/// Resolves the encoding for a code page identifier, falling back to the ANSI encoding
/// when the code page cannot be resolved.
/// </summary>
/// <param name="codePage">Code page identifier.</param>
/// <returns>
/// The ANSI encoding for 0 (default) and 42 (the "Symbol fake" code page); the encoding
/// looked up by name for code pages present in the code page table; otherwise the encoding
/// looked up by number. If that lookup fails, the ANSI encoding is returned instead of throwing.
/// </returns>
/// <remarks>
/// NOTE(review): some table entries share a name (50220 and 50222 are both "iso-2022-jp"),
/// so a by-name lookup cannot tell them apart - confirm whether that is intended.
/// </remarks>
public static Encoding EncodingFromCodePage(int codePage)
{
    // 0 = default, 42 = "Symbol fake" code page: neither is a real encoding.
    if (codePage == 0 || codePage == 42)
    {
        return Encodings.ANSI;
    }

    try
    {
        // Indexing the table directly would throw KeyNotFoundException for any code page
        // that is not listed; try the numeric lookup for those instead.
        return _codePages.TryGetValue(codePage, out var encodingName)
            ? Encoding.GetEncoding(encodingName)
            : Encoding.GetEncoding(codePage);
    }
    catch (System.ArgumentException)
    {
        // Unknown name or invalid / out-of-range code page number.
        return Encodings.ANSI;
    }
    catch (System.NotSupportedException)
    {
        // Code page is not supported on this platform.
        return Encodings.ANSI;
    }
}
42304}
43305
44306public class BinaryEncoding : Encoding
@@ -77,4 +339,4 @@ public override int GetMaxCharCount(int byteCount)
77339 {
78340 return byteCount ;
79341 }
80- }
342+ }
0 commit comments