Skip to content

Commit a2ab5fb

Browse files
committed
Add support for UTF-8 encoding.
Add comments noting details of other encodings
1 parent 4becf02 commit a2ab5fb

File tree

1 file changed

+142
-0
lines changed

1 file changed

+142
-0
lines changed

RTF Parser Kit/src/com/rtfparserkit/parser/standard/Encoding.java

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,37 @@ class Encoding
3232
public static final Map<String, String> LOCALEID_MAPPING = new HashMap<String, String>();
3333
static
3434
{
35+
// Comment lines based on: https://msdn.microsoft.com/en-us/library/windows/desktop/dd317756(v=vs.85).aspx
36+
37+
// 037 IBM037 IBM EBCDIC US-Canada
38+
// 437 IBM437 OEM United States
39+
// 500 IBM500 IBM EBCDIC International
40+
// 708 ASMO-708 Arabic (ASMO 708)
41+
// 709 Arabic (ASMO-449+, BCON V4)
42+
// 710 Arabic - Transparent Arabic
43+
// 720 DOS-720 Arabic (Transparent ASMO); Arabic (DOS)
44+
// 737 ibm737 OEM Greek (formerly 437G); Greek (DOS)
45+
// 775 ibm775 OEM Baltic; Baltic (DOS)
46+
// 850 ibm850 OEM Multilingual Latin 1; Western European (DOS)
47+
// 852 ibm852 OEM Latin 2; Central European (DOS)
48+
// 855 IBM855 OEM Cyrillic (primarily Russian)
49+
// 857 ibm857 OEM Turkish; Turkish (DOS)
50+
// 858 IBM00858 OEM Multilingual Latin 1 + Euro symbol
51+
// 860 IBM860 OEM Portuguese; Portuguese (DOS)
52+
// 861 ibm861 OEM Icelandic; Icelandic (DOS)
53+
// 862 DOS-862 OEM Hebrew; Hebrew (DOS)
54+
// 863 IBM863 OEM French Canadian; French Canadian (DOS)
55+
// 864 IBM864 OEM Arabic; Arabic (864)
56+
// 865 IBM865 OEM Nordic; Nordic (DOS)
57+
// 866 cp866 OEM Russian; Cyrillic (DOS)
58+
// 869 ibm869 OEM Modern Greek; Greek, Modern (DOS)
59+
// 870 IBM870 IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
60+
// 874 windows-874 ANSI/OEM Thai (ISO 8859-11); Thai (Windows)
61+
// 875 cp875 IBM EBCDIC Greek Modern
3562
LOCALEID_MAPPING.put("932", "SJIS"); // Japanese
3663
LOCALEID_MAPPING.put("936", "Cp936"); // Simplified Chinese
3764
LOCALEID_MAPPING.put("949", "Cp949"); // Korean
65+
// 950 big5 ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
3866
LOCALEID_MAPPING.put("1025", "Cp1256"); // Arabic (Saudi Arabia)
3967
LOCALEID_MAPPING.put("1026", "Cp1251"); // Bulgarian
4068
LOCALEID_MAPPING.put("1028", "Cp950"); // Chinese (Taiwan)
@@ -45,6 +73,7 @@ class Encoding
4573
LOCALEID_MAPPING.put("1041", "SJIS"); // Japanese
4674
LOCALEID_MAPPING.put("1042", "Cp949"); // Korean
4775
LOCALEID_MAPPING.put("1045", "Cp1250"); // Polish
76+
// 1047 IBM01047 IBM EBCDIC Latin 1/Open System
4877
LOCALEID_MAPPING.put("1048", "Cp1250"); // Romanian
4978
LOCALEID_MAPPING.put("1049", "Cp1251"); // Russian
5079
LOCALEID_MAPPING.put("1050", "Cp1250"); // Croatian
@@ -68,13 +97,28 @@ class Encoding
6897
LOCALEID_MAPPING.put("1091", "Cp1254"); // Uzbek (Latin)
6998
LOCALEID_MAPPING.put("1092", "Cp1251"); // Tatar
7099
LOCALEID_MAPPING.put("1104", "Cp1251"); // Mongolian (Cyrillic)
100+
// 1140 IBM01140 IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)
101+
// 1141 IBM01141 IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
102+
// 1142 IBM01142 IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)
103+
// 1143 IBM01143 IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)
104+
// 1144 IBM01144 IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)
105+
// 1145 IBM01145 IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)
106+
// 1146 IBM01146 IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)
107+
// 1147 IBM01147 IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)
108+
// 1148 IBM01148 IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)
109+
// 1149 IBM01149 IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)
110+
// 1200 utf-16 Unicode UTF-16, little endian byte order (BMP of ISO 10646)
111+
// 1201 unicodeFFFE Unicode UTF-16, big endian byte order
71112
LOCALEID_MAPPING.put("1250", "Cp1250"); // Windows Latin 2 (Central Europe)
72113
LOCALEID_MAPPING.put("1251", "Cp1251"); // Cyrillic
73114
LOCALEID_MAPPING.put("1252", "Cp1252"); // Latin
74115
LOCALEID_MAPPING.put("1253", "Cp1253"); // Greek
75116
LOCALEID_MAPPING.put("1254", "Cp1254"); // Turkish
76117
LOCALEID_MAPPING.put("1255", "Cp1255"); // Windows Hebrew
77118
LOCALEID_MAPPING.put("1256", "Cp1256"); // Windows Arabic
119+
LOCALEID_MAPPING.put("1257", "Cp1257"); // Baltic
120+
LOCALEID_MAPPING.put("1258", "Cp1258"); // Vietnamese
121+
// 1361 Johab Korean (Johab)
78122
LOCALEID_MAPPING.put("2049", "Cp1256"); // Arabic (Iraq)
79123
LOCALEID_MAPPING.put("2052", "MS936"); // Chinese (PRC)
80124
LOCALEID_MAPPING.put("2074", "Cp1250"); // Serbian (Latin)
@@ -92,12 +136,110 @@ class Encoding
92136
LOCALEID_MAPPING.put("8193", "Cp1256"); // Arabic (Oman)
93137
LOCALEID_MAPPING.put("9217", "Cp1256"); // Arabic (Yemen)
94138
LOCALEID_MAPPING.put("10000", "MacRoman"); // Mac Roman
139+
// 10001 x-mac-japanese Japanese (Mac)
140+
// 10002 x-mac-chinesetrad MAC Traditional Chinese (Big5); Chinese Traditional (Mac)
141+
// 10003 x-mac-korean Korean (Mac)
142+
// 10004 x-mac-arabic Arabic (Mac)
143+
// 10005 x-mac-hebrew Hebrew (Mac)
144+
// 10006 x-mac-greek Greek (Mac)
145+
// 10007 x-mac-cyrillic Cyrillic (Mac)
146+
// 10008 x-mac-chinesesimp MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)
147+
// 10010 x-mac-romanian Romanian (Mac)
148+
// 10017 x-mac-ukrainian Ukrainian (Mac)
149+
// 10021 x-mac-thai Thai (Mac)
150+
// 10029 x-mac-ce MAC Latin 2; Central European (Mac)
151+
// 10079 x-mac-icelandic Icelandic (Mac)
152+
// 10081 x-mac-turkish Turkish (Mac)
153+
// 10082 x-mac-croatian Croatian (Mac)
95154
LOCALEID_MAPPING.put("10241", "Cp1256"); // Arabic (Syria)
96155
LOCALEID_MAPPING.put("11265", "Cp1256"); // Arabic (Jordan)
156+
// 12000 utf-32 Unicode UTF-32, little endian byte order
157+
// 12001 utf-32BE Unicode UTF-32, big endian byte order
97158
LOCALEID_MAPPING.put("12289", "Cp1256"); // Arabic (Lebanon)
98159
LOCALEID_MAPPING.put("13313", "Cp1256"); // Arabic (Kuwait)
99160
LOCALEID_MAPPING.put("14337", "Cp1256"); // Arabic (U.A.E.)
100161
LOCALEID_MAPPING.put("15361", "Cp1256"); // Arabic (Bahrain)
101162
LOCALEID_MAPPING.put("16385", "Cp1256"); // Arabic (Qatar)
163+
// 20000 x-Chinese_CNS CNS Taiwan; Chinese Traditional (CNS)
164+
// 20001 x-cp20001 TCA Taiwan
165+
// 20002 x_Chinese-Eten Eten Taiwan; Chinese Traditional (Eten)
166+
// 20003 x-cp20003 IBM5550 Taiwan
167+
// 20004 x-cp20004 TeleText Taiwan
168+
// 20005 x-cp20005 Wang Taiwan
169+
// 20105 x-IA5 IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)
170+
// 20106 x-IA5-German IA5 German (7-bit)
171+
// 20107 x-IA5-Swedish IA5 Swedish (7-bit)
172+
// 20108 x-IA5-Norwegian IA5 Norwegian (7-bit)
173+
// 20127 us-ascii US-ASCII (7-bit)
174+
// 20261 x-cp20261 T.61
175+
// 20269 x-cp20269 ISO 6937 Non-Spacing Accent
176+
// 20273 IBM273 IBM EBCDIC Germany
177+
// 20277 IBM277 IBM EBCDIC Denmark-Norway
178+
// 20278 IBM278 IBM EBCDIC Finland-Sweden
179+
// 20280 IBM280 IBM EBCDIC Italy
180+
// 20284 IBM284 IBM EBCDIC Latin America-Spain
181+
// 20285 IBM285 IBM EBCDIC United Kingdom
182+
// 20290 IBM290 IBM EBCDIC Japanese Katakana Extended
183+
// 20297 IBM297 IBM EBCDIC France
184+
// 20420 IBM420 IBM EBCDIC Arabic
185+
// 20423 IBM423 IBM EBCDIC Greek
186+
// 20424 IBM424 IBM EBCDIC Hebrew
187+
// 20833 x-EBCDIC-KoreanExtended IBM EBCDIC Korean Extended
188+
// 20838 IBM-Thai IBM EBCDIC Thai
189+
// 20866 koi8-r Russian (KOI8-R); Cyrillic (KOI8-R)
190+
// 20871 IBM871 IBM EBCDIC Icelandic
191+
// 20880 IBM880 IBM EBCDIC Cyrillic Russian
192+
// 20905 IBM905 IBM EBCDIC Turkish
193+
// 20924 IBM00924 IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)
194+
// 20932 EUC-JP Japanese (JIS 0208-1990 and 0212-1990)
195+
// 20936 x-cp20936 Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
196+
// 20949 x-cp20949 Korean Wansung
197+
// 21025 cp1025 IBM EBCDIC Cyrillic Serbian-Bulgarian
198+
// 21027 (deprecated)
199+
// 21866 koi8-u Ukrainian (KOI8-U); Cyrillic (KOI8-U)
200+
// 28591 iso-8859-1 ISO 8859-1 Latin 1; Western European (ISO)
201+
// 28592 iso-8859-2 ISO 8859-2 Central European; Central European (ISO)
202+
// 28593 iso-8859-3 ISO 8859-3 Latin 3
203+
// 28594 iso-8859-4 ISO 8859-4 Baltic
204+
// 28595 iso-8859-5 ISO 8859-5 Cyrillic
205+
// 28596 iso-8859-6 ISO 8859-6 Arabic
206+
// 28597 iso-8859-7 ISO 8859-7 Greek
207+
// 28598 iso-8859-8 ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
208+
// 28599 iso-8859-9 ISO 8859-9 Turkish
209+
// 28603 iso-8859-13 ISO 8859-13 Estonian
210+
// 28605 iso-8859-15 ISO 8859-15 Latin 9
211+
// 29001 x-Europa Europa 3
212+
// 38598 iso-8859-8-i ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
213+
// 50220 iso-2022-jp ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
214+
// 50221 csISO2022JP ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)
215+
// 50222 iso-2022-jp ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)
216+
// 50225 iso-2022-kr ISO 2022 Korean
217+
// 50227 x-cp50227 ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)
218+
// 50229 ISO 2022 Traditional Chinese
219+
// 50930 EBCDIC Japanese (Katakana) Extended
220+
// 50931 EBCDIC US-Canada and Japanese
221+
// 50933 EBCDIC Korean Extended and Korean
222+
// 50935 EBCDIC Simplified Chinese Extended and Simplified Chinese
223+
// 50936 EBCDIC Simplified Chinese
224+
// 50937 EBCDIC US-Canada and Traditional Chinese
225+
// 50939 EBCDIC Japanese (Latin) Extended and Japanese
226+
// 51932 euc-jp EUC Japanese
227+
// 51936 EUC-CN EUC Simplified Chinese; Chinese Simplified (EUC)
228+
// 51949 euc-kr EUC Korean
229+
// 51950 EUC Traditional Chinese
230+
// 52936 hz-gb-2312 HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)
231+
// 54936 GB18030 Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)
232+
// 57002 x-iscii-de ISCII Devanagari
233+
// 57003 x-iscii-be ISCII Bangla
234+
// 57004 x-iscii-ta ISCII Tamil
235+
// 57005 x-iscii-te ISCII Telugu
236+
// 57006 x-iscii-as ISCII Assamese
237+
// 57007 x-iscii-or ISCII Odia
238+
// 57008 x-iscii-ka ISCII Kannada
239+
// 57009 x-iscii-ma ISCII Malayalam
240+
// 57010 x-iscii-gu ISCII Gujarati
241+
// 57011 x-iscii-pa ISCII Punjabi
242+
LOCALEID_MAPPING.put("65000", null); // UTF-7 - not a supported Java encoding, see: http://stackoverflow.com/questions/19861987/java-io-unsupportedencodingexception-unicode-1-1-utf-7
243+
LOCALEID_MAPPING.put("65001", "UTF-8"); // UTF-8
102244
}
103245
}

0 commit comments

Comments
 (0)