Commit 6152c89

in_tail: tests: runtime: Add test cases for encoding conversions from non UTF-16 encodings
Signed-off-by: Hiroshi Hatake <[email protected]>
1 parent 81d6640 commit 6152c89

27 files changed, +617 -0 lines changed
Lines changed: 199 additions & 0 deletions
@@ -0,0 +1,199 @@
# -*- coding: utf-8 -*-

import os

# This script generates a set of log files for testing various character encodings.
# Each file contains a curated list of common, neutral words appropriate for the
# target language and encoding.
#
# The word lists specifically exclude:
# - Religious terminology
# - Names of capital cities
#
# To use this script:
# 1. Save it as a Python file (e.g., `generate_files.py`).
# 2. Run it from your terminal: `python generate_files.py`
# 3. The script writes one `generic_enc_<name>.log` file per encoding into the
#    `log/` subdirectory next to the script (the directory must already exist).

# Dictionary of encodings and their corresponding test data.
# The keys are the encoding names (and will be used in the filenames).
# The values are lists of strings to be written to the files.
ENCODING_DATA = {
    # --- East Asian Encodings ---
    "sjis": [
        "こんにちは",  # Hello
        "ありがとう",  # Thank you
        "さようなら",  # Goodbye
        "日本",        # Japan
        "猫",          # Cat
        "犬",          # Dog
        "食べる",      # To eat
        "飲む",        # To drink
        "空",          # Sky
        "海",          # Sea
        "月",          # Moon
        "花",          # Flower
    ],
    "big5": [
        "你好",    # Hello
        "謝謝",    # Thank you
        "再見",    # Goodbye
        "貓",      # Cat
        "狗",      # Dog
        "吃",      # To eat
        "喝",      # To drink
        "天",      # Sky
        "海",      # Sea
        "月亮",    # Moon
        "花卉",    # Flower
    ],
    "gbk": [
        "你好",    # Hello
        "谢谢",    # Thank you
        "再见",    # Goodbye
        "中国",    # China
        "猫",      # Cat
        "狗",      # Dog
        "吃",      # To eat
        "喝",      # To drink
        "天",      # Sky
        "海",      # Sea
        "月亮",    # Moon
        "花",      # Flower
    ],
    "gb18030": [  # Superset of GBK, can include the same + more
        "你好", "谢谢", "再见", "中国", "猫", "狗", "吃", "喝", "天", "海",
        "欧元符号€",  # Euro symbol to test expanded range
        "龘", "龍",   # Complex characters
    ],
    "euc-kr": [  # Often used for Korean; UHC is a Microsoft equivalent
        "안녕하세요",     # Hello
        "감사합니다",     # Thank you
        "안녕히 가세요",  # Goodbye
        "한국",           # Korea
        "고양이",         # Cat
        "개",             # Dog
        "먹다",           # To eat
        "마시다",         # To drink
        "하늘",           # Sky
        "바다",           # Sea
        "달",             # Moon
        "꽃",             # Flower
    ],

    # --- Windows Codepage Encodings ---
    "cp866": [  # Cyrillic (DOS)
        "Привет",       # Hello
        "Спасибо",      # Thank you
        "До свидания",  # Goodbye
        "Компьютер",    # Computer
        "Информация",   # Information
        "Программа",    # Program
        "Файл",         # File
    ],
    "cp874": [  # Thai
        "สวัสดี",     # Hello
        "ขอบคุณ",    # Thank you
        "ลาก่อน",     # Goodbye
        "ภาษาไทย",   # Thai language
        "แมว",       # Cat
        "สุนัข",       # Dog
        "กิน",        # Eat
        "ดื่ม",        # Drink
    ],
    "cp1250": [  # Central European (Polish, Czech, etc.)
        "Cześć", "Dziękuję",       # Polish
        "Ahoj", "Děkuji",          # Czech
        "Žluťoučký kůň",           # Czech phrase with diacritics
        "Gęślą jaźń",              # Polish phrase with diacritics
        "Árvíztűrő tükörfúrógép",  # Hungarian
    ],
    "cp1251": [  # Cyrillic (Windows)
        "Привет", "Спасибо", "До свидания",
        "Кошка", "Собака", "Небо", "Море",
        "Български език",   # Bulgarian
        "Українська мова",  # Ukrainian
        "Беларуская мова",  # Belarusian
    ],
    "cp1252": [  # Western European
        "Hello", "Thank you", "Goodbye",      # English
        "Bonjour", "Merci", "Au revoir",      # French
        "Hallo", "Danke", "Auf Wiedersehen",  # German
        "Hola", "Gracias", "Adiós",           # Spanish
        "Crème brûlée", "Piñata", "Fjord",
    ],
    "cp1253": [  # Greek
        "Γειά σου",   # Hello
        "Ευχαριστώ",  # Thank you
        "Αντίο",      # Goodbye
        "Ελληνικά",   # Greek
        "Γάτα",       # Cat
        "Σκύλος",     # Dog
        "Ουρανός",    # Sky
        "Θάλασσα",    # Sea
    ],
    "cp1254": [  # Turkish
        "Merhaba", "Teşekkür ederim", "Hoşça kal",
        "Türkiye", "Kedi", "Köpek",
        "Yemek", "İçmek", "Gök", "Deniz",
        "Öğrenci", "Işık", "Ağaç",  # Words with specific Turkish chars
    ],
    "cp1255": [  # Hebrew
        "שלום",      # Hello/Peace
        "תודה",      # Thank you
        "להתראות",   # Goodbye
        "עברית",     # Hebrew
        "חתול",      # Cat
        "כלב",       # Dog
        "שמיים",     # Sky
        "ים",        # Sea
    ],
    "cp1256": [  # Arabic
        "مرحبا",       # Hello
        "شكرا",        # Thank you
        "مع السلامة",  # Goodbye
        "العربية",     # Arabic
        "قط",          # Cat
        "كلب",         # Dog
        "سماء",        # Sky
        "بحر",         # Sea
    ],
}

def generate_files():
    """
    Iterates through the ENCODING_DATA dictionary and creates a file for each entry.
    """
    # Get the directory where the script is running to save files there.
    output_dir = os.path.dirname(os.path.abspath(__file__))
    print(f"Files will be generated in: {output_dir}\n")

    for encoding, content_list in ENCODING_DATA.items():
        # Sanitize the encoding name for use in the filename, replacing "cp" with
        # "win" for clarity. UHC is treated as an alias for euc-kr in this context.
        if encoding.startswith("cp"):
            filename_prefix = encoding.replace("cp", "win")
        elif encoding == "euc-kr":
            filename_prefix = "uhc"
        else:
            filename_prefix = encoding

        # Files are written into the `log/` subdirectory, which must already exist.
        file_path = os.path.join(output_dir, "log", f"generic_enc_{filename_prefix}.log")

        try:
            # Open the file with the specified encoding
            with open(file_path, 'w', encoding=encoding) as f:
                # Join the list of words with newline characters
                f.write('\n'.join(content_list))
                f.write('\n')
            print(f"Successfully created: {os.path.basename(file_path)} (Encoding: {encoding})")

        except UnicodeEncodeError as e:
            print(f"Error: Could not encode content for '{encoding}'.")
            print(f"  - File not created: {os.path.basename(file_path)}")
            print(f"  - Details: {e}")
        except Exception as e:
            print(f"An unexpected error occurred for '{encoding}': {e}")

if __name__ == "__main__":
    generate_files()
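Each fixture committed below is the script's output for one encoding: the word list above written in that charset, so the file's bytes are not valid UTF-8 and the diff viewer shows mojibake. As a rough illustration of the conversion these fixtures are meant to exercise, the following standalone Python sketch reads a fixture's raw bytes and re-encodes them as UTF-8. The path and the prefix-to-codec mapping are assumptions made for this example; the actual runtime tests live in the C test suite, not in this script.

import sys

# Assumed mapping from the fixture name prefixes used above to Python codec names.
CODECS = {
    "sjis": "shift_jis",
    "big5": "big5",
    "gbk": "gbk",
    "gb18030": "gb18030",
    "uhc": "cp949",      # UHC (extended EUC-KR)
    "win1250": "cp1250",
    "win1251": "cp1251",
    "win1252": "cp1252",
    "win1253": "cp1253",
}

def to_utf8(fixture_path, codec):
    """Decode a fixture's raw bytes with its source codec and return UTF-8 bytes."""
    with open(fixture_path, "rb") as f:
        raw = f.read()
    return raw.decode(codec).encode("utf-8")

if __name__ == "__main__":
    # Example (illustrative path): python convert_fixture.py log/generic_enc_sjis.log sjis
    path, prefix = sys.argv[1], sys.argv[2]
    sys.stdout.buffer.write(to_utf8(path, CODECS[prefix]))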
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
(New Big5-encoded fixture, 11 lines, likely generic_enc_big5.log produced by the generator above. The Big5 bytes are not valid UTF-8, so the diff view renders them as mojibake.)
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
(New GB18030-encoded fixture, 13 lines, likely generic_enc_gb18030.log, containing the GB18030 word list including the Euro-sign and complex-character entries; the raw bytes render as mojibake in the UTF-8 diff view.)
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
(New GBK-encoded fixture, 12 lines, likely generic_enc_gbk.log, containing the GBK word list from the generator; the raw bytes render as mojibake.)
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
(New Shift_JIS-encoded fixture, 12 lines, likely generic_enc_sjis.log, containing the Japanese word list; the raw bytes render as mojibake.)
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
(New UHC/EUC-KR-encoded fixture, 12 lines, likely generic_enc_uhc.log, containing the Korean word list; the raw bytes render as mojibake.)
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
(New cp1250-encoded fixture, 7 lines, likely generic_enc_win1250.log, containing the Central European word list; the accented bytes render as mojibake.)
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
(New cp1251-encoded fixture, 10 lines, likely generic_enc_win1251.log, containing the Cyrillic word list; the raw bytes render as mojibake.)
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
(New cp1252-encoded fixture, 15 lines, likely generic_enc_win1252.log; accented characters are cp1252 bytes and render as replacement marks in the UTF-8 diff view.)
Hello
Thank you
Goodbye
Bonjour
Merci
Au revoir
Hallo
Danke
Auf Wiedersehen
Hola
Gracias
Adi�s
Cr�me br�l�e
Pi�ata
Fjord
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
(New cp1253-encoded fixture, 8 lines, likely generic_enc_win1253.log, containing the Greek word list; the raw bytes render as mojibake.)
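Because every fixture must decode cleanly with its intended codec for the runtime tests to be meaningful, a quick local sanity check is to decode each file strictly and confirm the bytes round-trip. This is a small illustrative helper, not part of the commit; the filename-to-codec mapping below mirrors the generator script and is an assumption made for the example.

import os

# Assumed mapping from generated filename to Python codec (mirrors the generator script).
FIXTURES = {
    "generic_enc_sjis.log": "shift_jis",
    "generic_enc_big5.log": "big5",
    "generic_enc_gbk.log": "gbk",
    "generic_enc_gb18030.log": "gb18030",
    "generic_enc_uhc.log": "cp949",
    "generic_enc_win1250.log": "cp1250",
    "generic_enc_win1251.log": "cp1251",
    "generic_enc_win1252.log": "cp1252",
    "generic_enc_win1253.log": "cp1253",
}

def check_fixture(path, codec):
    """Return True if the file decodes strictly with `codec` and re-encodes to the same bytes."""
    with open(path, "rb") as f:
        raw = f.read()
    try:
        text = raw.decode(codec)  # strict decoding fails on any stray byte
    except UnicodeDecodeError:
        return False
    return text.encode(codec) == raw  # round-trip must reproduce the original bytes

if __name__ == "__main__":
    for name, codec in FIXTURES.items():
        path = os.path.join("log", name)  # assumed layout: fixtures live in a log/ subdirectory
        if os.path.exists(path):
            print(f"{name} ({codec}): {'OK' if check_fixture(path, codec) else 'MISMATCH'}")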
