# -*- coding: utf-8 -*-

import os

# This script generates a set of text files for testing various character encodings.
# Each file contains a curated list of common, neutral words appropriate for the
# target language and encoding.
#
# The word lists specifically exclude:
# - Religious terminology
# - Names of capital cities
#
# To use this script:
# 1. Save it as a Python file (e.g., `generate_files.py`).
# 2. Run it from your terminal: `python generate_files.py`
# 3. The script writes one `generic_enc_<name>.log` file per encoding into a
#    `log/` subdirectory next to the script.
# Test corpus keyed by codec name. Each value is the ordered list of sample
# words written to that codec's output file; the codec name is also used to
# derive the output filename.
ENCODING_DATA = {
    # --- East Asian encodings ---
    # Japanese (Shift_JIS): greetings, country, animals, verbs, nature words.
    "sjis": [
        "こんにちは", "ありがとう", "さようなら", "日本",
        "猫", "犬", "食べる", "飲む",
        "空", "海", "月", "花",
    ],
    # Traditional Chinese (Big5): greetings, animals, verbs, nature words.
    "big5": [
        "你好", "謝謝", "再見",
        "貓", "狗", "吃", "喝",
        "天", "海", "月亮", "花卉",
    ],
    # Simplified Chinese (GBK): greetings, country, animals, verbs, nature.
    "gbk": [
        "你好", "谢谢", "再见", "中国",
        "猫", "狗", "吃", "喝",
        "天", "海", "月亮", "花",
    ],
    # GB18030 is a superset of GBK: common words plus characters that
    # exercise the extended range.
    "gb18030": [
        "你好", "谢谢", "再见", "中国", "猫", "狗", "吃", "喝", "天", "海",
        "欧元符号€",  # Euro sign tests the expanded repertoire
        "龘", "龍",   # visually complex characters
    ],
    # Korean (EUC-KR; UHC is the Microsoft extension of this encoding).
    "euc-kr": [
        "안녕하세요", "감사합니다", "안녕히 가세요", "한국",
        "고양이", "개", "먹다", "마시다",
        "하늘", "바다", "달", "꽃",
    ],

    # --- Windows / DOS codepage encodings ---
    # Cyrillic, DOS flavour (computing vocabulary).
    "cp866": [
        "Привет", "Спасибо", "До свидания",
        "Компьютер", "Информация", "Программа", "Файл",
    ],
    # Thai.
    "cp874": [
        "สวัสดี", "ขอบคุณ", "ลาก่อน", "ภาษาไทย",
        "แมว", "สุนัข", "กิน", "ดื่ม",
    ],
    # Central European: Polish, Czech, and Hungarian diacritic stress tests.
    "cp1250": [
        "Cześć", "Dziękuję",
        "Ahoj", "Děkuji",
        "Žluťoučký kůň",
        "Gęślą jaźń",
        "Árvíztűrő tükörfúrógép",
    ],
    # Cyrillic, Windows flavour, plus several language names.
    "cp1251": [
        "Привет", "Спасибо", "До свидания",
        "Кошка", "Собака", "Небо", "Море",
        "Български език",
        "Українська мова",
        "Беларуская мова",
    ],
    # Western European: English, French, German, Spanish samples.
    "cp1252": [
        "Hello", "Thank you", "Goodbye",
        "Bonjour", "Merci", "Au revoir",
        "Hallo", "Danke", "Auf Wiedersehen",
        "Hola", "Gracias", "Adiós",
        "Crème brûlée", "Piñata", "Fjord",
    ],
    # Greek.
    "cp1253": [
        "Γειά σου", "Ευχαριστώ", "Αντίο", "Ελληνικά",
        "Γάτα", "Σκύλος", "Ουρανός", "Θάλασσα",
    ],
    # Turkish, including dotted/dotless I and other Turkish-specific letters.
    "cp1254": [
        "Merhaba", "Teşekkür ederim", "Hoşça kal",
        "Türkiye", "Kedi", "Köpek",
        "Yemek", "İçmek", "Gök", "Deniz",
        "Öğrenci", "Işık", "Ağaç",
    ],
    # Hebrew.
    "cp1255": [
        "שלום", "תודה", "להתראות", "עברית",
        "חתול", "כלב", "שמיים", "ים",
    ],
    # Arabic.
    "cp1256": [
        "مرحبا", "شكرا", "مع السلامة", "العربية",
        "قط", "كلب", "سماء", "بحر",
    ],
}
| 162 | + |
def _filename_prefix(encoding):
    """Map a codec name to the label used in the output filename.

    Windows codepages ("cpNNNN") are labelled "winNNNN" for clarity, and
    "euc-kr" is labelled "uhc" (UHC is Microsoft's superset of EUC-KR).
    """
    if encoding.startswith("cp"):
        # Replace only the leading "cp"; encoding.replace("cp", "win") would
        # also rewrite any later occurrence of "cp" in the name.
        return "win" + encoding[2:]
    if encoding == "euc-kr":
        return "uhc"
    return encoding


def generate_files(encoding_data=None, output_dir=None):
    """Create one sample log file per encoding.

    For each entry in *encoding_data*, writes ``log/generic_enc_<prefix>.log``
    under *output_dir* using the corresponding codec, with the sample words
    one per line. Encodings whose sample words cannot be represented are
    reported and skipped; the remaining files are still written.

    Args:
        encoding_data: Mapping of codec name -> list of sample strings.
            Defaults to the module-level ENCODING_DATA table.
        output_dir: Base directory in which the ``log/`` subdirectory is
            created. Defaults to the directory containing this script
            (the original behavior).
    """
    if encoding_data is None:
        encoding_data = ENCODING_DATA
    if output_dir is None:
        # Save files next to this script by default.
        output_dir = os.path.dirname(os.path.abspath(__file__))
    print(f"Files will be generated in: {output_dir}\n")

    # Bug fix: the "log" subdirectory was never created, so every open()
    # below failed with FileNotFoundError on a fresh checkout.
    log_dir = os.path.join(output_dir, "log")
    os.makedirs(log_dir, exist_ok=True)

    for encoding, content_list in encoding_data.items():
        file_path = os.path.join(log_dir, f"generic_enc_{_filename_prefix(encoding)}.log")

        try:
            # Open with the target codec so the words are written in that
            # encoding; out-of-repertoire characters raise UnicodeEncodeError.
            with open(file_path, 'w', encoding=encoding) as f:
                f.write('\n'.join(content_list))
                f.write('\n')
            print(f"Successfully created: {os.path.basename(file_path)} (Encoding: {encoding})")

        except UnicodeEncodeError as e:
            # Report the failure and continue with the remaining encodings.
            print(f"Error: Could not encode content for '{encoding}'.")
            print(f" - File not created: {os.path.basename(file_path)}")
            print(f" - Details: {e}")
        except Exception as e:
            print(f"An unexpected error occurred for '{encoding}': {e}")
| 197 | + |
# Allow importing this module without side effects; generate the files only
# when the script is run directly.
if __name__ == "__main__":
    generate_files()
0 commit comments