Updated README

andreihar · andreihar · commit f0532a0001d2 · 2024-05-14T02:12:41.000-07:00
diff --git a/README.md b/README.md
@@ -58,7 +58,12 @@ Includes word tokeniser for Taiwanese Hokkien.
             <li><a href="#convert-non-cjk">Convert non-CJK</a></li>
           </ul>
         </li>
-        <li><a href="#tokeniser">Tokeniser</a></li>
+        <li>
+          <a href="#tokeniser">Tokeniser</a>
+          <ul>
+            <li><a href="#keep-original">Keep original</a></li>
+          </ul>
+        </li>
         <li><a href="#other-functions">Other Functions</a></li>
       </ul>
     </li>
@@ -135,7 +140,7 @@ c.get(input)
 
 `format` String - format in which tones will be represented in the converted sentence.
 
-* `mark` (default) - uses diacritics for each syllable. Not available for TLPA.
+* `mark` (default) - uses diacritics for each syllable. Not available for TLPA
 * `number` - adds a number which represents the tone at the end of the syllable
 * `strip` - removes any tone marking
 
@@ -173,9 +178,9 @@ Default value depends on the chosen `system`:
 * `auto` - for `Tongiong`
 * `none` - for `Tailo`, `POJ`, `Zhuyin`, `TLPA`, `Pingyim`, `IPA`
 
-| text             | none                      | auto                       | exc_last                  | incl_last                 |
-| ---------------- | ------------------------- | -------------------------- | ------------------------- | ------------------------- |
-| 這是你的手機仔無 | Tse sī lí ê tshiú-ki-á bô | Tse sì li ē tshiu-kī-á bô? | Tsē sì li ē tshiu-kī-a bô | Tsē sì li ē tshiu-kī-a bō |
+| text             | none                    | auto                   | exc_last               | incl_last              |
+| ---------------- | ----------------------- | ---------------------- | ---------------------- | ---------------------- |
+| 這是你的茶桌仔無 | Tse sī lí ê tê-toh-á bô | Tse sì li ē tē-to-á bô | Tsē sì li ē tē-tó-a bô | Tsē sì li ē tē-tó-a bō |
 
 Sandhi rules also change depending on the dialect chosen.
 
@@ -187,8 +192,8 @@ Sandhi rules also change depending on the dialect chosen.
 
 `punctuation` String
 
-* `format` (default) - converts Chinese-style punctuation to Latin-style punctuation and capitalises words at the beginning of each sentence.
-* `none` - preserves Chinese-style punctuation and doesn't capitalise words at the beginning of new sentences.
+* `format` (default) - converts Chinese-style punctuation to Latin-style punctuation and capitalises words at the beginning of each sentence
+* `none` - preserves Chinese-style punctuation and doesn't capitalise words at the beginning of new sentences
 
 | text                                                                           | format                                                                                            | none                                                                                                 |
 | ------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------- |
@@ -211,24 +216,38 @@ Sandhi rules also change depending on the dialect chosen.
 
 ```python
 # Constructor
-t = Tokeniser()
+t = Tokeniser(keep_original)
 
 # Tokenise Taiwanese Hokkien sentence
 t.tokenise(input)
 ```
 
+#### Keep original
+
+`keep_original` Boolean - defines whether the original characters of the input are retained.
+
+* `True` (default) - preserve original characters
+* `False` - replace original characters with characters defined in the dataset
+
+| text         | True                 | False                |
+| ------------ | -------------------- | -------------------- |
+| 臺灣火鸡肉饭 | ['臺灣', '火鸡肉饭'] | ['台灣', '火雞肉飯'] |
+
 ### Other Functions
 
 Handy functions for NLP tasks in Taiwanese Hokkien.
 
+`to_traditional` function converts input to the Traditional Chinese characters that are used in the dataset. It also accounts for different variants of Traditional Chinese characters.
+
+`to_simplified` function converts input to Simplified Chinese characters.
+
+`is_cjk` function checks whether the input string consists entirely of Chinese characters.
+
 ```python
-# Convert to Traditional
 to_traditional(input)
 
-# Convert to Simplified
 to_simplified(input)
 
-# Check if the string is fully composed of Chinese characters
 is_cjk(input)
 ```
 
@@ -283,20 +302,20 @@ c.get("先生講，學生恬恬聽。")
 
 ## Sandhi
 c = Converter() # for Tailo, sandhi none by default
-c.get("這是台灣囡仔")
->> Tse sī Tâi-uân gín-á
+c.get("這是你的茶桌仔無")
+>> Tse sī lí ê tê-toh-á bô
 
 c = Converter(sandhi='auto')
-c.get("這是台灣囡仔")
->> Tse sì Tāi-uān gin-á
+c.get("這是你的茶桌仔無")
+>> Tse sì li ē tē-to-á bô
 
 c = Converter(sandhi='exc_last')
-c.get("這是台灣囡仔")
->> Tsē sì Tāi-uān gin-á
+c.get("這是你的茶桌仔無")
+>> Tsē sì li ē tē-tó-a bô
 
 c = Converter(sandhi='incl_last')
-c.get("這是台灣囡仔")
->> Tsē sì Tāi-uān gin-a
+c.get("這是你的茶桌仔無")
+>> Tsē sì li ē tē-tó-a bō
 
 ## Punctuation
 c = Converter() # format punctuation default
@@ -308,11 +327,11 @@ c.get("太空朋友，恁好！恁食飽未？")
 >> thài-khong pîng-iú，lín-hó！lín tsia̍h-pá buē？
 
 ## Convert non-CJK
-c = Convert(system='Zhuyin') # False convert_non_cjk default
+c = Converter(system='Zhuyin') # False convert_non_cjk default
 c.get("我食pháng")
 >> ㆣㄨㄚˋ ㄐㄧㄚㆷ˙ pháng
 
-c = Convert(system='Zhuyin', convert_non_cjk=True)
+c = Converter(system='Zhuyin', convert_non_cjk=True)
 c.get("我食pháng")
 >> ㆣㄨㄚˋ ㄐㄧㄚㆷ˙ ㄆㄤˋ
 
@@ -324,16 +343,40 @@ t = Tokeniser()
 t.tokenise("太空朋友，恁好！恁食飽未？")
 >> ['太空', '朋友', '，', '恁好', '！', '恁', '食飽', '未', '？']
 
+## Keep Original
+t = Tokeniser() # keep_original is True by default
+t.tokenise("爲啥物臺灣遮爾好？")
+>> ['爲啥物', '臺灣', '遮爾', '好', '？']
+
+t.tokenise("为啥物台湾遮尔好？")
+>> ['为啥物', '台湾', '遮尔', '好', '？']
+
+t = Tokeniser(False)
+t.tokenise("爲啥物臺灣遮爾好？")
+>> ['為啥物', '台灣', '遮爾', '好', '？']
+
+t.tokenise("为啥物台湾遮尔好？")
+>> ['為啥物', '台灣', '遮爾', '好', '？']
+
 
 # Other Functions
 from taibun import to_traditional, to_simplified, is_cjk
 
-to_traditional("我听无台湾话")
->> 我聽無台灣話
+## to_traditional
+to_traditional("我听无台语")
+>> 我聽無台語
+
+to_traditional("我爱这个个人台面")
+>> 我愛這个個人檯面
+
+to_traditional("爲啥物")
+>> 為啥物
 
-to_simplified("我聽無臺灣話")
->> 我听无台湾话
+## to_simplified
+to_simplified("我聽無台語")
+>> 我听无台语
 
+## is_cjk
 is_cjk('我食麭')
 >> True
 
@@ -377,7 +420,7 @@ The data is licensed under [CC BY-SA 4.0][data-cc]
 [licence-badge]: https://img.shields.io/github/license/andreihar/taibun?color=000000&style=for-the-badge
 [licence]: LICENSE
 [linkedin-badge]: https://img.shields.io/badge/LinkedIn-0077b5?style=for-the-badge&logo=linkedin&logoColor=ffffff
-[linkedin]: https://www.linkedin.com/in/andrei-harbachov/
+[linkedin]: https://www.linkedin.com/in/andreihar/
 [js-badge]: https://img.shields.io/badge/JS_Version-f7df1e?style=for-the-badge&logo=javascript&logoColor=000000
 [js-link]: https://github.com/andreihar/taibun.js
 [downloads-badge]: https://img.shields.io/pypi/dm/taibun.svg?style=for-the-badge
diff --git a/setup.cfg b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = taibun
-version = 1.1.1
+version = 1.1.2
 author = Andrei Harbachov
 author_email = andrei.harbachov@gmail.com
 description = Taiwanese Hokkien Transliterator and Tokeniser
diff --git a/taibun/data/words.json b/taibun/data/words.json
@@ -2386,7 +2386,6 @@
     "無論": "bô-lūn",
     "無名": "bô-miâ",
     "無名化": "bô-miâ-huà",
-    "無名先生": "bô-miâ-sin-senn/bô-miâ-sin-sinn",
     "無命": "bô-miā",
     "無暝無日": "bô-mê-bô-ji̍t/bô-mî-bô-li̍t",
     "無我": "bô-ngóo",
@@ -3880,7 +3879,7 @@
     "玩具": "guán-khū",
     "玩樂": "guán-lo̍k",
     "阮兩人": "guán-nn̄g-lâng",
-    "阮先生": "guán-sian-senn/guán-sian-sinn",
+    "阮先生": "guán-sian-sinn",
     "阮太太": "guán-thài-thài",
     "玩者": "guán-tsiá",
     "元": "guân",
@@ -24168,7 +24167,7 @@
     "老步": "lāu-pōo",
     "老步定": "lāu-pōo-tiānn",
     "老步在": "lāu-pōo-tsāi",
-    "老先生": "lāu-sian-senn/lāu-sian-sinn",
+    "老先生": "lāu-sian-sinn",
     "漏洩": "lāu-sia̍p",
     "老身": "lāu-sin",
     "老生": "lāu-sing",
@@ -28634,7 +28633,7 @@
     "壁爐": "piah-lôo",
     "壁邊": "piah-pinn",
     "壁報": "piah-pò",
-    "壁先生": "piah-sian-senn/piah-sian-sinn",
+    "壁先生": "piah-sian-sinn",
     "壁頭": "piah-thâu",
     "壁燈": "piah-ting",
     "壁鐘": "piah-tsing",
@@ -30038,7 +30037,7 @@
     "拜師傅": "pài-sai-hū",
     "拜三": "pài-sann",
     "拜生日": "pài-senn-ji̍t/pài-sinn-li̍t",
-    "拜先生": "pài-sian-senn/pài-sian-sinn",
+    "拜先生": "pài-sian-sinn",
     "拜上帝": "pài-siāng-tè",
     "拜壽": "pài-siū",
     "拜歲蘭": "pài-suè-lân",
@@ -32173,12 +32172,12 @@
     "仙巴掌": "sian-pa-tsiáng",
     "仙拚仙": "sian-piànn-sian",
     "先輩": "sian-puè",
-    "先生": "sian-senn/sian-sinn",
-    "先生公": "sian-senn-kong/sian-sinn-kong",
-    "先生禮": "sian-senn-lé/sian-sinn-lé",
-    "先生媽": "sian-senn-má/sian-sinn-má",
-    "先生娘": "sian-senn-niû/sian-sinn-niû",
-    "先生仔": "sian-senn-á/sian-sinn-á",
+    "先生": "sian-sinn",
+    "先生公": "sian-sinn-kong",
+    "先生禮": "sian-sinn-lé",
+    "先生媽": "sian-sinn-má",
+    "先生娘": "sian-sinn-niû",
+    "先生仔": "sian-sinn-á",
     "仙屎": "sian-sái",
     "仙丹": "sian-tan",
     "仙丹花": "sian-tan-hue",
@@ -53012,7 +53011,7 @@
     "地理": "tē-lí/tuē-lí",
     "地理仙": "tē-lí-sian/tuē-lí-sian",
     "地理先": "tē-lí-sian/tuē-lí-sian",
-    "地理先生": "tē-lí-sian-senn/tuē-lí-sian-sinn",
+    "地理先生": "tē-lí-sian-sinn/tuē-lí-sian-sinn",
     "地理仙仔": "tē-lí-sian-á/tuē-lí-sian-á",
     "地理師": "tē-lí-su/tuē-lí-su",
     "地理師仔": "tē-lí-su-á/tuē-lí-su-á",
@@ -53614,7 +53613,7 @@
     "蝹碖蜷": "un-lún-khûn",
     "溫瓶": "un-pân",
     "溫房": "un-pâng",
-    "蝹先生": "un-sian-senn/un-sian-sinn",
+    "蝹先生": "un-sian-sinn",
     "溫室": "un-sik",
     "溫室效應": "un-sik-hāu-ìng",
     "溫燒": "un-sio",
@@ -54973,8 +54972,8 @@
     "法蘭克福": "Huat-lân-khik-hok",
     "法蘭西": "Huat-lân-se",
     "法西斯": "Huat-se-su",
-    "花蓮": "Hue-liân",
-    "花蓮港": "Hue-liân-káng",
+    "花蓮": "Hua-liân",
+    "花蓮港": "Hua-liân-káng",
     "花霸王": "Hue-pà-ông",
     "花壇": "Hue-tuânn",
     "花壇鄉": "Hue-tuânn-hiong",