add setup

kargaranamir · kargaranamir · commit 0a9de238983d · 2023-09-12T13:35:50.000+02:00
diff --git a/LangScriptID/LangScriptID.py b/LangScriptID/LangScriptID.py
@@ -11,11 +11,9 @@
 Original code repository: https://github.com/facebookresearch/stopes/blob/main/stopes/pipelines/monolingual/utils/predict_script.py
 """
 
-import re
 import string
 import typing as tp
 from collections import Counter, defaultdict
-from pathlib import Path
 
 
 SCRIPT_RANGES = {
@@ -185,26 +183,6 @@
 }
 
 
-def get_script_map(language_script_file: Path) -> tp.Dict[str, str]:
-    """Returns a dict mapping a lang to its expected script in a single read run"""
-    lang_map: tp.Dict[str, str] = defaultdict(str)
-    with language_script_file.open("r", encoding="utf-8") as ls:
-        for row in ls:
-            columns = row.split("\t")
-            lang_map[columns[0]] = columns[1]
-    return lang_map
-
-
-def find_lang_script(lang: str, language_script_file: Path) -> tp.Optional[str]:
-    """Returns the expected script for a single lang"""
-    with language_script_file.open("r", encoding="utf-8") as ls:
-        for row in ls:
-            if row.startswith(lang):
-                columns = row.split("\t")
-                return columns[1]
-        return None
-
-
 ScoredScript = tp.Tuple[tp.Optional[str], float]
 
 
@@ -224,48 +202,19 @@ def get_script_predictor() -> tp.Callable[[str], ScoredScript]:
         for c in string.whitespace + string.punctuation + string.digits
     }
 
-    def predict_script_org(sent: str) -> ScoredScript:
-        sent = sent.translate(replacement_map)
-
-        char_counts = Counter(sent).most_common()
-
-        script_count: tp.Dict[str, int] = defaultdict(int)
-        total = 0
-
-        for char, count in char_counts:
-            ordinal = ord(char)
-            for script_name in hist_map.get(ordinal, []):
-                total += count
-                script_count[script_name] += count
-
-        max_score = 0.0
-        max_script = None
-        for script, count in script_count.items():
-            score = abs(count / total)
-            if score > max_score:
-                max_score = score
-                max_script = script
-
-        if len(script_count) > 1 and max_score == (1 / len(script_count)):
-            return (None, 0)
-
-        return (max_script, max_score)
-
-
     def predict_script(sent: str) -> ScoredScript:
         sent = sent.translate(replacement_map)
 
         char_counts = Counter(sent)
         script_count: tp.Dict[str, int] = defaultdict(int)
-        total = 0
 
         for char, count in char_counts.items():
             ordinal = ord(char)
             for script_name in hist_map.get(ordinal, []):
                 script_count[script_name] += count
 
 
-        # sort script_count
+        # sort script_count alphabetically
         script_count = dict(sorted(script_count.items()))
 
         max_score = 0.0
@@ -277,16 +226,17 @@ def predict_script(sent: str) -> ScoredScript:
                 max_script = script
 
 
-        # Report all the scores
+        # sort all the scores
         sorted_scores = {script: abs(count / len(sent)) for script, count in script_count.items()}
         sorted_scores = dict(sorted(sorted_scores.items(), key=lambda item: item[1], reverse=True))
         
         if len(sorted_scores) > 1:
             second_score = list(sorted_scores.values())[1]
             interval = max_score - second_score
             tie = True if interval == 0 else False
-
             return (max_script, max_score, {'details': sorted_scores, 'tie': tie, 'interval': interval})
+        elif max_score == 0:
+            return (None, 0, {'details': None, 'tie': None, 'interval': None})
         else:
             return (max_script, max_score, {'details': sorted_scores, 'tie': False, 'interval': 1})
 
diff --git a/LangScriptID/__init__.py b/LangScriptID/__init__.py
@@ -0,0 +1,3 @@
+from .LangScriptID import get_script_predictor
+
+__version__ = '0.1'
diff --git a/README.md b/README.md
@@ -1,11 +1,17 @@
 # LangScriptID
-Detect the script of text based on ISO 15924
+Detect the script of text based on ISO 15924.
+- The script codes were sourced from [Wikipedia ISO_15924](https://en.wikipedia.org/wiki/ISO_15924).
+- Unicode ranges were extracted from [Unicode Character Database](https://www.unicode.org/Public/15.0.0/ucd/Scripts.txt).
 
-# Usage
 
-```python
-# Download https://raw.githubusercontent.com/kargaranamir/LangScript/main/LangScriptID.py
+## Install
+```bash
+pip3 install LangScriptID@git+https://github.com/kargaranamir/LangScriptID
+```
 
+## Usage
+
+```python
 from LangScriptID import get_script_predictor
 sp = get_script_predictor()
 ```
@@ -62,5 +68,5 @@ If you use any part of this library in your research, please cite it using the f
 - [Unicode Subset Bitfields - Microsoft](https://learn.microsoft.com/en-us/windows/win32/intl/unicode-subset-bitfields)
 - [Stopes - FAIR NLLB FB](https://github.com/facebookresearch/stopes/blob/main/stopes/pipelines/monolingual/utils/predict_script.py)
 - [Gradient Boosting on Decision Trees - catboost](https://github.com/catboost/catboost/blob/master/contrib/python/fonttools/fontTools/unicodedata/Blocks.py)
-- [Blender](https://github.com/blender/blender/blob/main/source/blender/blenfont/intern/blf_glyph.c)
+- [Blender](https://github.com/blender/blender/blob/main/source/blender/blenfont/intern/blf_glyph.cc)
 - [Unicode Wikipedia](https://en.wikipedia.org/wiki/Unicode_block)
diff --git a/setup.py b/setup.py
@@ -0,0 +1,24 @@
+from setuptools import setup, find_packages
+
+with open("README.md", "r") as fh:
+    long_description = fh.read()
+
+setup(
+    name="LangScriptID",
+    version="0.1",
+    author="Amir Hossein Kargaran",
+    author_email="kargaranamir@email.com",
+    description="A package for detecting the script and language of given texts.",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/kargaranamir/LangScriptID",
+    packages=find_packages(),
+    classifiers=[
+        "License :: OSI Approved :: MIT License",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+    ],
+)

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from .LangScriptID import get_script_predictor`
	`2`	`+`
	`3`	`+__version__ = '0.1'`