Skip to content

Commit 0a9de23

Browse files
committed
add setup
1 parent 113e128 commit 0a9de23

File tree

4 files changed

+42
-59
lines changed

4 files changed

+42
-59
lines changed

LangScriptID.py renamed to LangScriptID/LangScriptID.py

Lines changed: 4 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,9 @@
1111
Original code repository: https://github.com/facebookresearch/stopes/blob/main/stopes/pipelines/monolingual/utils/predict_script.py
1212
"""
1313

14-
import re
1514
import string
1615
import typing as tp
1716
from collections import Counter, defaultdict
18-
from pathlib import Path
1917

2018

2119
SCRIPT_RANGES = {
@@ -185,26 +183,6 @@
185183
}
186184

187185

188-
def get_script_map(language_script_file: Path) -> tp.Dict[str, str]:
189-
"""Returns a dict mapping a lang to its expected script in a single read run"""
190-
lang_map: tp.Dict[str, str] = defaultdict(str)
191-
with language_script_file.open("r", encoding="utf-8") as ls:
192-
for row in ls:
193-
columns = row.split("\t")
194-
lang_map[columns[0]] = columns[1]
195-
return lang_map
196-
197-
198-
def find_lang_script(lang: str, language_script_file: Path) -> tp.Optional[str]:
199-
"""Returns the expected script for a single lang"""
200-
with language_script_file.open("r", encoding="utf-8") as ls:
201-
for row in ls:
202-
if row.startswith(lang):
203-
columns = row.split("\t")
204-
return columns[1]
205-
return None
206-
207-
208186
ScoredScript = tp.Tuple[tp.Optional[str], float]
209187

210188

@@ -224,48 +202,19 @@ def get_script_predictor() -> tp.Callable[[str], ScoredScript]:
224202
for c in string.whitespace + string.punctuation + string.digits
225203
}
226204

227-
def predict_script_org(sent: str) -> ScoredScript:
228-
sent = sent.translate(replacement_map)
229-
230-
char_counts = Counter(sent).most_common()
231-
232-
script_count: tp.Dict[str, int] = defaultdict(int)
233-
total = 0
234-
235-
for char, count in char_counts:
236-
ordinal = ord(char)
237-
for script_name in hist_map.get(ordinal, []):
238-
total += count
239-
script_count[script_name] += count
240-
241-
max_score = 0.0
242-
max_script = None
243-
for script, count in script_count.items():
244-
score = abs(count / total)
245-
if score > max_score:
246-
max_score = score
247-
max_script = script
248-
249-
if len(script_count) > 1 and max_score == (1 / len(script_count)):
250-
return (None, 0)
251-
252-
return (max_script, max_score)
253-
254-
255205
def predict_script(sent: str) -> ScoredScript:
256206
sent = sent.translate(replacement_map)
257207

258208
char_counts = Counter(sent)
259209
script_count: tp.Dict[str, int] = defaultdict(int)
260-
total = 0
261210

262211
for char, count in char_counts.items():
263212
ordinal = ord(char)
264213
for script_name in hist_map.get(ordinal, []):
265214
script_count[script_name] += count
266215

267216

268-
# sort script_count
217+
# sort script_count alphabetically
269218
script_count = dict(sorted(script_count.items()))
270219

271220
max_score = 0.0
@@ -277,16 +226,17 @@ def predict_script(sent: str) -> ScoredScript:
277226
max_script = script
278227

279228

280-
# Report all the scores
229+
# sort all the scores
281230
sorted_scores = {script: abs(count / len(sent)) for script, count in script_count.items()}
282231
sorted_scores = dict(sorted(sorted_scores.items(), key=lambda item: item[1], reverse=True))
283232

284233
if len(sorted_scores) > 1:
285234
second_score = list(sorted_scores.values())[1]
286235
interval = max_score - second_score
287236
tie = True if interval == 0 else False
288-
289237
return (max_script, max_score, {'details': sorted_scores, 'tie': tie, 'interval': interval})
238+
elif max_score == 0:
239+
return (None, 0, {'details': None, 'tie': None, 'interval': None})
290240
else:
291241
return (max_script, max_score, {'details': sorted_scores, 'tie': False, 'interval': 1})
292242

LangScriptID/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .LangScriptID import get_script_predictor
2+
3+
__version__ = '0.1'

README.md

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,17 @@
11
# LangScriptID
2-
Detect the script of text based on ISO 15924
2+
Detect the script of text based on ISO 15924.
3+
- The codes were sourced from [Wikipedia ISO_15924](https://en.wikipedia.org/wiki/ISO_15924).
4+
- Unicode ranges were extracted from the [Unicode Character Database](https://www.unicode.org/Public/15.0.0/ucd/Scripts.txt).
35

4-
# Usage
56

6-
```python
7-
# Download https://raw.githubusercontent.com/kargaranamir/LangScript/main/LangScriptID.py
7+
## Install
8+
```bash
9+
pip3 install LangScriptID@git+https://github.com/kargaranamir/LangScriptID
10+
```
811

12+
## Usage
13+
14+
```python
915
from LangScriptID import get_script_predictor
1016
sp = get_script_predictor()
1117
```
@@ -62,5 +68,5 @@ If you use any part of this library in your research, please cite it using the f
6268
- [Unicode Subset Bitfields - Microsoft](https://learn.microsoft.com/en-us/windows/win32/intl/unicode-subset-bitfields)
6369
- [Stopes - FAIR NLLB FB](https://github.com/facebookresearch/stopes/blob/main/stopes/pipelines/monolingual/utils/predict_script.py)
6470
- [Gradient Boosting on Decision Trees - catboost](https://github.com/catboost/catboost/blob/master/contrib/python/fonttools/fontTools/unicodedata/Blocks.py)
65-
- [Blender](https://github.com/blender/blender/blob/main/source/blender/blenfont/intern/blf_glyph.c)
71+
- [Blender](https://github.com/blender/blender/blob/main/source/blender/blenfont/intern/blf_glyph.cc)
6672
- [Unicode Wikipedia](https://en.wikipedia.org/wiki/Unicode_block)

setup.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from setuptools import setup, find_packages
2+
3+
with open("README.md", "r") as fh:
4+
long_description = fh.read()
5+
6+
setup(
7+
name="LangScriptID",
8+
version="0.1",
9+
author="Amir Hossein Kargaran",
10+
author_email="[email protected]",
11+
description="A package for detecting the script and language of given texts.",
12+
long_description=long_description,
13+
long_description_content_type="text/markdown",
14+
url="https://github.com/kargaranamir/LangScriptID",
15+
packages=find_packages(),
16+
classifiers=[
17+
"License :: OSI Approved :: MIT License",
18+
"Programming Language :: Python :: 3",
19+
"Programming Language :: Python :: 3.6",
20+
"Programming Language :: Python :: 3.7",
21+
"Programming Language :: Python :: 3.8",
22+
"Programming Language :: Python :: 3.9",
23+
],
24+
)

0 commit comments

Comments
 (0)