Skip to content

Commit 5ddb0b0

Browse files
committed
Merge github.com:Mathics3/mathics-scanner
2 parents 869b668 + 295fe97 commit 5ddb0b0

File tree

4 files changed

+60
-22
lines changed

4 files changed

+60
-22
lines changed

README.rst

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,16 +23,15 @@ This module consists mostly of translation tables between WL and unicode/ascii.
2323
Because of the large size of these tables, it was decided to store them in a
2424
file and read them from disk at runtime (when the module is imported). Our
2525
tests showed that storing the tables as JSON and using
26-
[ujson](https://github.com/ultrajson/ultrajson) to read them is the most
26+
`ujson <https://github.com/ultrajson/ultrajson>`_ to read them is the most
2727
efficient way to access them. However, this is merely an implementation
2828
detail and consumers of this library should not rely on this assumption.
2929

3030
For maintainability and efficiency, we decided to store this data in a
3131
human-readable YAML file (`data/named-characters.yml`) and compile them into
3232
the JSON tables used internally by the library (`data/characters.json`) for
3333
faster access at runtime. The conversion of the data is performed by the
34-
script `admin-tools/compile-translation-tables.py` at each commit to the
35-
`master` branch via GitHub Actions.
34+
script `mathics_scanner/build_tables.py`.
3635

3736

3837
Contributing

mathics_scanner/build_tables.py

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,28 +18,53 @@ def re_from_keys(d: dict) -> str:
1818
sorted(map(re.escape, d.keys()), key=lambda k: (-len(k), k))
1919
)
2020

21+
def get_plain_text(char_name: str, char_data: dict, use_unicode: bool) -> str:
    """
    Return the plain-text representation of a named character.

    Parameters:
        char_name: the name of the character, used to build the fully
            qualified ``\\[Name]`` form.
        char_data: the data about the character; only the optional
            "unicode-equivalent" key is read here.
        use_unicode: whether non-ASCII unicode output is acceptable.

    Returns:
        The "unicode-equivalent" string when it exists and either
        use_unicode is true or every character in it is ASCII; otherwise
        the fully qualified name ``\\[char_name]``.
    """
    uni = char_data.get("unicode-equivalent")

    # ASCII covers code points 0 through 127 inclusive, so the bound is 128.
    # An empty equivalent is trivially all-ASCII and is returned as-is.
    if uni is not None and (use_unicode or all(ord(c) < 128 for c in uni)):
        return uni

    # No usable equivalent: fall back to the fully qualified name
    return f"\\[{char_name}]"
38+
2139
def compile_tables(data: dict) -> dict:
2240
"""
2341
Compiles the general table into the tables used internally by the library
2442
for fast access
2543
"""
2644

2745
# Conversion from WL to the fully qualified names
28-
wl_to_ascii_dict = {v["wl-unicode"]: f"\\[{k}]" for k, v in data.items()}
46+
# We filter the dictionary after it's first created to remove redundant entries
47+
wl_to_ascii_dict = {v["wl-unicode"]: get_plain_text(k, v, False)
48+
for k, v in data.items()}
49+
wl_to_ascii_dict = {k: v for k, v in wl_to_ascii_dict.items() if k != v}
2950
wl_to_ascii_re = re_from_keys(wl_to_ascii_dict)
3051

3152
# Conversion from wl to unicode
32-
wl_to_unicode_dict = {v["wl-unicode"]: v.get("unicode-equivalent") or f"\\[{k}]"
33-
for k, v in data.items()
34-
if "unicode-equivalent" not in v
35-
or v["unicode-equivalent"] != v["wl-unicode"]}
53+
# We filter the dictionary after it's first created to remove redundant entries
54+
wl_to_unicode_dict = {v["wl-unicode"]: get_plain_text(k, v, True)
55+
for k, v in data.items()}
56+
wl_to_unicode_dict = {k: v for k, v in wl_to_unicode_dict.items()
57+
if k != v}
3658
wl_to_unicode_re = re_from_keys(wl_to_unicode_dict)
3759

3860
# Conversion from unicode to wl
61+
# We filter the dictionary after it's first created to remove redundant entries
3962
unicode_to_wl_dict = {v["unicode-equivalent"]: v["wl-unicode"]
40-
for v in data.values()
41-
if "unicode-equivalent" in v
42-
and v["has-unicode-inverse"]}
63+
for v in data.values()
64+
if "unicode-equivalent" in v
65+
and v["has-unicode-inverse"]}
66+
unicode_to_wl_dict = {k: v for k, v in unicode_to_wl_dict.items()
67+
if k != v}
4368
unicode_to_wl_re = re_from_keys(unicode_to_wl_dict)
4469

4570
# Character ranges of letterlikes
@@ -51,7 +76,7 @@ def compile_tables(data: dict) -> dict:
5176

5277
# ESC sequence aliases
5378
aliased_characters = {v["esc-alias"]: v["wl-unicode"]
54-
for v in data.values() if "esc-alias" in v}
79+
for v in data.values() if "esc-alias" in v}
5580

5681
return {
5782
"wl-to-ascii-dict": wl_to_ascii_dict,

mathics_scanner/characters.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
ROOT_DIR = pkg_resources.resource_filename("mathics_scanner", "")
1010

1111
# Load the conversion tables from disk
12-
with open(os.path.join(ROOT_DIR, "data/characters.json"), "r") as f:
12+
with open(os.path.join(ROOT_DIR, "data", "characters.json"), "r") as f:
1313
_data = ujson.load(f)
1414

1515
# Character ranges of letters

mathics_scanner/data/named-characters.yml

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1096,7 +1096,7 @@ Conditioned:
10961096
is-letter-like: false
10971097
wl-unicode: "\uF3D3"
10981098
Congruent:
1099-
esc-alias: Err:510
1099+
esc-alias: ===
11001100
has-unicode-inverse: false
11011101
is-letter-like: false
11021102
unicode-equivalent: "\u2261"
@@ -1526,7 +1526,7 @@ DoubleLongLeftRightArrow:
15261526
wl-unicode: "\u27FA"
15271527
wl-unicode-name: LONG LEFT RIGHT DOUBLE ARROW
15281528
DoubleLongRightArrow:
1529-
esc-alias: Err:510
1529+
esc-alias: ==>
15301530
has-unicode-inverse: false
15311531
is-letter-like: false
15321532
unicode-equivalent: "\u27F9"
@@ -1542,7 +1542,7 @@ DoublePrime:
15421542
wl-unicode: "\u2033"
15431543
wl-unicode-name: DOUBLE PRIME
15441544
DoubleRightArrow:
1545-
esc-alias: =>
1545+
esc-alias: ' =>'
15461546
has-unicode-inverse: false
15471547
is-letter-like: false
15481548
unicode-equivalent: "\u21D2"
@@ -4003,11 +4003,13 @@ ImaginaryJ:
40034003
wl-unicode: "\uF74F"
40044004
ImplicitPlus:
40054005
esc-alias: +
4006-
has-unicode-inverse: false
4006+
has-unicode-inverse: true
40074007
is-letter-like: false
4008+
unicode-equivalent: "\u2064"
4009+
unicode-equivalent-name: INVISIBLE PLUS
40084010
wl-unicode: "\uF39E"
40094011
Implies:
4010-
esc-alias: Err:510
4012+
esc-alias: =>
40114013
has-unicode-inverse: true
40124014
is-letter-like: false
40134015
unicode-equivalent: "\u27F9"
@@ -4041,29 +4043,41 @@ InvisibleApplication:
40414043
esc-alias: '@'
40424044
has-unicode-inverse: false
40434045
is-letter-like: false
4046+
unicode-equivalent: ''
4047+
unicode-equivalent-name: ''
40444048
wl-unicode: "\uF76D"
40454049
InvisibleComma:
4046-
has-unicode-inverse: false
4050+
has-unicode-inverse: true
40474051
is-letter-like: false
4052+
unicode-equivalent: "\u2063"
4053+
unicode-equivalent-name: INVISIBLE SEPARATOR
40484054
wl-unicode: "\uF765"
40494055
InvisiblePostfixScriptBase:
40504056
esc-alias: -i
40514057
has-unicode-inverse: false
40524058
is-letter-like: false
4059+
unicode-equivalent: ''
4060+
unicode-equivalent-name: ''
40534061
wl-unicode: "\uF3B4"
40544062
InvisiblePrefixScriptBase:
40554063
esc-alias: i-
40564064
has-unicode-inverse: false
40574065
is-letter-like: false
4066+
unicode-equivalent: ''
4067+
unicode-equivalent-name: ''
40584068
wl-unicode: "\uF3B3"
40594069
InvisibleSpace:
40604070
esc-alias: is
40614071
has-unicode-inverse: false
40624072
is-letter-like: false
4073+
unicode-equivalent: ''
4074+
unicode-equivalent-name: ''
40634075
wl-unicode: "\uF360"
40644076
InvisibleTimes:
4065-
has-unicode-inverse: false
4077+
has-unicode-inverse: true
40664078
is-letter-like: false
4079+
unicode-equivalent: "\u2062"
4080+
unicode-equivalent-name: INVISIBLE TIMES
40674081
wl-unicode: "\u2062"
40684082
wl-unicode-name: INVISIBLE TIMES
40694083
Iota:
@@ -4428,8 +4442,8 @@ LongEqual:
44284442
esc-alias: l=
44294443
has-unicode-inverse: true
44304444
is-letter-like: false
4431-
unicode-equivalent: '='
4432-
unicode-equivalent-name: EQUALS SIGN
4445+
unicode-equivalent: '=='
4446+
unicode-equivalent-name: EQUALS SIGN + EQUALS SIGN
44334447
wl-unicode: "\uF7D9"
44344448
LongLeftArrow:
44354449
esc-alias: <--

0 commit comments

Comments
 (0)