Merge branch 'master' of github.com:Mathics3/mathics-scanner

rocky · rocky · commit 869b668e1f2f · 2021-01-24T19:46:40.000-05:00
diff --git a/README.rst b/README.rst
@@ -13,6 +13,27 @@ Uses
 
 This is used as the scanner inside `Mathics <https://mathics.org>`_ but it can also be used for tokenizing and formatting WL code. In fact we intend to write one.
 
+Implementation
+==============
+
+mathics_scanner.characters
+--------------------------
+
+This module consists mostly of translation tables between WL and Unicode/ASCII.
+Because of the large size of these tables, it was decided to store them in a
+file and read them from disk at runtime (when the module is imported). Our
+tests showed that storing the tables as JSON and using
+`ujson <https://github.com/ultrajson/ultrajson>`_ to read them is the most
+efficient way to access them. However, this is merely an implementation
+detail and consumers of this library should not rely on this assumption.
+
+For maintainability and efficiency, we decided to store this data in a
+human-readable YAML file (``data/named-characters.yml``) and compile it into
+the JSON tables used internally by the library (``data/characters.json``) for
+faster access at runtime. The conversion of the data is performed by the
+script ``admin-tools/compile-translation-tables.py`` at each commit to the
+``master`` branch via GitHub Actions.
+
 
 Contributing
 ------------
diff --git a/mathics_scanner/__init__.py b/mathics_scanner/__init__.py
@@ -6,6 +6,8 @@
 from mathics_scanner.version import __version__
 
 from mathics_scanner.characters import (
+    aliased_characters,
+    named_characters,
     replace_unicode_with_wl,
     replace_wl_with_plain_text,
 )
diff --git a/mathics_scanner/data/named-characters.yml b/mathics_scanner/data/named-characters.yml
@@ -6943,7 +6943,7 @@ Upsilon:
 # looks more like U+26E2 (Astronomical Symbol for Uranus) than the Standard Unicode equivalent
 # seen at https://www.compart.com/en/unicode/U+2645.
 # As with the Earth, we are going off of the name and the code point rather than the
-# visual representation of the symbo.
+# visual representation of the symbol.
 Uranus:
   has-unicode-inverse: false
   is-letter-like: false
diff --git a/mathics_scanner/version.py b/mathics_scanner/version.py
@@ -5,4 +5,4 @@
 # This file is suitable for sourcing inside POSIX shell as
 # well as importing into Python. That's why there is no
 # space around "=" below.
-__version__="1.0.0.dev"  # noqa
+__version__="1.0.0.dev0"  # noqa
diff --git a/setup.py b/setup.py
@@ -28,6 +28,7 @@
 import sys
 import os.path as osp
 import platform
+import subprocess
 from setuptools import setup, Command, Extension
 
 # Ensure user has the correct Python version
@@ -43,6 +44,7 @@ def get_srcdir():
 def read(*rnames):
     return open(osp.join(get_srcdir(), *rnames)).read()
 
+subprocess.run(["make", "mathics_scanner/data/characters.json"])
 
 # stores __version__ in the current namespace
 exec(compile(open("mathics_scanner/version.py").read(), "mathics_scanner/version.py", "exec"))

Original file line number	Diff line number	Diff line change
`@@ -6,6 +6,8 @@`
`6`	`6`	`from mathics_scanner.version import __version__`
`7`	`7`
`8`	`8`	`from mathics_scanner.characters import (`
	`9`	`+ aliased_characters,`
	`10`	`+ named_characters,`
`9`	`11`	`replace_unicode_with_wl,`
`10`	`12`	`replace_wl_with_plain_text,`
`11`	`13`	`)`