Merge github.com:Mathics3/mathics-scanner

rocky · rocky · commit 5ddb0b01589c · 2021-01-30T21:28:43.000-05:00
diff --git a/README.rst b/README.rst
@@ -23,16 +23,15 @@ This module consists mostly of translation tables between WL and unicode/ascii.
 Because of the large size of this tables, it was decided to store them in a
 file and read them from disk at runtime (when the module is imported). Our
 tests showed that storing the tables as JSON and using
-[ujson](https://github.com/ultrajson/ultrajson) to read them is the most
+`ujson <https://github.com/ultrajson/ultrajson>`_ to read them is the most
 efficient way to access them. However, this is merelly an implementation
 detail and consumers of this library should not relly on this assumption.
 
 For maintainability and effeciency, we decided to store this data in a
 human-readable YAML file (`data/named-characters.yml`) and compile them into
 the JSON tables used internally by the library (`data/characters.json`) for
 faster access at runtime. The conversion of the data is performed by the
-script `admin-tools/compile-translation-tables.py` at each commit to the
-`master` branch via GitHub Actions.
+script `mathics_scanner/build_tables.py`.
 
 
 Contributing
diff --git a/mathics_scanner/build_tables.py b/mathics_scanner/build_tables.py
@@ -18,28 +18,53 @@ def re_from_keys(d: dict) -> str:
         sorted(map(re.escape, d.keys()), key=lambda k: (-len(k), k))
     )
 
+def get_plain_text(char_name: str, char_data: dict, use_unicode: bool) -> str:
+    """
+    Takes in data about a named character and returns the appropriate
+    plain text representation according to use_unicode: the
+    "unicode-equivalent" entry when use_unicode is set or that entry is
+    pure ASCII, and the fully qualified \\[char_name] form otherwise
+    """
+    uni = char_data.get("unicode-equivalent")
+
+    if uni is not None:
+        # ASCII spans code points 0-127 inclusive, so the bound is 128.
+        # An empty equivalent passes all() vacuously and is returned as "".
+        if use_unicode or all(ord(c) < 128 for c in uni):
+            return uni
+
+    # No equivalent recorded, or it is non-ASCII and unicode is not allowed
+    return f"\\[{char_name}]"
+
 def compile_tables(data: dict) -> dict:
     """
     Compiles the general table into the tables used internally by the library
     for fast access
     """
 
     # Conversion from WL to the fully qualified names
-    wl_to_ascii_dict = {v["wl-unicode"]: f"\\[{k}]" for k, v in data.items()}
+    # Filter the dictionary after it is created to remove redundant entries
+    wl_to_ascii_dict = {v["wl-unicode"]: get_plain_text(k, v, False)
+                        for k, v in data.items()}
+    wl_to_ascii_dict = {k: v for k, v in wl_to_ascii_dict.items() if k != v}
     wl_to_ascii_re = re_from_keys(wl_to_ascii_dict)
 
     # Conversion from wl to unicode
-    wl_to_unicode_dict = {v["wl-unicode"]: v.get("unicode-equivalent") or f"\\[{k}]"
-                         for k, v in data.items()
-                         if "unicode-equivalent" not in v
-                         or v["unicode-equivalent"] != v["wl-unicode"]}
+    # Filter the dictionary after it is created to remove redundant entries
+    wl_to_unicode_dict = {v["wl-unicode"]: get_plain_text(k, v, True)
+                          for k, v in data.items()}
+    wl_to_unicode_dict = {k: v for k, v in wl_to_unicode_dict.items() 
+                          if k != v}
     wl_to_unicode_re = re_from_keys(wl_to_unicode_dict)
 
     # Conversion from unicode to wl
+    # Filter the dictionary after it is created to remove redundant entries
     unicode_to_wl_dict = {v["unicode-equivalent"]: v["wl-unicode"]
-                         for v in data.values()
-                         if "unicode-equivalent" in v
-                         and v["has-unicode-inverse"]}
+                          for v in data.values()
+                          if "unicode-equivalent" in v
+                          and v["has-unicode-inverse"]}
+    unicode_to_wl_dict = {k: v for k, v in unicode_to_wl_dict.items() 
+                          if k != v}
     unicode_to_wl_re = re_from_keys(unicode_to_wl_dict)
 
     # Character ranges of letterlikes
@@ -51,7 +76,7 @@ def compile_tables(data: dict) -> dict:
 
     # ESC sequence aliases
     aliased_characters = {v["esc-alias"]: v["wl-unicode"]
-                         for v in data.values() if "esc-alias" in v}
+                          for v in data.values() if "esc-alias" in v}
 
     return {
         "wl-to-ascii-dict": wl_to_ascii_dict,
diff --git a/mathics_scanner/characters.py b/mathics_scanner/characters.py
@@ -9,7 +9,7 @@
 ROOT_DIR = pkg_resources.resource_filename("mathics_scanner", "")
 
 # Load the conversion tables from disk
-with open(os.path.join(ROOT_DIR, "data/characters.json"), "r") as f:
+with open(os.path.join(ROOT_DIR, "data", "characters.json"), "r") as f:
     _data = ujson.load(f)
 
 # Character ranges of letters
diff --git a/mathics_scanner/data/named-characters.yml b/mathics_scanner/data/named-characters.yml
@@ -1096,7 +1096,7 @@ Conditioned:
   is-letter-like: false
   wl-unicode: "\uF3D3"
 Congruent:
-  esc-alias: Err:510
+  esc-alias: ===
   has-unicode-inverse: false
   is-letter-like: false
   unicode-equivalent: "\u2261"
@@ -1526,7 +1526,7 @@ DoubleLongLeftRightArrow:
   wl-unicode: "\u27FA"
   wl-unicode-name: LONG LEFT RIGHT DOUBLE ARROW
 DoubleLongRightArrow:
-  esc-alias: Err:510
+  esc-alias: ==>
   has-unicode-inverse: false
   is-letter-like: false
   unicode-equivalent: "\u27F9"
@@ -1542,7 +1542,7 @@ DoublePrime:
   wl-unicode: "\u2033"
   wl-unicode-name: DOUBLE PRIME
 DoubleRightArrow:
-  esc-alias: =>
+  esc-alias: ' =>'
   has-unicode-inverse: false
   is-letter-like: false
   unicode-equivalent: "\u21D2"
@@ -4003,11 +4003,13 @@ ImaginaryJ:
   wl-unicode: "\uF74F"
 ImplicitPlus:
   esc-alias: +
-  has-unicode-inverse: false
+  has-unicode-inverse: true
   is-letter-like: false
+  unicode-equivalent: "\u2064"
+  unicode-equivalent-name: INVISIBLE PLUS
   wl-unicode: "\uF39E"
 Implies:
-  esc-alias: Err:510
+  esc-alias: =>
   has-unicode-inverse: true
   is-letter-like: false
   unicode-equivalent: "\u27F9"
@@ -4041,29 +4043,41 @@ InvisibleApplication:
   esc-alias: '@'
   has-unicode-inverse: false
   is-letter-like: false
+  unicode-equivalent: ''
+  unicode-equivalent-name: ''
   wl-unicode: "\uF76D"
 InvisibleComma:
-  has-unicode-inverse: false
+  has-unicode-inverse: true
   is-letter-like: false
+  unicode-equivalent: "\u2063"
+  unicode-equivalent-name: INVISIBLE SEPARATOR
   wl-unicode: "\uF765"
 InvisiblePostfixScriptBase:
   esc-alias: -i
   has-unicode-inverse: false
   is-letter-like: false
+  unicode-equivalent: ''
+  unicode-equivalent-name: ''
   wl-unicode: "\uF3B4"
 InvisiblePrefixScriptBase:
   esc-alias: i-
   has-unicode-inverse: false
   is-letter-like: false
+  unicode-equivalent: ''
+  unicode-equivalent-name: ''
   wl-unicode: "\uF3B3"
 InvisibleSpace:
   esc-alias: is
   has-unicode-inverse: false
   is-letter-like: false
+  unicode-equivalent: ''
+  unicode-equivalent-name: ''
   wl-unicode: "\uF360"
 InvisibleTimes:
-  has-unicode-inverse: false
+  has-unicode-inverse: true
   is-letter-like: false
+  unicode-equivalent: "\u2062"
+  unicode-equivalent-name: INVISIBLE TIMES
   wl-unicode: "\u2062"
   wl-unicode-name: INVISIBLE TIMES
 Iota:
@@ -4428,8 +4442,8 @@ LongEqual:
   esc-alias: l=
   has-unicode-inverse: true
   is-letter-like: false
-  unicode-equivalent: '='
-  unicode-equivalent-name: EQUALS SIGN
+  unicode-equivalent: '=='
+  unicode-equivalent-name: EQUALS SIGN + EQUALS SIGN
   wl-unicode: "\uF7D9"
 LongLeftArrow:
   esc-alias: <--