Include ASCII operators in tables

rocky · rocky · commit 8c62b5e2386f · 2022-09-17T21:23:03.000-04:00
Correct DifferentialD unicode - it was correct in master.

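Note: \U0001D451 != \u1d451 -- \U takes eight hex digits and names the single code point U+1D451, whereas \u takes only four, so \u1d451 is U+1D45 followed by the literal digit "1".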
diff --git a/mathics_scanner/generate/build_tables.py b/mathics_scanner/generate/build_tables.py
@@ -2,6 +2,8 @@
 # This scripts reads the data from named-characters and converts it to the
 # format used by the library internally
 
+from collections import OrderedDict
+
 import click
 
 import json
@@ -113,9 +115,9 @@ def compile_tables(data: dict) -> dict:
 
     # operator-to-unicode dictionary entry
     operator_to_unicode = {
-        v["operator-name"]: v["unicode-equivalent"]
+        v["operator-name"]: v.get("unicode-equivalent", v.get("ascii"))
         for k, v in data.items()
-        if "operator-name" in v and "unicode-equivalent" in v
+        if "operator-name" in v and ("unicode-equivalent" in v or "ascii" in v)
     }
 
     # Conversion from unicode or ascii to wl dictionary entry.
@@ -147,20 +149,21 @@ def compile_tables(data: dict) -> dict:
         [v["ascii"] for v in data.values() if "operator-name" in v and "ascii" in v]
     )
 
-    # unicode-equivalent list entry
-    unicode_operators = sorted(
-        [
-            v["unicode-equivalent"]
+    # Mathics core stores the ASCII operator value; use that to get an operator name.
+    # ASCII-operator-to-name dictionary entry (values are formatted as \[OperatorName])
+    ascii_operator_to_name = OrderedDict(
+        {
+            v["ascii"]: rf'\[{v["operator-name"]}]'
             for v in data.values()
-            if "operator-name" in v and "unicode-equivalent" in v
-        ]
+            if "operator-name" in v and "ascii" in v
+        }.items()
     )
 
     # unicode-to-operator dictionary entry
     unicode_to_operator = {
-        v["unicode-equivalent"]: v["operator-name"]
+        v.get("unicode-equivalent", v.get("ascii")): v["operator-name"]
         for k, v in data.items()
-        if "operator-name" in v and "unicode-equivalent" in v
+        if "operator-name" in v
     }
     # Conversion from WL to the fully qualified names dictionary entry
     wl_to_ascii_dict = {
@@ -184,13 +187,14 @@ def compile_tables(data: dict) -> dict:
     return {
         "aliased-characters": aliased_characters,
         "ascii-operators": ascii_operators,
+        "ascii-operator-to-name": ascii_operator_to_name,
         "letterlikes": letterlikes,
         "named-characters": named_characters,
         "operator-to-precedence": operator_to_precedence,
         "operator-to-unicode": operator_to_unicode,
-        "unicode-equivalent": unicode_operators,
+        # The key name "unicode-operators" is irregular (its value is the
+        # unicode-to-operator dictionary), but this is what mathics-pygments uses
         "unicode-operators": unicode_to_operator,
-        "unicode-to-operator": unicode_to_operator,
         "unicode-to-wl-dict": unicode_to_wl_dict,
         "unicode-to-wl-re": unicode_to_wl_re,
         "wl-to-ascii-dict": wl_to_ascii_dict,
diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
@@ -148,7 +148,7 @@
     ("ConjugateTranspose", r" \uf3c9 "),
     ("HermitianConjugate", r" \uf3ce "),
     ("Integral", r" \u222b "),
-    ("DifferentialD", r" \u1d451 | \uf74c"),
+    ("DifferentialD", r" \U0001D451 | \uf74c"),
     ("Del", r" \u2207 "),
     # uf520 is Wolfram custom, 25ab is standard unicode
     ("Square", r" \uf520 | \u25ab"),
diff --git a/test/test_ascii.py b/test/test_ascii.py
@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+
+from mathics_scanner.load import (
+    load_mathics_character_yaml,
+    load_mathics_character_json,
+)
+
+yaml_data = load_mathics_character_yaml()
+json_data = load_mathics_character_json()
+
+
+def test_ascii():
+    ascii_operator_to_name = json_data["ascii-operator-to-name"]
+    ascii_operators = json_data["ascii-operators"]
+    operator_keys = frozenset(ascii_operator_to_name.keys())
+    # operator_to_precedence = json_data["operator-to-precedence"]
+    for chars in json_data["ascii-operators"]:
+        assert chars in ascii_operators
+        assert chars in operator_keys
+        # assert chars in unicode_to_operator.keys()
+        name = ascii_operator_to_name.get(chars)
+        assert name is not None
+        assert name.startswith(r"\[")
+        assert name.endswith(r"]")
+        raw_name = name[len(r"\[") : -len(r"]")]
+        assert raw_name in yaml_data