Merge pull request #53 from Mathics3/ascii-op-to-unicode

rocky · web-flow · commit c75810219301 · 2022-10-11T14:07:47.000-04:00
Revise to add ascii operator tables
diff --git a/.github/workflows/mathics.yml b/.github/workflows/mathics.yml
@@ -29,5 +29,6 @@ jobs:
       run: |
         # Until next Mathics3/mathics-core release is out...
         python -m pip install -e git+https://github.com/Mathics3/mathics-core#egg=Mathics3[full]
+        (cd src/mathics3 && ./admin-tools/make-op-tables.sh )
         # pip install Mathics3[full]
-        make check-mathics
+        MATHICS_CHARACTER_ENCODING="ASCII" make check-mathics
diff --git a/admin-tools/make-tables.sh b/admin-tools/make-tables.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Create a complete set of tables by running build_tables.py from this
+# distribution. Set PYTHON to choose the interpreter (default: python).
+bs=${BASH_SOURCE[0]}
+mydir=$(dirname "$bs")
+PYTHON=${PYTHON:-python}
+
+cd "$mydir/../mathics_scanner/data" || exit 1
+$PYTHON ../generate/build_tables.py -o characters.json
diff --git a/mathics_scanner/data/named-characters.yml b/mathics_scanner/data/named-characters.yml
@@ -653,6 +653,7 @@ CapitalDelta:
 
 CapitalDifferentialD:
   amslatex: "\\CapitalDifferentialD"
+  ascii: "d"
   esc-alias: DD
   has-unicode-inverse: true
   is-letter-like: true
@@ -1787,6 +1788,8 @@ DifferentialD:
   has-unicode-inverse: true
   # This can't be letter-like because it is used in derivatives as a function
   is-letter-like: false
+  # TODO: This should be a prefix operator
+  operator-name: DifferentialD
   unicode-equivalent: "\U0001D451"
   unicode-equivalent-name: MATHEMATICAL ITALIC SMALL D
   wl-unicode: "\uF74C"
@@ -5026,6 +5029,7 @@ Integral:
   esc-alias: int
   has-unicode-inverse: false
   is-letter-like: false
+  # TODO: This should be a prefix operator
   operator-name: Integral
   unicode-equivalent: "\u222B"
   unicode-equivalent-name: INTEGRAL
diff --git a/mathics_scanner/generate/build_tables.py b/mathics_scanner/generate/build_tables.py
@@ -1,9 +1,7 @@
-#!/usr/bin/env python3
+#!/usr/bin/env python
 # This scripts reads the data from named-characters and converts it to the
 # format used by the library internally
 
-from collections import OrderedDict
-
 import click
 
 import json
@@ -120,6 +118,13 @@ def compile_tables(data: dict) -> dict:
         if "operator-name" in v and ("unicode-equivalent" in v or "ascii" in v)
     }
 
+    # operator-to-ascii or character symbol name
+    operator_to_ascii = {
+        v["operator-name"]: v.get("ascii", rf'\[{v["operator-name"]}]')
+        for k, v in data.items()
+        if "operator-name" in v and ("unicode-equivalent" in v or "ascii" in v)
+    }
+
     # Conversion from unicode or ascii to wl dictionary entry.
     # We filter the dictionary after it's first created to redundant entries
     unicode_to_wl_dict = {
@@ -144,20 +149,30 @@ def compile_tables(data: dict) -> dict:
         if "wl-unicode" in v
     }
 
-    # Operators with ASCII sequences list entry
-    ascii_operators = sorted(
-        [v["ascii"] for v in data.values() if "operator-name" in v and "ascii" in v]
-    )
-
-    # Mathics core stores the ascii operator value, Use that to get an operator name
-    # Operators with ASCII sequences list entry
-    ascii_operator_to_name = OrderedDict(
-        {
-            v["ascii"]: rf'\[{v["operator-name"]}]'
-            for v in data.values()
-            if "operator-name" in v and "ascii" in v
-        }.items()
-    )
+    operator_names = sorted([k for k, v in data.items() if "operator-name" in v])
+
+    ascii_operators = []
+    ascii_operator_to_character_symbol = {}
+    ascii_operator_to_symbol = {}
+    ascii_operator_to_unicode = {}
+    ascii_operator_to_wl_unicode = {}
+
+    for operator_name in operator_names:
+        # Operators with ASCII sequences list entry
+        v = data[operator_name]
+        ascii_name = v.get("ascii", None)
+        if ascii_name is not None:
+            ascii_operators.append(v["ascii"])
+            ascii_operator_to_character_symbol[ascii_name] = rf'\[{v["operator-name"]}]'
+            ascii_operator_to_symbol[ascii_name] = v["operator-name"]
+            # Mathics core stores the ASCII operator value. Use that to get the standard
+            # Unicode symbol, falling back to the ASCII sequence when there is none.
+            ascii_operator_to_unicode[ascii_name] = v.get(
+                "unicode-equivalent", v.get("ascii")
+            )
+            ascii_operator_to_wl_unicode[ascii_name] = v.get(
+                "wl-unicode", v.get("ascii")
+            )
 
     # unicode-to-operator dictionary entry
     unicode_to_operator = {
@@ -187,13 +202,16 @@ def compile_tables(data: dict) -> dict:
     return {
         "aliased-characters": aliased_characters,
         "ascii-operators": ascii_operators,
-        "ascii-operator-to-name": ascii_operator_to_name,
+        "ascii-operator-to-symbol": ascii_operator_to_symbol,
+        "ascii-operator-to-character-symbol": ascii_operator_to_character_symbol,
+        "ascii-operator-to-unicode": ascii_operator_to_unicode,
+        "ascii-operator-to-wl-unicode": ascii_operator_to_wl_unicode,
         "letterlikes": letterlikes,
         "named-characters": named_characters,
+        "operator-names": operator_names,
         "operator-to-precedence": operator_to_precedence,
+        "operator-to-ascii": operator_to_ascii,
         "operator-to-unicode": operator_to_unicode,
-        # unicode-operators is irregular, but this is what
-        # mathics-pygments uses
         "unicode-operators": unicode_to_operator,
         "unicode-to-wl-dict": unicode_to_wl_dict,
         "unicode-to-wl-re": unicode_to_wl_re,
@@ -210,13 +228,17 @@ def compile_tables(data: dict) -> dict:
 ALL_FIELDS = [
     "aliased-characters",
     "ascii-operators",
+    "ascii-operator-to-character-symbol",
+    "ascii-operator-to-symbol",
+    "ascii-operator-to-unicode",
+    "ascii-operator-to-wl-unicode",
     "letterlikes",
     "named-characters",
+    "operator-names",
+    "operator-to-ascii",
     "operator-to-precedence",
     "operator-to-unicode",
-    "unicode-equivalent",
-    "unicode-operators",
-    "unicode-to-operator",
+    #   "unicode-operators",  # not used yet
     "unicode-to-wl-dict",
     "unicode-to-wl-re",
     "wl-to-amslatex",
diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
@@ -424,7 +424,13 @@ def _skip_blank(self):
         while True:
             if self.pos >= len(self.code):
                 if comment:
-                    self.incomplete()
+                    try:
+                        self.incomplete()
+                    except ValueError:
+                        # Funny symbols like | in comments can cause a ValueError.
+                        # Until we have a better fix -- like noting we are inside a comment and
+                        # should not try to substitute symbols -- ignore.
+                        pass
                 else:
                     break
             if comment:
diff --git a/mathics_scanner/version.py b/mathics_scanner/version.py
@@ -4,4 +4,4 @@
 # well as importing into Python. That's why there is no
 # space around "=" below.
 # fmt: off
-__version__="1.2.5.dev0"  # noqa
+__version__="1.3.0.dev0"  # noqa
diff --git a/setup.py b/setup.py
@@ -25,7 +25,10 @@
 mathics-users@googlegroups.com and ask for help.
 """
 
+import atexit
+import pkg_resources
 import re
+import subprocess
 import sys
 import os.path as osp
 import platform
@@ -127,3 +130,15 @@ def subdirs(root, file="*.*", depth=10):
     ],
     # TODO: could also include long_description, download_url,
 )
+
+
+def build_json_table() -> int:
+    """Run build_tables.py with the current interpreter and return its exit status."""
+    ROOT_DIR = pkg_resources.resource_filename("mathics_scanner", "")
+    build_tables_program = osp.join(ROOT_DIR, "generate", "build_tables.py")
+    print(f"Building JSON tables via {build_tables_program}")
+    result = subprocess.run([sys.executable, build_tables_program])
+    return result.returncode
+
+
+atexit.register(build_json_table)
diff --git a/test/test_ascii.py b/test/test_ascii.py
@@ -10,17 +10,19 @@
 
 
 def test_ascii():
-    ascii_operator_to_name = json_data["ascii-operator-to-name"]
+    ascii_operator_to_character_symbol = json_data["ascii-operator-to-character-symbol"]
+    ascii_operator_to_symbol = json_data["ascii-operator-to-symbol"]
     ascii_operators = json_data["ascii-operators"]
-    operator_keys = frozenset(ascii_operator_to_name.keys())
+    operator_keys = frozenset(ascii_operator_to_symbol.keys())
     # operator_to_precedence = json_data["operator-to-precedence"]
     for chars in json_data["ascii-operators"]:
         assert chars in ascii_operators
         assert chars in operator_keys
         # assert chars in unicode_to_operator.keys()
-        name = ascii_operator_to_name.get(chars)
-        assert name is not None
-        assert name.startswith(r"\[")
-        assert name.endswith(r"]")
-        raw_name = name[len(r"\[") : -len(r"]")]
-        assert raw_name in yaml_data
+        char_symbol = ascii_operator_to_character_symbol.get(chars)
+        assert char_symbol is not None
+        assert char_symbol.startswith(r"\[")
+        assert char_symbol.endswith(r"]")
+        raw_char_symbol = char_symbol[len(r"\[") : -len(r"]")]
+        assert raw_char_symbol in yaml_data
+        assert raw_char_symbol in ascii_operator_to_symbol.values()