
Commit 59ae761

Spelling suggestions for linking targets (#416)
* Suggest spelling corrections in target lookups
* Optimizations
* Finish did_you_mean
* Fix unintentionally creating new entries in the TargetDatabase
* Avoid analyzing domain and type
* Remove Array2D and just use closures
1 parent 0a2359c commit 59ae761


7 files changed: +174 -9 lines changed


snooty/diagnostics.py

Lines changed: 6 additions & 1 deletion
@@ -288,19 +288,24 @@ class UnknownSubstitution(Diagnostic):
     severity = Diagnostic.Level.warning
 
 
-class TargetNotFound(Diagnostic):
+class TargetNotFound(Diagnostic, MakeCorrectionMixin):
     severity = Diagnostic.Level.error
 
     def __init__(
         self,
         name: str,
         target: str,
+        candidates: Sequence[str],
         start: Union[int, Tuple[int, int]],
         end: Union[None, int, Tuple[int, int]] = None,
     ) -> None:
         super().__init__(f'Target not found: "{name}:{target}"', start, end)
         self.name = name
         self.target = target
+        self.candidates = list(candidates)
+
+    def did_you_mean(self) -> List[str]:
+        return self.candidates
 
 
 class AmbiguousTarget(Diagnostic):
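
The candidates list added here is what feeds did_you_mean(). A minimal sketch of how the updated diagnostic might be constructed and queried; the name, target, and candidate values below are hypothetical, and this assumes MakeCorrectionMixin only requires a did_you_mean method:

    from snooty.diagnostics import TargetNotFound

    # Hypothetical values: a misspelled :ref: target plus one suggested correction.
    diag = TargetNotFound(
        "ref",
        "a-labal-on-index",
        ["std:label:a-label-on-index"],
        (7, 0),
    )
    assert diag.did_you_mean() == ["std:label:a-label-on-index"]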

snooty/main.py

Lines changed: 2 additions & 1 deletion
@@ -108,7 +108,8 @@ def on_diagnostics(self, path: FileId, diagnostics: List[Diagnostic]) -> None:
 
             if isinstance(diagnostic, MakeCorrectionMixin):
                 did_you_mean = diagnostic.did_you_mean()
-                info["did_you_mean"] = did_you_mean
+                if did_you_mean:
+                    info["did_you_mean"] = did_you_mean
 
             if output == "JSON":
                 document: Dict[str, object] = {"diagnostic": info}

snooty/postprocess.py

Lines changed: 5 additions & 2 deletions
@@ -402,7 +402,7 @@ def enter_node(self, fileid_stack: FileIdStack, node: n.Node) -> None:
         if refuri is None:
             line = node.span[0]
             self.context.diagnostics[fileid_stack.current].append(
-                TargetNotFound("extlink", node.refname, line)
+                TargetNotFound("extlink", node.refname, [], line)
             )
             return
 
@@ -1279,8 +1279,11 @@ def enter_node(self, fileid_stack: FileIdStack, node: n.Node) -> None:
             if injection_candidate is not None:
                 injection_candidate.children = [text_node]
 
+            # See if there are any near matches
+            suggestions = self.targets.get_suggestions(key)
+
             self.context.diagnostics[fileid_stack.current].append(
-                TargetNotFound(node.name, node.target, line)
+                TargetNotFound(node.name, node.target, suggestions, line)
             )
             return
 
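For context, the key passed to get_suggestions is the role-qualified lookup string (domain:role:target), and the suggestions come back as full keys that the diagnostic carries verbatim. A rough sketch of that flow, with hypothetical values and with targets standing in for the post-processor's TargetDatabase:

    # Hypothetical values illustrating the flow above.
    key = "std:label:a-labal-on-index"          # normalized lookup key that failed to resolve
    suggestions = targets.get_suggestions(key)  # e.g. ["std:label:a-label-on-index"]
    diagnostic = TargetNotFound("ref", "a-labal-on-index", suggestions, 12)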
snooty/target_database.py

Lines changed: 59 additions & 5 deletions
@@ -1,16 +1,28 @@
 import copy
 import enum
+import itertools
 import logging
+import re
 import threading
 import urllib
 from collections import defaultdict
 from dataclasses import dataclass, field
-from typing import DefaultDict, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
+from typing import (
+    DefaultDict,
+    Dict,
+    Iterable,
+    List,
+    NamedTuple,
+    Optional,
+    Sequence,
+    Tuple,
+    Union,
+)
 
 import requests.exceptions
 from typing_extensions import Protocol
 
-from . import intersphinx, n, specparser
+from . import intersphinx, n, specparser, util
 from .cache import Cache
 from .n import FileId
 from .types import ProjectConfig, normalize_target
@@ -21,6 +33,8 @@
 #: current project, or a URL (from an intersphinx inventory).
 TargetType = enum.Enum("TargetType", ("fileid", "url"))
 
+PAT_TARGET_PART_SEPARATOR = re.compile(r"[_-]+")
+
 
 @dataclass
 class TargetDatabase:
@@ -68,9 +82,9 @@ def __getitem__(self, key: str) -> Sequence["TargetDatabase.Result"]:
                     canonical_target_name,
                     title,
                 )
-                for canonical_target_name, fileid, title, html5_id in self.local_definitions[
-                    key
-                ]
+                for canonical_target_name, fileid, title, html5_id in self.local_definitions.get(
+                    key, []
+                )
             )
         except KeyError:
             pass
@@ -107,6 +121,46 @@ def __getitem__(self, key: str) -> Sequence["TargetDatabase.Result"]:
 
         return results
 
+    def get_suggestions(self, key: str) -> Sequence[str]:
+        key = normalize_target(key)
+        key = key.split(":", 2)[2]
+        candidates: List[str] = []
+
+        with self.lock:
+            intersphinx_keys: Iterable[str] = itertools.chain.from_iterable(
+                (str(s) for s in inventory.targets.keys())
+                for inventory in self.intersphinx_inventories.values()
+            )
+            all_keys: Iterable[str] = itertools.chain(
+                self.local_definitions.keys(), intersphinx_keys
+            )
+
+            key_parts = PAT_TARGET_PART_SEPARATOR.split(key)
+
+            for original_key_definition in all_keys:
+                key_definition = original_key_definition.split(":", 2)[2]
+                if abs(len(key) - len(key_definition)) > 2:
+                    continue
+
+                # Tokens tend to be separated by - and _: if there's a different number of
+                # separators, don't attempt a typo correction
+                key_definition_parts = PAT_TARGET_PART_SEPARATOR.split(key_definition)
+                if len(key_definition_parts) != len(key_parts):
+                    continue
+
+                # Evaluate each part separately, since we can abort before evaluating the rest.
+                # Small bonus: complexity is O(N*M)
+                if all(
+                    dist <= 2
+                    for dist in (
+                        util.damerau_levenshtein_distance(p1, p2)
+                        for p1, p2 in zip(key_parts, key_definition_parts)
+                    )
+                ):
+                    candidates.append(original_key_definition)
+
+        return candidates
+
     def define_local_target(
         self,
         domain: str,
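
get_suggestions deliberately avoids comparing whole strings: it strips the domain:role: prefix, splits both the query and each known target on - and _, and only suggests a candidate when the part counts match, the overall lengths differ by at most 2, and every corresponding part is within Damerau-Levenshtein distance 2. A worked sketch of those rules on hypothetical inputs, assuming PAT_TARGET_PART_SEPARATOR is importable from this module and util from the package:

    from snooty.target_database import PAT_TARGET_PART_SEPARATOR
    from snooty import util

    # Hypothetical query and candidate, already stripped of their "std:label:" prefix.
    key = "a-labal-on-index"
    candidate = "a-label-on-index"

    key_parts = PAT_TARGET_PART_SEPARATOR.split(key)          # ["a", "labal", "on", "index"]
    cand_parts = PAT_TARGET_PART_SEPARATOR.split(candidate)   # ["a", "label", "on", "index"]

    # Same separator structure, lengths within 2, and "labal" -> "label" is a single
    # substitution (distance 1), so the candidate would be suggested.
    assert abs(len(key) - len(candidate)) <= 2
    assert len(key_parts) == len(cand_parts)
    assert all(
        util.damerau_levenshtein_distance(p1, p2) <= 2
        for p1, p2 in zip(key_parts, cand_parts)
    )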

snooty/test_intersphinx.py

Lines changed: 37 additions & 0 deletions
@@ -224,3 +224,40 @@ def test_fetch_failure() -> None:
     assert [type(diag) for diag in result.diagnostics[FileId("snooty.toml")]] == [
         FetchError
     ]
+
+
+def test_suggestions() -> None:
+    inventory_bytes = Path("test_data/test_intersphinx/manual.inv").read_bytes()
+    inventory = Inventory.parse(INVENTORY_URL, inventory_bytes)
+    db = TargetDatabase(intersphinx_inventories={"manual": inventory})
+
+    db.define_local_target(
+        "std",
+        "option",
+        ["--maxVarcharLength", "mongosqld.--maxVarcharLength"],
+        FileId("reference/mongosqld.txt"),
+        [n.Text(span=(7,), value="mongosqld --maxVarcharLength")],
+        "std-option-mongosqld.--maxVarcharLength",
+    )
+
+    db.define_local_target(
+        "std",
+        "label",
+        ["a-label-on-index"],
+        FileId("index.txt"),
+        [n.Text(span=(7,), value="A Label on Index")],
+        "std-label-a-label-on-index",
+    )
+
+    db.define_local_target(
+        "std",
+        "label",
+        ["a-label-on-index-2"],
+        FileId("reference/index.txt"),
+        [n.Text(span=(7,), value="A Label on Index 2")],
+        "std-label-a-label-on-index-2",
+    )
+
+    assert db.get_suggestions("std:label:a-labal-on-index") == [
+        "std:label:a-label-on-index"
+    ]

snooty/test_util.py

Lines changed: 9 additions & 0 deletions
@@ -183,3 +183,12 @@ def start(event: threading.Event, arg: object) -> None:
     worker = util.WorkerLauncher("worker-test", start)
     with pytest.raises(SomeException):
         worker.run_and_wait(None)
+
+
+def test_damerau_levenshtein_distance() -> None:
+    assert util.damerau_levenshtein_distance("foo", "foo") == 0
+    assert util.damerau_levenshtein_distance("foo", "f1o") == 1
+    assert util.damerau_levenshtein_distance("foo", "fo") == 1
+    assert util.damerau_levenshtein_distance("foo", "fooa") == 1
+    assert util.damerau_levenshtein_distance("foo", "ofo") == 1
+    assert util.damerau_levenshtein_distance("foo", "xoao") == 2

snooty/util.py

Lines changed: 56 additions & 0 deletions
@@ -581,3 +581,59 @@ def get(self, block: bool = True, timeout: Optional[float] = None) -> Tuple[_K,
             del self._data[result[0]]
 
         return result
+
+
+def damerau_levenshtein_distance(a: str, b: str) -> int:
+    """Derived from Wikipedia, the best possible source for an algorithm:
+    https://en.wikipedia.org/w/index.php?title=Damerau%E2%80%93Levenshtein_distance&oldid=1050388400#Distance_with_adjacent_transpositions"""
+    # Strings are 1-indexed, and d is -1-indexed.
+
+    da = {ch: 0 for ch in set(a).union(b)}
+
+    width = len(a) + 2
+    height = len(b) + 2
+    d = [0] * width * height
+
+    def matrix_set(x: int, y: int, value: int) -> None:
+        d[(width * (y + 1)) + (x + 1)] = value
+
+    def matrix_get(x: int, y: int) -> int:
+        return d[(width * (y + 1)) + (x + 1)]
+
+    maxdist = len(a) + len(b)
+    matrix_set(-1, -1, maxdist)
+
+    for i in range(0, len(a) + 1):
+        matrix_set(i, -1, maxdist)
+        matrix_set(i, 0, i)
+
+    for j in range(0, len(b) + 1):
+        matrix_set(-1, j, maxdist)
+        matrix_set(0, j, j)
+
+    for i in range(1, len(a) + 1):
+        db = 0
+        for j in range(1, len(b) + 1):
+            k = da[b[j - 1]]
+            l = db
+            if a[i - 1] == b[j - 1]:
+                cost = 0
+                db = j
+            else:
+                cost = 1
+            matrix_set(
+                i,
+                j,
+                min(
+                    matrix_get(i - 1, j - 1) + cost,  # substitution
+                    matrix_get(i, j - 1) + 1,  # insertion
+                    matrix_get(i - 1, j) + 1,  # deletion
+                    matrix_get(k - 1, l - 1)
+                    + (i - k - 1)
+                    + 1
+                    + (j - l - 1),  # transposition
+                ),
+            )
+        da[a[i - 1]] = i
+
+    return matrix_get(len(a), len(b))
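
The matrix_set/matrix_get closures are what replaced the Array2D helper mentioned in the commit message: they address a flat list as a (len(a) + 2) by (len(b) + 2) matrix whose logical coordinates start at -1, shifting both axes by one when computing the flat index. A short sketch of the indexing and of the function's behavior on two of the cases covered by the new test, assuming the function is imported from snooty.util:

    from snooty.util import damerau_levenshtein_distance

    # With width = len(a) + 2, logical cell (x, y) for x, y >= -1 lives at
    # d[width * (y + 1) + (x + 1)], so (-1, -1) maps to flat index 0.
    assert damerau_levenshtein_distance("foo", "ofo") == 1   # one adjacent transposition
    assert damerau_levenshtein_distance("foo", "xoao") == 2  # substitution plus insertion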
