
Commit 59ae761

Spelling suggestions for linking targets (#416)
* Suggest spelling corrections in target lookups
* Optimizations
* Finish did_you_mean
* Fix unintentionally creating new entries in the TargetDatabase
* Avoid analyzing domain and type
* Remove Array2D and just use closures
1 parent 0a2359c commit 59ae761


7 files changed: +174 -9 lines changed


snooty/diagnostics.py

Lines changed: 6 additions & 1 deletion
@@ -288,19 +288,24 @@ class UnknownSubstitution(Diagnostic):
     severity = Diagnostic.Level.warning
 
 
-class TargetNotFound(Diagnostic):
+class TargetNotFound(Diagnostic, MakeCorrectionMixin):
     severity = Diagnostic.Level.error
 
     def __init__(
         self,
         name: str,
         target: str,
+        candidates: Sequence[str],
         start: Union[int, Tuple[int, int]],
         end: Union[None, int, Tuple[int, int]] = None,
     ) -> None:
         super().__init__(f'Target not found: "{name}:{target}"', start, end)
         self.name = name
         self.target = target
+        self.candidates = list(candidates)
+
+    def did_you_mean(self) -> List[str]:
+        return self.candidates
 
 
 class AmbiguousTarget(Diagnostic):
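
The candidates list added here is what feeds did_you_mean(). A minimal sketch of how the updated diagnostic might be constructed and queried; the name, target, and candidate values below are hypothetical, and this assumes MakeCorrectionMixin only requires a did_you_mean method:

    from snooty.diagnostics import TargetNotFound

    # Hypothetical values: a misspelled :ref: target plus one suggested correction.
    diag = TargetNotFound(
        "ref",
        "a-labal-on-index",
        ["std:label:a-label-on-index"],
        (7, 0),
    )
    assert diag.did_you_mean() == ["std:label:a-label-on-index"]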

snooty/main.py

Lines changed: 2 additions & 1 deletion
@@ -108,7 +108,8 @@ def on_diagnostics(self, path: FileId, diagnostics: List[Diagnostic]) -> None:
 
             if isinstance(diagnostic, MakeCorrectionMixin):
                 did_you_mean = diagnostic.did_you_mean()
-                info["did_you_mean"] = did_you_mean
+                if did_you_mean:
+                    info["did_you_mean"] = did_you_mean
 
             if output == "JSON":
                 document: Dict[str, object] = {"diagnostic": info}

snooty/postprocess.py

Lines changed: 5 additions & 2 deletions
@@ -402,7 +402,7 @@ def enter_node(self, fileid_stack: FileIdStack, node: n.Node) -> None:
         if refuri is None:
             line = node.span[0]
             self.context.diagnostics[fileid_stack.current].append(
-                TargetNotFound("extlink", node.refname, line)
+                TargetNotFound("extlink", node.refname, [], line)
             )
             return
 
@@ -1279,8 +1279,11 @@ def enter_node(self, fileid_stack: FileIdStack, node: n.Node) -> None:
             if injection_candidate is not None:
                 injection_candidate.children = [text_node]
 
+            # See if there are any near matches
+            suggestions = self.targets.get_suggestions(key)
+
             self.context.diagnostics[fileid_stack.current].append(
-                TargetNotFound(node.name, node.target, line)
+                TargetNotFound(node.name, node.target, suggestions, line)
             )
             return
 
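For context, the key passed to get_suggestions is the role-qualified lookup string (domain:role:target), and the suggestions come back as full keys that the diagnostic carries verbatim. A rough sketch of that flow, with hypothetical values and with targets standing in for the post-processor's TargetDatabase:

    # Hypothetical values illustrating the flow above.
    key = "std:label:a-labal-on-index"          # normalized lookup key that failed to resolve
    suggestions = targets.get_suggestions(key)  # e.g. ["std:label:a-label-on-index"]
    diagnostic = TargetNotFound("ref", "a-labal-on-index", suggestions, 12)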
snooty/target_database.py

Lines changed: 59 additions & 5 deletions
@@ -1,16 +1,28 @@
 import copy
 import enum
+import itertools
 import logging
+import re
 import threading
 import urllib
 from collections import defaultdict
 from dataclasses import dataclass, field
-from typing import DefaultDict, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
+from typing import (
+    DefaultDict,
+    Dict,
+    Iterable,
+    List,
+    NamedTuple,
+    Optional,
+    Sequence,
+    Tuple,
+    Union,
+)
 
 import requests.exceptions
 from typing_extensions import Protocol
 
-from . import intersphinx, n, specparser
+from . import intersphinx, n, specparser, util
 from .cache import Cache
 from .n import FileId
 from .types import ProjectConfig, normalize_target
@@ -21,6 +33,8 @@
 #: current project, or a URL (from an intersphinx inventory).
 TargetType = enum.Enum("TargetType", ("fileid", "url"))
 
+PAT_TARGET_PART_SEPARATOR = re.compile(r"[_-]+")
+
 
 @dataclass
 class TargetDatabase:
@@ -68,9 +82,9 @@ def __getitem__(self, key: str) -> Sequence["TargetDatabase.Result"]:
                     canonical_target_name,
                     title,
                 )
-                for canonical_target_name, fileid, title, html5_id in self.local_definitions[
-                    key
-                ]
+                for canonical_target_name, fileid, title, html5_id in self.local_definitions.get(
+                    key, []
+                )
             )
         except KeyError:
             pass
@@ -107,6 +121,46 @@ def __getitem__(self, key: str) -> Sequence["TargetDatabase.Result"]:
 
         return results
 
+    def get_suggestions(self, key: str) -> Sequence[str]:
+        key = normalize_target(key)
+        key = key.split(":", 2)[2]
+        candidates: List[str] = []
+
+        with self.lock:
+            intersphinx_keys: Iterable[str] = itertools.chain.from_iterable(
+                (str(s) for s in inventory.targets.keys())
+                for inventory in self.intersphinx_inventories.values()
+            )
+            all_keys: Iterable[str] = itertools.chain(
+                self.local_definitions.keys(), intersphinx_keys
+            )
+
+            key_parts = PAT_TARGET_PART_SEPARATOR.split(key)
+
+            for original_key_definition in all_keys:
+                key_definition = original_key_definition.split(":", 2)[2]
+                if abs(len(key) - len(key_definition)) > 2:
+                    continue
+
+                # Tokens tend to be separated by - and _: if there's a different number of
+                # separators, don't attempt a typo correction
+                key_definition_parts = PAT_TARGET_PART_SEPARATOR.split(key_definition)
+                if len(key_definition_parts) != len(key_parts):
+                    continue
+
+                # Evaluate each part separately, since we can abort before evaluating the rest.
+                # Small bonus: complexity is O(N*M)
+                if all(
+                    dist <= 2
+                    for dist in (
+                        util.damerau_levenshtein_distance(p1, p2)
+                        for p1, p2 in zip(key_parts, key_definition_parts)
+                    )
+                ):
+                    candidates.append(original_key_definition)
+
+        return candidates
+
     def define_local_target(
         self,
         domain: str,
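
get_suggestions deliberately avoids comparing whole strings: it strips the domain:role: prefix, splits both the query and each known target on - and _, and only suggests a candidate when the part counts match, the overall lengths differ by at most 2, and every corresponding part is within Damerau-Levenshtein distance 2. A worked sketch of those rules on hypothetical inputs, assuming PAT_TARGET_PART_SEPARATOR is importable from this module and util from the package:

    from snooty.target_database import PAT_TARGET_PART_SEPARATOR
    from snooty import util

    # Hypothetical query and candidate, already stripped of their "std:label:" prefix.
    key = "a-labal-on-index"
    candidate = "a-label-on-index"

    key_parts = PAT_TARGET_PART_SEPARATOR.split(key)          # ["a", "labal", "on", "index"]
    cand_parts = PAT_TARGET_PART_SEPARATOR.split(candidate)   # ["a", "label", "on", "index"]

    # Same separator structure, lengths within 2, and "labal" -> "label" is a single
    # substitution (distance 1), so the candidate would be suggested.
    assert abs(len(key) - len(candidate)) <= 2
    assert len(key_parts) == len(cand_parts)
    assert all(
        util.damerau_levenshtein_distance(p1, p2) <= 2
        for p1, p2 in zip(key_parts, cand_parts)
    )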

snooty/test_intersphinx.py

Lines changed: 37 additions & 0 deletions
@@ -224,3 +224,40 @@ def test_fetch_failure() -> None:
     assert [type(diag) for diag in result.diagnostics[FileId("snooty.toml")]] == [
         FetchError
     ]
+
+
+def test_suggestions() -> None:
+    inventory_bytes = Path("test_data/test_intersphinx/manual.inv").read_bytes()
+    inventory = Inventory.parse(INVENTORY_URL, inventory_bytes)
+    db = TargetDatabase(intersphinx_inventories={"manual": inventory})
+
+    db.define_local_target(
+        "std",
+        "option",
+        ["--maxVarcharLength", "mongosqld.--maxVarcharLength"],
+        FileId("reference/mongosqld.txt"),
+        [n.Text(span=(7,), value="mongosqld --maxVarcharLength")],
+        "std-option-mongosqld.--maxVarcharLength",
+    )
+
+    db.define_local_target(
+        "std",
+        "label",
+        ["a-label-on-index"],
+        FileId("index.txt"),
+        [n.Text(span=(7,), value="A Label on Index")],
+        "std-label-a-label-on-index",
+    )
+
+    db.define_local_target(
+        "std",
+        "label",
+        ["a-label-on-index-2"],
+        FileId("reference/index.txt"),
+        [n.Text(span=(7,), value="A Label on Index 2")],
+        "std-label-a-label-on-index-2",
+    )
+
+    assert db.get_suggestions("std:label:a-labal-on-index") == [
+        "std:label:a-label-on-index"
+    ]

snooty/test_util.py

Lines changed: 9 additions & 0 deletions
@@ -183,3 +183,12 @@ def start(event: threading.Event, arg: object) -> None:
     worker = util.WorkerLauncher("worker-test", start)
     with pytest.raises(SomeException):
         worker.run_and_wait(None)
+
+
+def test_damerau_levenshtein_distance() -> None:
+    assert util.damerau_levenshtein_distance("foo", "foo") == 0
+    assert util.damerau_levenshtein_distance("foo", "f1o") == 1
+    assert util.damerau_levenshtein_distance("foo", "fo") == 1
+    assert util.damerau_levenshtein_distance("foo", "fooa") == 1
+    assert util.damerau_levenshtein_distance("foo", "ofo") == 1
+    assert util.damerau_levenshtein_distance("foo", "xoao") == 2

snooty/util.py

Lines changed: 56 additions & 0 deletions
@@ -581,3 +581,59 @@ def get(self, block: bool = True, timeout: Optional[float] = None) -> Tuple[_K,
             del self._data[result[0]]
 
         return result
+
+
+def damerau_levenshtein_distance(a: str, b: str) -> int:
+    """Derived from Wikipedia, the best possible source for an algorithm:
+    https://en.wikipedia.org/w/index.php?title=Damerau%E2%80%93Levenshtein_distance&oldid=1050388400#Distance_with_adjacent_transpositions"""
+    # Strings are 1-indexed, and d is -1-indexed.
+
+    da = {ch: 0 for ch in set(a).union(b)}
+
+    width = len(a) + 2
+    height = len(b) + 2
+    d = [0] * width * height
+
+    def matrix_set(x: int, y: int, value: int) -> None:
+        d[(width * (y + 1)) + (x + 1)] = value
+
+    def matrix_get(x: int, y: int) -> int:
+        return d[(width * (y + 1)) + (x + 1)]
+
+    maxdist = len(a) + len(b)
+    matrix_set(-1, -1, maxdist)
+
+    for i in range(0, len(a) + 1):
+        matrix_set(i, -1, maxdist)
+        matrix_set(i, 0, i)
+
+    for j in range(0, len(b) + 1):
+        matrix_set(-1, j, maxdist)
+        matrix_set(0, j, j)
+
+    for i in range(1, len(a) + 1):
+        db = 0
+        for j in range(1, len(b) + 1):
+            k = da[b[j - 1]]
+            l = db
+            if a[i - 1] == b[j - 1]:
+                cost = 0
+                db = j
+            else:
+                cost = 1
+            matrix_set(
+                i,
+                j,
+                min(
+                    matrix_get(i - 1, j - 1) + cost,  # substitution
+                    matrix_get(i, j - 1) + 1,  # insertion
+                    matrix_get(i - 1, j) + 1,  # deletion
+                    matrix_get(k - 1, l - 1)
+                    + (i - k - 1)
+                    + 1
+                    + (j - l - 1),  # transposition
+                ),
+            )
+        da[a[i - 1]] = i
+
+    return matrix_get(len(a), len(b))
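
The matrix_set/matrix_get closures are what replaced the Array2D helper mentioned in the commit message: they address a flat list as a (len(a) + 2) by (len(b) + 2) matrix whose logical coordinates start at -1, shifting both axes by one when computing the flat index. A short sketch of the indexing and of the function's behavior on two of the cases covered by the new test, assuming the function is imported from snooty.util:

    from snooty.util import damerau_levenshtein_distance

    # With width = len(a) + 2, logical cell (x, y) for x, y >= -1 lives at
    # d[width * (y + 1) + (x + 1)], so (-1, -1) maps to flat index 0.
    assert damerau_levenshtein_distance("foo", "ofo") == 1   # one adjacent transposition
    assert damerau_levenshtein_distance("foo", "xoao") == 2  # substitution plus insertion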
