Skip to content

Commit 1bf3222

Browse files
committed
feat: introduce tradedangerous.db.search
This provides backend orm-adjacent search methods for querying either a table or a table + its grouping for fast exact/prefix matches or for employing fuzzy logic. TradeORM can then leverage these to implement its alternatives to things like lookupStation, lookupItem, etc. At the same time, things like importers, journal code, etc, will have a facade for such lookups without the overhead of fuzzy search. chore: local notebooks are local
1 parent 4f32d45 commit 1bf3222

File tree

3 files changed

+444
-0
lines changed

3 files changed

+444
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,5 @@ tools/*
4949
repo_files.txt
5050
tradedangerous/db/config.ini
5151
tmp/
52+
# Allow notebook experiments in the root to be local.
53+
*.ipynb

tradedangerous/db/search.py

Lines changed: 327 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,327 @@
1+
"""
2+
search provides general methods for searching orm tables by name. These
3+
are primarily provided as back-ends for TradeORM "lookup" methods.
4+
5+
6+
# Previously:
7+
8+
The TradeDB lookup methods do some complex, crazy, fuzzy matching taking
9+
word boundaries and punctuation etc into account.
10+
11+
We have no metrics on how widely used those are, but they would let
12+
you search for SOL/Abraham Lincoln station with the input "hamlinc".
13+
14+
To do that we loaded all the instances so we could access their names.
15+
16+
TradeDB lookup methods are slow.
17+
18+
19+
# New Approach:
20+
21+
Assume and reward users providing copy/pasted names (or passed via
22+
variables, etc), by hot-pathing exact matches via the database with
23+
no pre-load required.
24+
25+
Follow with near-match based on prefix, before falling back to highly
26+
expensive fuzzy matching.
27+
28+
Fuzzy match is still slow because it has to bring every name into
29+
Python to do the comparison, but it can still be faster than the
30+
TradeDB approach because it uses like patterns to reduce the sheer
31+
number of names retrieved.
32+
33+
That is:
34+
35+
hamlinc -> %h%a%m%l%i%n%c%
36+
37+
While this is an expensive query, it is less expensive than having
38+
SqlAlchemy surface every station name for python to perform a similar
39+
filter.
40+
41+
With the Python filter applied to significantly less rows, there is
42+
still a chance of performance gain.
43+
"""
44+
45+
from __future__ import annotations
46+
47+
from typing import NamedTuple
48+
import re
49+
import typing
50+
51+
from sqlalchemy import Result, select, func, and_
52+
53+
from tradedangerous.tradeexcept import AmbiguityError
54+
55+
from .engine import Session
56+
from .orm_models import Category, Item, Station, System
57+
58+
59+
if typing.TYPE_CHECKING:
60+
from typing import Any
61+
62+
# 'T' is used for the find methods as a generic "this table" parameter,
63+
# 'G' is used for the find methods as a generic "group/parent table" parameter,
64+
T = Item | Station | System
65+
G = Category | System
66+
67+
68+
class Needle(NamedTuple):
    """A search term decomposed for matching.

    Couples the term collapsed to its bare characters (whitespace and
    most punctuation stripped) with a wildcard pattern built from those
    characters, plus flags recording how the user anchored the term.
    """
    
    normalized: str       # collapsed, punctuation-stripped form of the term
    pattern: str          # pattern matching this sequence of individual characters
    left_anchored: bool   # term carried the anchor prefix (e.g. '@')
    right_anchored: bool  # term carried the anchor suffix (e.g. '/')
77+
78+
79+
# Punctuation (non-whitespace) characters discarded during normalization;
# '-' is placed last so the string can sit inside a regex character class.
NORMALIZE_PUNCT = """/.,;:'"?!+-"""

# Compiled regex that collapses consecutive whitespace/punctuation, reducing
# a term to its alphanumeric-ish characters. Not ALL punctuation is removed.
PUNCT_REGEX = re.compile(r"[\s" + NORMALIZE_PUNCT + "]+")

# Bound 'sub' method of a compiled regex that escapes backslash, percent,
# and underscore, making a term safe for use inside an SQL LIKE pattern.
LIKE_ESCAPING = re.compile(r"([\\%_])").sub


def escaped_for_like(term: str) -> str:
    """Return *term* with the SQL 'LIKE' special characters backslash-escaped."""
    return LIKE_ESCAPING(r"\\\1", term)


# python 3.12:
# def name_startswith[T](table: T, pattern: str) -> Any:
def name_startswith(table: T, pattern: str) -> Any:  # todo: correct type
    """Build a LIKE criterion matching names that begin with the literal *pattern*."""
    prefix = escaped_for_like(pattern)
    return table.name.like(prefix + "%", escape="\\")
100+
101+
102+
def needle_from_term(key: str, *, key_prefix: str = "@", key_suffix: str = "/", pattern_prefix: str = "", pattern_suffix: str = "", pattern_wildcard: str = "%") -> Needle:
    """needle_from_term produces a Needle instance for the given term, containing
    the normalized form and the search pattern based on the pattern's wildcard.

    key_prefix and key_suffix specify sequences that can be used to anchor the
    key to the beginning or end of the search.

    pattern_prefix and pattern_suffix specify sequences that should be injected
    into the search pattern to anchor it, if required. e.g. '^' and '$' if you
    want a regex.

    pattern_wildcard is the sequence the matcher needs to express "match 0+ something".

    :raises ValueError: if the de-anchored term is empty or a single character.
    """
    term, left_anchor, right_anchor = key, False, False
    
    # e.g. "@SOL" (if key_prefix is '@', the default)
    # Fix: strip len(key_prefix) characters — previously hard-coded term[1:],
    # which silently corrupted the term for multi-character prefixes.
    if key_prefix and term.startswith(key_prefix):
        left_anchor, term = True, term[len(key_prefix):]
    
    # e.g "SOL/" (if key_suffix is '/', the default); but only if that's the
    # sole occurrence — "bre//" does not get this treatment.
    # Fix: compare against len(key_suffix) rather than assuming a 1-char suffix.
    if key_suffix and term.find(key_suffix) == len(term) - len(key_suffix):
        right_anchor, term = True, term[:-len(key_suffix)]
    
    if len(term) == 0:
        raise ValueError("empty search string")
    # this may be an overly aggressive constraint, but I'm trying to avoid wasting
    # the user's time with "'a' could match ..." or "' ' could match ...".
    if len(term) == 1:
        raise ValueError(f"overly ambiguous search term: {key}")
    
    # split into characters which we then paper-doll with wildcards,
    # so that 'g m t a' can match 'gotta match them all' via '%g%m%t%a%'
    components = PUNCT_REGEX.split(term.lower())
    normalized = "".join(components)
    # @SOL is left-anchored, so it doesn't start with a wildcard.
    prefix = pattern_prefix if left_anchor else pattern_wildcard
    # SOL/ is right-anchored so it doesn't end with a wildcard.
    suffix = pattern_suffix if right_anchor else pattern_wildcard
    
    # joining over the string interleaves the wildcard between every character.
    pattern = f"{prefix}{pattern_wildcard.join(normalized)}{suffix}"
    
    return Needle(normalized, pattern, left_anchor, right_anchor)
145+
146+
147+
# python 3.12:
148+
# def fuzzy_like[T, G](tdo: TradeORM, kind: str, term: str, table: T, parent: G | None = None) -> T:
149+
def fuzzy_like(session: Session, kind: str, term: str, table: T, group_table: G | None = None) -> T | None:
150+
"""fuzzy_like attempts to find the a best match given a search key by finding
151+
all potential matches and then testing them for similarity/ambiguity."""
152+
needle = needle_from_term(term)
153+
154+
# when there's no category, it's a one-table query
155+
# if there's a parent we need to join it and concat the names (parent/table)
156+
if not group_table:
157+
query_field = table.name.label("match_name")
158+
query_from = select(table, query_field)
159+
else:
160+
query_field = func.concat(group_table.name, "/", table.name).label("match_name")
161+
query_from = select(table, query_field).join(group_table)
162+
163+
stmt = query_from.where(query_field.like(needle.pattern))
164+
rows: Result[Any] = session.execute(stmt)
165+
166+
# 1: exact, 2: prefix-partial, 3: contains (as-is), 4: fuzzy
167+
matches: dict[int, list[T]] = {1: [], 2: [], 3: [], 4: []}
168+
169+
for row in rows:
170+
match_name = row[1].lower()
171+
match_norm = PUNCT_REGEX.sub("", match_name) # normalized
172+
if needle.normalized in match_norm:
173+
if needle.normalized == match_norm:
174+
score = 1 # exact match
175+
elif needle.normalized.startswith(match_norm):
176+
score = 2 # prefix partial
177+
else:
178+
score = 3 # contains as-is
179+
else:
180+
# fuzziest match (all characters occur in that order but not contiguous)
181+
# 'solr' <-willmatch- 's ol 4.1 ar'
182+
score = 4
183+
matches[score].append(row[0])
184+
185+
# The match is considered unambiguous if we can find a bucket with a single
186+
# match before we find a bucket with > 1 match.
187+
for bucket in (1, 2, 3, 4):
188+
matched = matches[bucket]
189+
match len(matched):
190+
case 0:
191+
continue # nothing in this bucket
192+
case 1:
193+
return matched[0] # single item
194+
case _:
195+
raise AmbiguityError(kind, term, matched, key=lambda i: i.dbname())
196+
197+
return None
198+
199+
200+
# python 3.12:
# def fast_find[T](tdo: TradeORM, kind: str, key: str, table: T) -> T | None:
def fast_find(session: Session, kind: str, key: str, table: T) -> T | None:
    """search a single table for a term and find a unique match based on either
    exact-match or unique prefix match.

    :raises ValueError: when key is too short to be worth searching.
    :raises AmbiguityError: when either stage matches multiple rows.
    """
    if len(key) < 2:
        # Fix: message was missing its f-prefix and rendered the braces literally.
        raise ValueError(f"overly ambiguous {kind} key: {key}")
    
    # exact matching
    stmt = select(table).where(table.name == key)
    rows = session.execute(stmt.limit(10)).all()
    if len(rows) == 1:
        return rows[0][0]
    if len(rows) > 1:
        # Fix: rows are Row tuples — take the entity and *call* dbname(), as
        # fast_find_sub does; previously this passed the unbound attribute
        # (and the two call sites here disagreed with each other).
        raise AmbiguityError(kind, key, rows, key=lambda r: r[0].dbname())
    
    # prefix matching
    stmt = select(table).where(name_startswith(table, key))
    rows = session.execute(stmt.limit(10)).all()
    if len(rows) == 1:
        return rows[0][0]
    if len(rows) > 1:
        raise AmbiguityError(kind, key, rows, key=lambda r: r[0].dbname())
    return None
224+
225+
226+
# python 3.12:
# def fast_find_sub[T, G](tdo: TradeORM, key: str, table: T, group_table: G) -> T | G | None:
def fast_find_sub(session: Session, kind: str, key: str, table: T, group_table: G) -> T | G | None:
    """Search a table and its group/parent table for *key*, honoring the '@'
    anchor and '/' group-separator conventions listed in the combos below.

    Candidate statements are tried cheapest-first: exact matches on either
    table, then LIKE-based prefix matches. The first statement returning
    exactly one row wins; more than one row raises AmbiguityError; None is
    returned when no candidate matches.
    """
    if len(key) < 2:
        # Fix: message was missing its f-prefix and rendered "{key}" literally.
        raise ValueError(f"overly ambiguous search key: {key}")
    
    # Start with a list of most-likely simple, exact (fast) matches that most paths
    # will most likely want.
    exact_candidates = [
        select(table).where(table.name == key),
        select(group_table).where(group_table.name == key),
    ]
    # Those will be followed by slower, like-based matches - this relies
    # on our tables having no-case collation to avoid manipulating text case.
    like_candidates = [
        select(table).where(name_startswith(table, key)),
        select(group_table).where(name_startswith(group_table, key)),
    ]
    
    prefix_anchored = key.startswith("@")
    slash = key.find("/")
    
    # Possible combinations:
    #  @/abc   explicit non-expression of parent, anchored child; child = abc%
    #  @abc/   explicit expression of exact parent (trailing slash) parent = abc
    #  @abc    explicit expression of only the parent (no slash). parent = abc%
    #  @ab/cd  explicit parent + child. parent = ab, child = cd
    #  abc/    explicit parent; parent = abc%
    #  /abc    explicit child; child = %abc%
    #  ab/cd   child + parent
    
    if prefix_anchored:  # "@parent", "@par/", "@/child", ...
        if slash == 1:  # "@/..." -> left-anchored child
            if len(key) == 2:  # reject "@/"
                raise ValueError(f"overly ambiguous term (no actual characters): {key}")
            # search for just the child part
            exact_candidates.insert(0, select(table).where(table.name == key[2:]))
            like_candidates.insert(0, select(table).where(name_startswith(table, key[2:])))
        
        elif slash == -1:  # @abc... no slash, just the parent
            exact_candidates.insert(0, select(group_table).where(group_table.name == key[1:]))
            like_candidates.insert(0, select(group_table).where(name_startswith(group_table, key[1:])))
        else:  # @sol/[something...] <- combo
            parent_part, child_part = key[1:slash], key[slash + 1:]
            if not child_part:  # @sol/
                exact_candidates = [select(group_table).where(group_table.name == parent_part)]
                like_candidates = []
            else:
                exact_candidates.append(select(table).join(group_table).where(and_(group_table.name == parent_part, table.name == child_part)))
                like_candidates.insert(0, select(table).join(group_table).where(and_(name_startswith(group_table, parent_part), name_startswith(table, child_part))))
                like_candidates.insert(0, select(table).join(group_table).where(and_(group_table.name == parent_part, name_startswith(table, child_part))))
    
    # otherwise if the parent isn't anchored, it can be a wildcard.
    elif slash == len(key) - 1:  # "foo/"
        parent_part = key[:-1]
        # This tells us there's a single slash and it's at the end of the name,
        # in which case we're going to treat it *first* as a group name.
        # We still fall back to trying it directly but only as a last resort;
        # there *are* player-named places with a '/' at the end ("Here No\/\/")
        # but let that query pay for itself, not the other way around.
        exact_candidates = [
            select(group_table).where(group_table.name == key),
            select(group_table).where(group_table.name == parent_part),
            select(table).where(table.name == key),
        ]
        like_candidates = [
            select(group_table).where(name_startswith(group_table, key)),
            select(group_table).where(name_startswith(group_table, parent_part)),
            select(table).where(name_startswith(table, parent_part)),
        ]
    
    elif slash > 0:  # "par/table", and we know it's not trailing /
        parent_part, _, child_part = key.partition("/")
        
        exact_candidates = [
            select(table).where(table.name == key),
            # don't try matching the group because no groups can currently contain '/'.
            select(table).join(group_table).where(and_(group_table.name == parent_part, table.name == child_part)),
        ]
        like_candidates = [
            select(table).where(name_startswith(table, key)),
            select(table).join(group_table).where(group_table.name == parent_part, name_startswith(table, child_part)),
            select(table).join(group_table).where(name_startswith(group_table, parent_part), table.name == child_part),
            select(table).join(group_table).where(name_startswith(group_table, parent_part), name_startswith(table, child_part)),
        ]
    
    elif slash == 0:  # "/foo" explicitly names only a child
        # Fix: this branch was 'slash <= 0', which for keys with no slash at
        # all re-inserted statements identical to the defaults above, running
        # the same queries twice for no behavioral difference.
        child_part = key[1:]
        exact_candidates.insert(0, select(table).where(table.name == child_part))
        like_candidates.insert(0, select(table).where(name_startswith(table, child_part)))
    
    # If there's no slash, then our initial patterns will suffice.
    
    for stmt in exact_candidates + like_candidates:
        # Fix: removed a leftover debug print() that compiled every statement
        # with literal binds on each call.
        rows = session.execute(stmt.limit(10)).all()
        if len(rows) == 1:
            return rows[0][0]
        if len(rows) > 1:  # ambiguity/conflict
            raise AmbiguityError(kind, key, rows, key=lambda r: r[0].dbname())
    
    return None

0 commit comments

Comments
 (0)