|
"""
search provides general methods for searching orm tables by name. These
are primarily provided as back-ends for TradeORM "lookup" methods.


# Previously:

The TradeDB lookup methods do some complex, crazy, fuzzy matching taking
word boundaries and punctuation etc into account.

We have no metrics on how widely used those are, but they would let
you search for SOL/Abraham Lincoln station with the input "hamlinc".

To do that we loaded all the instances so we could access their names.

TradeDB lookup methods are slow.


# New Approach:

Assume and reward users providing copy/pasted names (or passed via
variables, etc), by hot-pathing exact matches via the database with
no pre-load required.

Follow with near-match based on prefix, before falling back to highly
expensive fuzzy matching.

Fuzzy match is still slow because it has to bring every name into
Python to do the comparison, but it can still be faster than the
TradeDB approach because it uses like patterns to reduce the sheer
number of names retrieved.

That is:

    hamlinc -> %h%a%m%l%i%n%c%

While this is an expensive query, it is less expensive than having
SqlAlchemy surface every station name for python to perform a similar
filter.

With the Python filter applied to significantly fewer rows, there is
still a chance of performance gain.
"""
| 44 | + |
| 45 | +from __future__ import annotations |
| 46 | + |
| 47 | +from typing import NamedTuple |
| 48 | +import re |
| 49 | +import typing |
| 50 | + |
| 51 | +from sqlalchemy import Result, select, func, and_ |
| 52 | + |
| 53 | +from tradedangerous.tradeexcept import AmbiguityError |
| 54 | + |
| 55 | +from .engine import Session |
| 56 | +from .orm_models import Category, Item, Station, System |
| 57 | + |
| 58 | + |
if typing.TYPE_CHECKING:
    from typing import Any

# 'T' is used for the find methods as a generic "this table" parameter,
# 'G' is used for the find methods as a generic "group/parent table" parameter.
T = Item | Station | System
G = Category | System
| 66 | + |
| 67 | + |
class Needle(NamedTuple):
    """Needle stores a search term decomposed into a normalized
    form (with spaces and punctuation removed) and a search
    pattern form (regex/like/etc)."""
    
    normalized: str       # the collapsed, lower-cased representation
    pattern: str          # a pattern to search for this sequence of individual characters
    left_anchored: bool   # term began with the key prefix (e.g. '@'): no leading wildcard
    right_anchored: bool  # term ended with the key suffix (e.g. '/'): no trailing wildcard
| 77 | + |
| 78 | + |
# Punctuation (non-whitespace) characters we will discard in normalization.
NORMALIZE_PUNCT = """/.,;:'"?!+-"""  # '-' must come last to be usable in a regex character class

# PUNCT_REGEX is a compiled regex that strips consecutive space/punctuation
# from a term to collapse it to alphanumeric-etc characters. Not ALL punctuation is removed.
PUNCT_REGEX = re.compile(r"[\s" + NORMALIZE_PUNCT + "]+")

# The 'sub' method of a compiled regex that escapes backslash, percent, and underscore
# to make a pattern safe for use in a LIKE query.
LIKE_ESCAPING = re.compile(r"([\\%_])").sub
| 89 | + |
| 90 | + |
def escaped_for_like(term: str) -> str:
    """escaped_for_like returns term with any SQL 'LIKE' characters escaped."""
    # Prefix each backslash, percent, or underscore with a literal backslash.
    escaped = LIKE_ESCAPING(r"\\\1", term)
    return escaped
| 94 | + |
| 95 | + |
# python 3.12:
# def name_startswith[T](table: T, pattern: str) -> Any:
def name_startswith(table: T, pattern: str) -> Any:  # todo: correct type
    """Build a LIKE criterion matching rows of table whose name begins with pattern."""
    escaped = escaped_for_like(pattern)
    return table.name.like(escaped + "%", escape="\\")
| 100 | + |
| 101 | + |
def needle_from_term(key: str, *, key_prefix: str = "@", key_suffix: str = "/", pattern_prefix: str = "", pattern_suffix: str = "", pattern_wildcard: str = "%") -> Needle:
    """needle_from_term will produce a Needle instance for the given term, containing
    the normalized form and the search pattern based on the pattern's wildcard.
    
    key_prefix and key_suffix specify sequences that can be used to anchor the
    key to the beginning or end of the search.
    
    pattern_prefix and pattern_suffix specify sequences that should be injected
    into the search pattern to anchor it, if required. e.g. '^' and '$' if you want a regex.
    
    pattern_wildcard is the sequence the matcher needs to express "match 0+ something".
    
    :raises ValueError: if the de-anchored term is empty or a single character.
    """
    term, left_anchor, right_anchor = key, False, False
    
    # e.g. "@SOL" (if key_prefix is '@', the default)
    # BUGFIX: previously sliced a fixed 1 character, which broke any
    # multi-character key_prefix; now strips exactly the prefix itself.
    if key_prefix and term.startswith(key_prefix):
        left_anchor, term = True, term[len(key_prefix):]
    
    # e.g "SOL/" (if key_suffix is '/', the default); but only if that's the sole
    # occurrence: "bre\/\/" does not get this treatment.
    # BUGFIX: same generalization — compare against len(key_suffix), not 1.
    if key_suffix and term.find(key_suffix) == len(term) - len(key_suffix):
        right_anchor, term = True, term[:-len(key_suffix)]
    
    if len(term) == 0:
        raise ValueError("empty search string")
    # this may be an overly aggressive constraint, but I'm trying to avoid wasting
    # the user's time with "'a' could match ..." or "' ' could match ...".
    if len(term) == 1:
        raise ValueError(f"overly ambiguous search term: {key}")
    
    # split into characters which we then paper-doll with wildcards,
    # so that 'g m t a' can match 'gotta match them all' via '%g%m%t%a%'
    components = PUNCT_REGEX.split(term.lower())
    normalized = "".join(components)
    characters = list(normalized)
    # @SOL is left-anchored, so it doesn't start with a wildcard.
    prefix = pattern_prefix if left_anchor else pattern_wildcard
    # SOL/ is right-anchored so it doesn't end with a wildcard.
    suffix = pattern_suffix if right_anchor else pattern_wildcard
    
    pattern = f"{prefix}{pattern_wildcard.join(characters)}{suffix}"
    
    return Needle(normalized, pattern, left_anchor, right_anchor)
| 145 | + |
| 146 | + |
| 147 | +# python 3.12: |
| 148 | +# def fuzzy_like[T, G](tdo: TradeORM, kind: str, term: str, table: T, parent: G | None = None) -> T: |
| 149 | +def fuzzy_like(session: Session, kind: str, term: str, table: T, group_table: G | None = None) -> T | None: |
| 150 | + """fuzzy_like attempts to find the a best match given a search key by finding |
| 151 | + all potential matches and then testing them for similarity/ambiguity.""" |
| 152 | + needle = needle_from_term(term) |
| 153 | + |
| 154 | + # when there's no category, it's a one-table query |
| 155 | + # if there's a parent we need to join it and concat the names (parent/table) |
| 156 | + if not group_table: |
| 157 | + query_field = table.name.label("match_name") |
| 158 | + query_from = select(table, query_field) |
| 159 | + else: |
| 160 | + query_field = func.concat(group_table.name, "/", table.name).label("match_name") |
| 161 | + query_from = select(table, query_field).join(group_table) |
| 162 | + |
| 163 | + stmt = query_from.where(query_field.like(needle.pattern)) |
| 164 | + rows: Result[Any] = session.execute(stmt) |
| 165 | + |
| 166 | + # 1: exact, 2: prefix-partial, 3: contains (as-is), 4: fuzzy |
| 167 | + matches: dict[int, list[T]] = {1: [], 2: [], 3: [], 4: []} |
| 168 | + |
| 169 | + for row in rows: |
| 170 | + match_name = row[1].lower() |
| 171 | + match_norm = PUNCT_REGEX.sub("", match_name) # normalized |
| 172 | + if needle.normalized in match_norm: |
| 173 | + if needle.normalized == match_norm: |
| 174 | + score = 1 # exact match |
| 175 | + elif needle.normalized.startswith(match_norm): |
| 176 | + score = 2 # prefix partial |
| 177 | + else: |
| 178 | + score = 3 # contains as-is |
| 179 | + else: |
| 180 | + # fuzziest match (all characters occur in that order but not contiguous) |
| 181 | + # 'solr' <-willmatch- 's ol 4.1 ar' |
| 182 | + score = 4 |
| 183 | + matches[score].append(row[0]) |
| 184 | + |
| 185 | + # The match is considered unambiguous if we can find a bucket with a single |
| 186 | + # match before we find a bucket with > 1 match. |
| 187 | + for bucket in (1, 2, 3, 4): |
| 188 | + matched = matches[bucket] |
| 189 | + match len(matched): |
| 190 | + case 0: |
| 191 | + continue # nothing in this bucket |
| 192 | + case 1: |
| 193 | + return matched[0] # single item |
| 194 | + case _: |
| 195 | + raise AmbiguityError(kind, term, matched, key=lambda i: i.dbname()) |
| 196 | + |
| 197 | + return None |
| 198 | + |
| 199 | + |
# python 3.12:
# def fast_find[T](tdo: TradeORM, kind: str, key: str, table: T) -> T | None:
def fast_find(session: Session, kind: str, key: str, table: T) -> T | None:
    """search a single table for a term and find a unique match based on either
    exact-match or unique prefix match.
    
    Returns the matched entity, or None when nothing matched.
    
    :raises ValueError: when the key is too short to search sensibly.
    :raises AmbiguityError: when several rows match equally well.
    """
    if len(key) < 2:
        # BUGFIX: message was a plain string missing the f-prefix.
        raise ValueError(f"overly ambiguous {kind} key: {key}")
    
    # exact matching
    stmt = select(table).where(table.name == key)
    rows = session.execute(stmt.limit(10)).all()
    if len(rows) == 1:
        return rows[0][0]
    if len(rows) > 1:
        # BUGFIX: rows are Row tuples and dbname is a method, so unwrap the
        # entity and call it — matching the key used by fast_find_sub.
        raise AmbiguityError(kind, key, rows, key=lambda r: r[0].dbname())
    
    # prefix matching
    stmt = select(table).where(name_startswith(table, key))
    rows = session.execute(stmt.limit(10)).all()
    if len(rows) == 1:
        return rows[0][0]
    if len(rows) > 1:
        raise AmbiguityError(kind, key, rows, key=lambda r: r[0].dbname())
    return None
| 224 | + |
| 225 | + |
# python 3.12:
# def fast_find_sub[T, G](tdo: TradeORM, key: str, table: T, group_table: G) -> T | G | None:
def fast_find_sub(session: Session, kind: str, key: str, table: T, group_table: G) -> T | G | None:
    """search a child table and its group/parent table for a key that may name
    either one, optionally anchored with a leading '@' and/or split on '/'.
    
    Candidate queries are tried cheapest-first: exact matches, then prefix
    ('LIKE') matches; the first query yielding exactly one row wins.
    
    :raises ValueError: when the key is too short or has no actual characters.
    :raises AmbiguityError: when a candidate query matches multiple rows.
    """
    if len(key) < 2:
        # BUGFIX: message was a plain string missing the f-prefix.
        raise ValueError(f"overly ambiguous search key: {key}")
    
    # Start with a list of most-likely simple, exact (fast) matches that most paths
    # will most likely want.
    exact_candidates = [
        select(table).where(table.name == key),
        select(group_table).where(group_table.name == key),
    ]
    # Those will be followed by slower, like-based matches - this relies
    # on our tables having no-case collation to avoid manipulating text case.
    like_candidates = [
        select(table).where(name_startswith(table, key)),
        select(group_table).where(name_startswith(group_table, key)),
    ]
    
    prefix_anchored = key.startswith("@")
    slash = key.find("/")
    
    # Possible combinations:
    # @/abc explicit non-expression of parent, anchored child; child = abc%
    # @abc/ explicit expression of exact parent (trailing slash) parent = abc
    # @abc explicit expression of only the parent (no slash). parent = abc%
    # @ab/cd explicit parent + child. parent = ab, child = cd
    # abc/ explicit parent; parent = abc%
    # /abc explicit child; child = abc%  (prefix match, not a contains match)
    # ab/cd child + parent
    
    if prefix_anchored:  # "@parent", "@par/", "@/child", ...
        if slash == 1:  # "@/...": left-anchored child
            if len(key) == 2:  # reject "@/"
                raise ValueError(f"overly ambiguous term (no actual characters): {key}")
            # search for just the child part
            exact_candidates.insert(0, select(table).where(table.name == key[2:]))
            like_candidates.insert(0, select(table).where(name_startswith(table, key[2:])))
        
        elif slash == -1:  # @abc... no slash, just the parent
            exact_candidates.insert(0, select(group_table).where(group_table.name == key[1:]))
            like_candidates.insert(0, select(group_table).where(name_startswith(group_table, key[1:])))
        else:  # @sol/[something...] <- combo
            parent_part, child_part = key[1:slash], key[slash + 1:]
            if not child_part:  # @sol/
                exact_candidates = [select(group_table).where(group_table.name == parent_part)]
                like_candidates = []
            else:
                exact_candidates.append(select(table).join(group_table).where(and_(group_table.name == parent_part, table.name == child_part)))
                like_candidates.insert(0, select(table).join(group_table).where(and_(name_startswith(group_table, parent_part), name_startswith(table, child_part))))
                like_candidates.insert(0, select(table).join(group_table).where(and_(group_table.name == parent_part, name_startswith(table, child_part))))
    
    # otherwise if the parent isn't anchored, it can be a wildcard.
    elif slash == len(key) - 1:  # "foo/"
        parent_part = key[:-1]
        # This tells us there's a single slash and it's at the end of the name,
        # in which case we're going to treat it *first* as a group name.
        # We still fall back to trying it directly but only as a last resort;
        # there *are* player-named places with a '/' at the end ("Here No\/\/")
        # but let that query pay for itself, not the other way around.
        exact_candidates = [
            select(group_table).where(group_table.name == key),
            select(group_table).where(group_table.name == parent_part),
            select(table).where(table.name == key),
        ]
        like_candidates = [
            select(group_table).where(name_startswith(group_table, key)),
            select(group_table).where(name_startswith(group_table, parent_part)),
            select(table).where(name_startswith(table, parent_part)),
        ]
    
    elif slash > 0:  # "par/table", and we know it's not trailing /
        parent_part, _, child_part = key.partition("/")
        
        exact_candidates = [
            select(table).where(table.name == key),
            # don't try matching the group because no groups can currently contain '/'.
            select(table).join(group_table).where(and_(group_table.name == parent_part, table.name == child_part)),
        ]
        like_candidates = [
            select(table).where(name_startswith(table, key)),
            select(table).join(group_table).where(group_table.name == parent_part, name_startswith(table, child_part)),
            select(table).join(group_table).where(name_startswith(group_table, parent_part), table.name == child_part),
            select(table).join(group_table).where(name_startswith(group_table, parent_part), name_startswith(table, child_part)),
        ]
    
    elif slash <= 0:  # "/foo" or "foo" suggests only a child
        child_part = key[1:] if slash == 0 else key
        exact_candidates.insert(0, select(table).where(table.name == child_part))
        like_candidates.insert(0, select(table).where(name_startswith(table, child_part)))
    
    # If there's no slash, then our initial patterns will suffice.
    
    for stmt in exact_candidates + like_candidates:
        # BUGFIX: removed leftover debug print() of every compiled statement.
        rows = session.execute(stmt.limit(10)).all()
        if len(rows) == 1:
            return rows[0][0]
        if len(rows) > 1:  # ambiguity/conflict
            raise AmbiguityError(kind, key, rows, key=lambda r: r[0].dbname())
    
    return None
0 commit comments