Skip to content

Commit 372ff42

Browse files
🎨 Improve tokenisation and logging.
1 parent 34b0a71 commit 372ff42

File tree

1 file changed

+40
-3
lines changed

1 file changed

+40
-3
lines changed

plover_websocket_server/lookup.py

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,12 @@ def lookup(engine: StenoEngine, text_to_lookup: str) -> list:
1515
"""Recursively looks up a phrase by finding the longest possible dictionary match.
1616
1717
Starts from the beginning of the string and then solves for the remainder.
18+
19+
A lookup can fail (return an empty list) if any part of the tokenized input
20+
string cannot be found in Plover's dictionaries. The lookup is performed
21+
recursively, and if any segment of the phrase has no corresponding steno
22+
strokes, the entire lookup for that path will fail, and if no alternative
23+
paths are found, the overall result will be empty.
1824
"""
1925
memo = {}
2026
log.debug(f"Starting lookup for: '{text_to_lookup}'")
@@ -27,6 +33,7 @@ def get_steno_for_phrase(phrase: str) -> list | None:
2733
"""
2834
# 1. Try the phrase as-is (respecting capitalization)
2935
log.debug(f" - get_steno_for_phrase('{phrase}')")
36+
3037
steno_capitalized: set = engine.reverse_lookup(phrase)
3138

3239
# If the phrase is a single non-word character (like '!'),
@@ -49,7 +56,28 @@ def get_steno_for_phrase(phrase: str) -> list | None:
4956

5057
# Prioritize direct capitalized results
5158
combined = steno_capitalized.union(steno_lowercase_modified)
59+
numeric_phrase = re.sub(r"[$,€£]", "", phrase.replace(",", ""))
60+
if numeric_phrase.isdigit():
61+
digit_steno_list = []
62+
all_digits_found = True
63+
for digit in numeric_phrase:
64+
digit_steno = engine.reverse_lookup(digit)
65+
if not digit_steno:
66+
all_digits_found = False
67+
break
68+
digit_steno_list.append(min(digit_steno, key=len)) # Choose shortest steno for the digit
69+
if all_digits_found:
70+
combined_digit_steno = tuple(s for steno_tuple in digit_steno_list for s in steno_tuple)
71+
combined.add(combined_digit_steno)
72+
73+
# If after all attempts, we have no results, return None.
5274
if not combined:
75+
# Only issue a warning for single words that are not found, as this is the root cause of failure.
76+
is_single_word = " " not in phrase
77+
if is_single_word:
78+
log.warning(f"Failed to find steno for word: '{phrase}'")
79+
else:
80+
log.debug(f" - FAILED to find steno for phrase: '{phrase}'")
5381
return None
5482

5583
# Sort results: 1. Direct cap match, 2. Stroke count, 3. Key count
@@ -65,6 +93,8 @@ def solve(words_tuple: tuple) -> list[list[tuple]]:
6593
def get_steno_options(i):
6694
return get_steno_for_phrase(" ".join(words_tuple[:i]))
6795

96+
max_lookup_length = min(len(words_tuple), engine._dictionaries.longest_key)
97+
6898
def process_i(i, best_steno_for_prefix):
6999
# Recursively find all solutions for the rest of the phrase
70100
prefix_phrase = " ".join(words_tuple[:i])
@@ -76,23 +106,30 @@ def process_i(i, best_steno_for_prefix):
76106

77107
all_solutions = [
78108
solution
79-
for i in range(len(words_tuple), 0, -1)
109+
for i in range(max_lookup_length, 0, -1)
80110
if (steno_options := get_steno_options(i))
81111
for solution in process_i(i, steno_options[0])
82112
]
83113

114+
if not all_solutions:
115+
# This is the point of failure. It means for the current `words_tuple`,
116+
# no prefix could be found in the dictionary that also had a valid suffix solution.
117+
log.debug(f" <-- solve({words_tuple}) -> FAILED: No steno found for any prefix.")
84118
memo[words_tuple] = all_solutions
85119
return all_solutions
86120

87121
# Tokenize the input string, separating words from punctuation.
88-
# This finds sequences of word characters (\w+) or single non-word/non-space characters.
89-
words = re.findall(r"\w+|[^\w\s]", text_to_lookup)
122+
# This finds sequences of word characters (including those with internal apostrophes),
123+
# currency symbols attached to numbers, numbers with commas, or single non-word/non-space characters.
124+
token_regex = r"[$€£]?\d+(?:,\d+)*|\w+(?:['’]\w+)*|[^\w\s]" # nosec B105
125+
words = re.findall(token_regex, text_to_lookup)
90126

91127
all_possible_sequences = solve(tuple(words))
92128

93129
log.debug(f"All possible sequences: {all_possible_sequences}")
94130

95131
if not all_possible_sequences:
132+
log.debug(f"Lookup failed for '{text_to_lookup}'. No valid steno sequence found.")
96133
return []
97134

98135
# Sort the collected sequences by overall efficiency

0 commit comments

Comments
 (0)