Skip to content

Commit 9d77307

Browse files
jeremymanning and claude committed
Fix ELIZA pattern matching to use word-based algorithm
The previous regex-based pattern matching failed because patterns like "* i am * @sad *" expected literal spaces between pattern parts. The original Weizenbaum ELIZA works on words, not characters.

Changes:
- Rewrote pattern-matcher.js to use word-based matching (not regex)
- New matchWord() checks exact, synonym, or wildcard matches per word
- New _matchHelper() uses recursive backtracking for * wildcards
- Keywords now found in INPUT word order, then sorted by rank
- Specific patterns tried before catch-all * patterns per keyword
- Captures are now word arrays, properly joined for display/reflection

Updated test expectations:
- "I'm sorry" matches keyword "i" (not "sorry") since "i" appears first
- "I apologise" matches keyword "i" (not "apologise") for same reason

All 156 tests pass (144 Demo 01 + 12 Demo 15).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <[email protected]>
1 parent 2dcd213 commit 9d77307

File tree

3 files changed

+200
-75
lines changed

3 files changed

+200
-75
lines changed

demos/01-eliza/js/pattern-matcher.js

Lines changed: 185 additions & 62 deletions
Original file line number | Diff line number | Diff line change
@@ -49,66 +49,125 @@ export class PatternMatcher {
4949
}
5050

5151
/**
52-
* Expand synonyms in pattern
52+
* Check if a word matches a pattern part (handles @synonyms)
5353
*/
54-
expandSynonyms(pattern, synonyms) {
55-
let expanded = pattern;
56-
const regex = /@(\w+)/g;
57-
let match;
54+
matchWord(word, patternPart, synonyms) {
55+
word = word.toLowerCase();
5856

59-
while ((match = regex.exec(pattern)) !== null) {
60-
const synKey = match[1];
57+
if (patternPart === '*') {
58+
return 'wildcard';
59+
}
60+
61+
if (patternPart.startsWith('@')) {
62+
// Synonym match
63+
const synKey = patternPart.substring(1);
6164
if (synonyms[synKey]) {
62-
const alternatives = synonyms[synKey].join('|');
63-
expanded = expanded.replace(match[0], `(${alternatives})`);
65+
// Check if word matches the synonym group name or any synonym
66+
if (word === synKey || synonyms[synKey].includes(word)) {
67+
return 'synonym';
68+
}
6469
}
70+
return false;
71+
}
72+
73+
// Exact match
74+
if (word === patternPart.toLowerCase()) {
75+
return 'exact';
6576
}
6677

67-
return expanded;
78+
return false;
6879
}
6980

7081
/**
71-
* Convert ELIZA pattern to regex
82+
* Word-based pattern matching (like Python implementation)
83+
* Pattern parts: ['*', 'i', 'am', '*', '@sad', '*']
84+
* Input words: ['i', 'am', 'unhappy']
85+
* Returns: { matched: true, captures: [[], [], ['unhappy'], []] }
7286
*/
73-
patternToRegex(pattern, synonyms) {
74-
// Escape special regex characters first (before synonym expansion)
75-
// Note: We escape . + ^ $ { } [ ] \ but NOT * () | ? :
76-
// * and () are used in ELIZA patterns, | ? : are used in synonym groups
77-
let regexPattern = pattern.replace(/[.+^${}[\]\\]/g, '\\$&');
87+
matchPattern(input, pattern, synonyms) {
88+
// Split input into words, stripping punctuation
89+
const inputWords = input.toLowerCase()
90+
.replace(/[.,!?;:]/g, ' ')
91+
.split(/\s+/)
92+
.filter(w => w.length > 0);
7893

79-
// Expand synonyms after escaping to preserve (?:...) syntax
80-
regexPattern = this.expandSynonyms(regexPattern, synonyms);
94+
// Split pattern into parts
95+
const patternParts = pattern.toLowerCase()
96+
.split(/\s+/)
97+
.filter(p => p.length > 0);
8198

82-
// Convert ELIZA wildcards to regex
83-
// * matches any sequence of words (non-greedy to match minimal text)
84-
regexPattern = regexPattern.replace(/\*/g, '(.*?)');
99+
// Use recursive matching
100+
const result = this._matchHelper(inputWords, patternParts, synonyms);
85101

86-
// Add anchors
87-
regexPattern = '^' + regexPattern + '$';
102+
if (result.matched) {
103+
return {
104+
matched: true,
105+
captures: result.captures,
106+
pattern,
107+
patternParts
108+
};
109+
}
88110

89-
return new RegExp(regexPattern, 'i');
111+
return { matched: false, pattern, patternParts };
90112
}
91113

92114
/**
93-
* Match input against pattern and extract components
115+
* Recursive helper for word-based pattern matching
94116
*/
95-
matchPattern(input, pattern, synonyms) {
96-
const regex = this.patternToRegex(pattern, synonyms);
97-
// Pad input with spaces to handle patterns starting/ending with *
98-
const paddedInput = ' ' + input + ' ';
99-
const match = paddedInput.match(regex);
117+
_matchHelper(words, parts, synonyms) {
118+
// Base case: no more pattern parts
119+
if (parts.length === 0) {
120+
// Match only if no more words either
121+
return { matched: words.length === 0, captures: [] };
122+
}
100123

101-
if (match) {
102-
const captures = match.slice(1).map(g => (g || '').trim());
103-
return {
104-
matched: true,
105-
captures,
106-
pattern,
107-
regex: regex.toString()
108-
};
124+
const [currentPart, ...remainingParts] = parts;
125+
126+
if (currentPart === '*') {
127+
// Wildcard: try matching 0, 1, 2, ... words
128+
// Try shortest match first (greedy would cause issues)
129+
for (let i = 0; i <= words.length; i++) {
130+
const capturedWords = words.slice(0, i);
131+
const remainingWords = words.slice(i);
132+
133+
const result = this._matchHelper(remainingWords, remainingParts, synonyms);
134+
if (result.matched) {
135+
return {
136+
matched: true,
137+
captures: [capturedWords, ...result.captures]
138+
};
139+
}
140+
}
141+
return { matched: false, captures: [] };
142+
}
143+
144+
// Non-wildcard: must match the first word
145+
if (words.length === 0) {
146+
return { matched: false, captures: [] };
147+
}
148+
149+
const [currentWord, ...remainingWords] = words;
150+
const matchType = this.matchWord(currentWord, currentPart, synonyms);
151+
152+
if (matchType) {
153+
const result = this._matchHelper(remainingWords, remainingParts, synonyms);
154+
if (result.matched) {
155+
// For synonym matches, capture the matched word
156+
if (matchType === 'synonym') {
157+
return {
158+
matched: true,
159+
captures: [[currentWord], ...result.captures]
160+
};
161+
}
162+
// For exact matches, don't capture
163+
return {
164+
matched: true,
165+
captures: result.captures
166+
};
167+
}
109168
}
110169

111-
return { matched: false, pattern, regex: regex.toString() };
170+
return { matched: false, captures: [] };
112171
}
113172

114173
/**
@@ -120,22 +179,37 @@ export class PatternMatcher {
120179
.replace(/[.,!?;:]/g, ' ')
121180
.split(/\s+/)
122181
.filter(w => w.length > 0);
123-
const matchedRules = [];
124182

125-
// Find all rules with keywords present in input
183+
// Build a map of keyword -> rule for quick lookup
184+
const rulesByKeyword = new Map();
126185
for (const rule of rules) {
127-
const keyword = rule.keyword.toLowerCase();
128-
if (words.includes(keyword)) {
186+
rulesByKeyword.set(rule.keyword.toLowerCase(), rule);
187+
}
188+
189+
// Find keywords by iterating through INPUT words (like Python implementation)
190+
// This ensures keywords are found in the order they appear in the input
191+
const matchedRules = [];
192+
const seenKeywords = new Set();
193+
194+
for (const word of words) {
195+
const rule = rulesByKeyword.get(word);
196+
if (rule && !seenKeywords.has(word)) {
129197
matchedRules.push(rule);
198+
seenKeywords.add(word);
130199
}
131200
}
132201

133202
// Sort by rank (higher rank = higher priority)
203+
// Use stable sort so equal ranks preserve input word order
134204
matchedRules.sort((a, b) => (b.rank || 0) - (a.rank || 0));
135205

136-
// Try to match patterns for each rule
206+
// Try to match patterns for each rule (in rank order)
207+
// For each rule, try specific patterns first, then catch-all
137208
for (const rule of matchedRules) {
209+
// First try specific patterns
138210
for (const patternObj of rule.patterns) {
211+
if (patternObj.pattern === '*') continue;
212+
139213
const matchResult = this.matchPattern(input, patternObj.pattern, synonyms);
140214

141215
if (matchResult.matched) {
@@ -147,9 +221,24 @@ export class PatternMatcher {
147221
};
148222
}
149223
}
224+
225+
// Then try catch-all for this rule
226+
for (const patternObj of rule.patterns) {
227+
if (patternObj.pattern === '*') {
228+
const matchResult = this.matchPattern(input, patternObj.pattern, synonyms);
229+
if (matchResult.matched) {
230+
return {
231+
rule,
232+
pattern: patternObj,
233+
matchResult,
234+
allTestedRules: matchedRules
235+
};
236+
}
237+
}
238+
}
150239
}
151240

152-
// No keyword matched, try catch-all patterns
241+
// No keyword matched, try catch-all patterns from any rule
153242
for (const rule of rules) {
154243
for (const patternObj of rule.patterns) {
155244
if (patternObj.pattern === '*') {
@@ -169,6 +258,7 @@ export class PatternMatcher {
169258

170259
/**
171260
* Assemble response from template and captures
261+
* Captures are now arrays of words (from word-based matching)
172262
*/
173263
assembleResponse(template, captures, postSubstitutions) {
174264
let response = template;
@@ -178,8 +268,13 @@ export class PatternMatcher {
178268
for (let i = 0; i < captures.length; i++) {
179269
const placeholder = `(${i + 1})`;
180270
if (response.includes(placeholder)) {
271+
// Convert capture (array of words) to string
272+
const captureText = Array.isArray(captures[i])
273+
? captures[i].join(' ')
274+
: captures[i];
275+
181276
// Apply post-substitutions to captured text
182-
const { result } = this.applyPostSubstitutions(captures[i], postSubstitutions);
277+
const { result } = this.applyPostSubstitutions(captureText, postSubstitutions);
183278

184279
// Use a temporary marker to preserve the text that should stay lowercase
185280
const marker = `__CAPTURE_${i}__`;
@@ -214,21 +309,28 @@ export class PatternMatcher {
214309

215310
const processedInput = preSubResult.result;
216311

217-
// Step 2: Keyword detection
312+
// Step 2: Keyword detection (using input word order)
218313
// Split and clean words, removing punctuation
219314
const words = processedInput.toLowerCase()
220315
.replace(/[.,!?;:]/g, ' ')
221316
.split(/\s+/)
222317
.filter(w => w.length > 0);
223-
const keywordsFound = [];
224-
const keywordsNotFound = [];
225318

319+
// Build a map of keyword -> rule for quick lookup
320+
const rulesByKeyword = new Map();
226321
for (const rule of rules) {
227-
const keyword = rule.keyword.toLowerCase();
228-
if (words.includes(keyword)) {
229-
keywordsFound.push({ keyword, rank: rule.rank || 0, rule });
230-
} else {
231-
keywordsNotFound.push(keyword);
322+
rulesByKeyword.set(rule.keyword.toLowerCase(), rule);
323+
}
324+
325+
// Find keywords by iterating through INPUT words
326+
const keywordsFound = [];
327+
const seenKeywords = new Set();
328+
329+
for (const word of words) {
330+
const rule = rulesByKeyword.get(word);
331+
if (rule && !seenKeywords.has(word)) {
332+
keywordsFound.push({ keyword: rule.keyword.toLowerCase(), rank: rule.rank || 0, rule });
333+
seenKeywords.add(word);
232334
}
233335
}
234336

@@ -242,21 +344,41 @@ export class PatternMatcher {
242344
details: `Found ${keywordsFound.length} keyword(s), testing in priority order`
243345
});
244346

245-
// Step 3: Pattern matching
347+
// Step 3: Pattern matching (specific patterns first, then catch-all)
246348
const patternTests = [];
247349
let matchedRule = null;
248350
let matchedPattern = null;
249351
let matchResult = null;
250352

251-
// Test each keyword's patterns
353+
// First pass: try specific patterns (not catch-all '*')
252354
for (const { keyword, rule } of keywordsFound) {
253355
for (const patternObj of rule.patterns) {
356+
if (patternObj.pattern === '*') continue;
357+
358+
const result = this.matchPattern(processedInput, patternObj.pattern, synonyms);
359+
patternTests.push({
360+
keyword,
361+
pattern: patternObj.pattern,
362+
matched: result.matched,
363+
captures: result.captures
364+
});
365+
366+
if (result.matched && !matchedRule) {
367+
matchedRule = rule;
368+
matchedPattern = patternObj;
369+
matchResult = result;
370+
}
371+
}
372+
373+
// Then try catch-all for this keyword
374+
for (const patternObj of rule.patterns) {
375+
if (patternObj.pattern !== '*') continue;
376+
254377
const result = this.matchPattern(processedInput, patternObj.pattern, synonyms);
255378
patternTests.push({
256379
keyword,
257380
pattern: patternObj.pattern,
258381
matched: result.matched,
259-
regex: result.regex,
260382
captures: result.captures
261383
});
262384

@@ -268,7 +390,7 @@ export class PatternMatcher {
268390
}
269391
}
270392

271-
// If no keyword matched, try fallback patterns
393+
// If no keyword matched, try catch-all patterns from any rule
272394
if (!matchedRule) {
273395
for (const rule of rules) {
274396
for (const patternObj of rule.patterns) {
@@ -278,7 +400,6 @@ export class PatternMatcher {
278400
keyword: rule.keyword,
279401
pattern: patternObj.pattern,
280402
matched: result.matched,
281-
regex: result.regex,
282403
captures: result.captures
283404
});
284405

@@ -308,7 +429,7 @@ export class PatternMatcher {
308429
description: 'Extracting parts from input',
309430
input: processedInput,
310431
output: matchResult.captures.length > 0
311-
? matchResult.captures.map((c, i) => `(${i + 1}): "${c}"`).join(', ')
432+
? matchResult.captures.map((c, i) => `(${i + 1}): "${Array.isArray(c) ? c.join(' ') : c}"`).join(', ')
312433
: 'No captures',
313434
details: `Extracted ${matchResult.captures.length} component(s)`
314435
});
@@ -338,13 +459,15 @@ export class PatternMatcher {
338459
for (let i = 0; i < matchResult.captures.length; i++) {
339460
const placeholder = `(${i + 1})`;
340461
if (assembledResponse.includes(placeholder)) {
462+
// Convert capture (array of words) to string
341463
const capture = matchResult.captures[i];
342-
const { result: reflected, steps } = this.applyPostSubstitutions(capture, postSubstitutions);
464+
const captureText = Array.isArray(capture) ? capture.join(' ') : capture;
465+
const { result: reflected, steps } = this.applyPostSubstitutions(captureText, postSubstitutions);
343466

344467
if (steps.length > 0) {
345468
postSubSteps.push({
346469
capture: i + 1,
347-
original: capture,
470+
original: captureText,
348471
reflected,
349472
substitutions: steps
350473
});

0 commit comments

Comments
 (0)