colbymchenry · andy-sg · May 28, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -39,6 +39,17 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
   now sees the four anonymous overrides in its trail without a Read.
 
 ### Fixed
+- **`codegraph context` (and the `codegraph_context` MCP tool) now works for
+  non-ASCII queries.** A task description in Korean, Japanese, Chinese, or any
+  other non-Latin script — e.g. `codegraph context "로그인"` — used to return
+  an empty context (just the header + the query) even though `codegraph query`
+  found the symbols fine. The keyword extractors that feed context were built
+  entirely from ASCII patterns (`[a-zA-Z]` + the ASCII `\b` word boundary), so
+  they pulled zero keywords out of a non-ASCII description and searched for
+  nothing. Both extractors now also pick up runs of Unicode letters and hand
+  them to the existing FTS path (which already tokenizes non-ASCII via
+  `unicode61`); ASCII extraction is unchanged. A Korean class/function/method
+  query now surfaces the matching symbols and their code blocks.
 - **`codegraph index` / `init -i` summary now reports the true edge count.**
   The per-file counter in the orchestrator only saw extraction-phase edges,
   so resolution and synthesizer edges (often >50% of the graph on

diff --git a/__tests__/context.test.ts b/__tests__/context.test.ts
@@ -336,6 +336,79 @@ export function validateEmail(email: string): boolean {
     });
   });
 
+  describe('Non-ASCII (Korean) queries', () => {
+    let krDir: string; // temp project root, recreated before every test
+    let krCg: CodeGraph; // CodeGraph instance initialised and indexed over krDir
+
+    beforeEach(async () => {
+      krDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-context-kr-'));
+      const srcDir = path.join(krDir, 'src');
+      fs.mkdirSync(srcDir);
+
+      // Fixture with Korean (non-ASCII) identifiers: two functions, one class,
+      // one method. Keyword extraction used to be [a-zA-Z]-only, so context
+      // came back empty for these even though symbol search found them.
+      fs.writeFileSync(
+        path.join(srcDir, 'auth.ts'),
+        `export function 로그인(사용자명: string): boolean {
+  return 인증확인(사용자명);
+}
+
+export function 인증확인(사용자명: string): boolean {
+  return 사용자명.length > 0;
+}
+
+export class 사용자관리자 {
+  생성하기(이름: string): string {
+    return 이름;
+  }
+}
+`
+      );
+
+      krCg = CodeGraph.initSync(krDir, {
+        config: { include: ['**/*.ts'], exclude: [] },
+      });
+      await krCg.indexAll();
+    });
+
+    afterEach(() => {
+      if (krCg) krCg.destroy();
+      if (fs.existsSync(krDir)) {
+        fs.rmSync(krDir, { recursive: true, force: true });
+      }
+    });
+
+    it('finds relevant nodes for a Korean query', async () => {
+      const result = await krCg.findRelevantContext('로그인');
+
+      expect(result.nodes.size).toBeGreaterThan(0);
+      const names = Array.from(result.nodes.values()).map((n) => n.name);
+      expect(names).toContain('로그인');
+    });
+
+    it('surfaces Korean symbols in built context (was empty before fix)', async () => {
+      const result = (await krCg.buildContext('로그인', {
+        format: 'markdown',
+      })) as string;
+
+      // Regression guard for the reported bug: the markdown output used to hold
+      // only the header + query, with zero symbols.
+      expect(result).toContain('로그인');
+      expect(result).toContain('### Entry Points');
+    });
+
+    it('surfaces a Korean class from a multi-word Korean query', async () => {
+      const result = (await krCg.buildContext('사용자관리자 생성하기', {
+        format: 'json',
+      })) as string;
+      const parsed = JSON.parse(result);
+      const names = parsed.nodes.map((n: { name: string }) => n.name);
+
+      expect(names).toContain('사용자관리자'); // only the class is asserted; the method 생성하기 is not
+    });
+  });
+
   describe('Edge cases', () => {
     it('should handle empty query', async () => {
       const result = await cg.buildContext('', { format: 'markdown' });

diff --git a/__tests__/query-utils.test.ts b/__tests__/query-utils.test.ts
@@ -0,0 +1,58 @@
+/**
+ * Search query-utils tests
+ *
+ * Focused coverage for term extraction, especially the non-ASCII path that
+ * keeps context working for Korean/CJK queries.
+ */
+
+import { describe, it, expect } from 'vitest';
+import { extractSearchTerms } from '../src/search/query-utils';
+
+describe('extractSearchTerms', () => { // term extraction: ASCII regression guards + non-ASCII cases
+  describe('ASCII behavior (must not regress)', () => {
+    it('splits camelCase and keeps the compound', () => {
+      const terms = extractSearchTerms('getUserName');
+      expect(terms).toContain('getusername');
+      expect(terms).toContain('user');
+      expect(terms).toContain('name');
+    });
+
+    it('drops <3-char ASCII tokens and stop words', () => {
+      const terms = extractSearchTerms('is a to ok payment');
+      expect(terms).toContain('payment');
+      expect(terms).not.toContain('is'); // NOTE(review): 2 chars, so the length floor alone rejects it
+      expect(terms).not.toContain('ok'); // 2 chars — below the 3-char ASCII floor
+      expect(terms).not.toContain('to'); // NOTE(review): 2 chars too — stop-word filtering is not isolated here
+    });
+
+    it('splits snake_case and dot.notation', () => {
+      const terms = extractSearchTerms('user_service app.isPackaged');
+      expect(terms).toContain('user_service');
+      expect(terms).toContain('service');
+      expect(terms).toContain('packaged');
+    });
+  });
+
+  describe('non-ASCII (Korean) extraction', () => {
+    it('extracts a single Korean token', () => {
+      expect(extractSearchTerms('로그인')).toContain('로그인');
+    });
+
+    it('splits a multi-word Korean query on whitespace', () => {
+      const terms = extractSearchTerms('사용자 로그인 처리');
+      expect(terms).toContain('사용자');
+      expect(terms).toContain('로그인');
+      expect(terms).toContain('처리');
+    });
+
+    it('keeps 2-char Korean tokens (lower floor than ASCII)', () => {
+      expect(extractSearchTerms('인증')).toContain('인증');
+    });
+
+    it('handles mixed ASCII + Korean queries', () => {
+      const terms = extractSearchTerms('login 로그인 handler');
+      expect(terms).toContain('login');
+      expect(terms).toContain('로그인');
+    });
+  });
+});
diff --git a/src/context/index.ts b/src/context/index.ts
@@ -100,6 +100,21 @@ function extractSymbolsFromQuery(query: string): string[] {
     }
   }
 
+  // Extract non-ASCII identifier runs (Hangul, CJK, Cyrillic, Greek, …).
+  // Every pattern above relies on [a-zA-Z] and the ASCII word boundary \b, so
+  // a query like "로그인" or "认证" yields zero symbols and context comes back
+  // empty. Pull runs of Unicode letters/digits/underscores and keep only those
+  // of length >= 2 containing a non-ASCII character — additive (ASCII-only tokens
+  // stay owned by the patterns above). FTS already indexes/prefix-matches
+  // these tokens (unicode61), so an extracted name flows straight into search.
+  const unicodePattern = /[\p{L}\p{N}_]+/gu;
+  while ((match = unicodePattern.exec(query)) !== null) {
+    const token = match[0];
+    if (token.length >= 2 && /[^\x00-\x7F]/.test(token)) {
+      symbols.add(token);
+    }
+  }
+
   // Filter out common English words that aren't likely symbol names
   const commonWords = new Set([
     'the', 'and', 'for', 'with', 'from', 'this', 'that', 'have', 'been',

diff --git a/src/search/query-utils.ts b/src/search/query-utils.ts
@@ -137,12 +137,21 @@ export function extractSearchTerms(query: string, options?: { stems?: boolean })
   // Replace underscores and dots with spaces (snake_case, dot.notation)
   const normalised = camelSplit.replace(/[_.]+/g, ' ');
 
-  // Split on any non-alphanumeric character
-  const words = normalised.split(/[^a-zA-Z0-9]+/).filter(Boolean);
+  // Split on separators while preserving Unicode letters/digits. The old
+  // /[^a-zA-Z0-9]+/ split treated every Hangul/CJK character as a separator,
+  // so a non-ASCII query (e.g. "로그인 처리") tokenized to nothing and text
+  // search ran with no terms. \p{L}\p{N} (u flag) keeps those runs intact;
+  // ASCII behavior is unchanged ([a-zA-Z0-9] ⊂ [\p{L}\p{N}], and every ASCII
+  // separator — space, punctuation, _, . — is still a separator).
+  const words = normalised.split(/[^\p{L}\p{N}]+/u).filter(Boolean);
 
   for (const word of words) {
     const lower = word.toLowerCase();
-    if (lower.length < 3) continue;
+    // Hangul/CJK characters each carry a syllable or morpheme, so a 2-char
+    // non-ASCII token (e.g. "인증") is as meaningful as a longer English word.
+    // Pure-ASCII tokens keep the 3-char floor to drop noise like "ab"/"xy".
+    const minLen = /[^\x00-\x7F]/.test(lower) ? 2 : 3;
+    if (lower.length < minLen) continue;
     if (STOP_WORDS.has(lower)) continue;
     tokens.add(lower);
   }