Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,17 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
now sees the four anonymous overrides in its trail without a Read.

### Fixed
- **`codegraph context` (and the `codegraph_context` MCP tool) now works for
non-ASCII queries.** A task description in Korean, Japanese, Chinese, or any
other non-Latin script — e.g. `codegraph context "로그인"` — used to return
an empty context (just the header + the query) even though `codegraph query`
found the symbols fine. The keyword extractors that feed context were built
entirely from ASCII patterns (`[a-zA-Z]` + the ASCII `\b` word boundary), so
they pulled zero keywords out of a non-ASCII description and searched for
nothing. Both extractors now also pick up runs of Unicode letters and hand
them to the existing FTS path (which already tokenizes non-ASCII via
`unicode61`); ASCII extraction is unchanged. A Korean class/function/method
query now surfaces the matching symbols and their code blocks.
- **`codegraph index` / `init -i` summary now reports the true edge count.**
The per-file counter in the orchestrator only saw extraction-phase edges,
so resolution and synthesizer edges (often >50% of the graph on
Expand Down
73 changes: 73 additions & 0 deletions __tests__/context.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,79 @@ export function validateEmail(email: string): boolean {
});
});

// Regression suite for non-ASCII queries: each test indexes a throwaway
// project whose identifiers are written in Korean and checks that the context
// APIs (findRelevantContext / buildContext) surface those symbols.
describe('Non-ASCII (Korean) queries', () => {
// Temp project root and the CodeGraph built over it; recreated per test.
let krDir: string;
let krCg: CodeGraph;

beforeEach(async () => {
// Fresh temp directory with a single src/ folder for the fixture file.
krDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-context-kr-'));
const srcDir = path.join(krDir, 'src');
fs.mkdirSync(srcDir);

// Source with Korean (non-ASCII) identifiers — the patterns that used to
// drive keyword extraction were all [a-zA-Z]-based, so context returned
// empty for these even though symbol search found them.
fs.writeFileSync(
path.join(srcDir, 'auth.ts'),
`export function 로그인(사용자명: string): boolean {
return 인증확인(사용자명);
}

export function 인증확인(사용자명: string): boolean {
return 사용자명.length > 0;
}

export class 사용자관리자 {
생성하기(이름: string): string {
return 이름;
}
}
`
);

// Index every .ts file under the temp root, with no exclusions.
krCg = CodeGraph.initSync(krDir, {
config: { include: ['**/*.ts'], exclude: [] },
});
await krCg.indexAll();
});

// Release the graph first, then remove the temp project from disk.
afterEach(() => {
if (krCg) krCg.destroy();
if (fs.existsSync(krDir)) {
fs.rmSync(krDir, { recursive: true, force: true });
}
});

// Node lookup: a bare Korean function name must resolve to that function.
it('finds relevant nodes for a Korean query', async () => {
const result = await krCg.findRelevantContext('로그인');

expect(result.nodes.size).toBeGreaterThan(0);
const names = Array.from(result.nodes.values()).map((n) => n.name);
expect(names).toContain('로그인');
});

// Rendered markdown: the output must include an entry-points section, not
// just the header and the echoed query.
it('surfaces Korean symbols in built context (was empty before fix)', async () => {
const result = (await krCg.buildContext('로그인', {
format: 'markdown',
})) as string;

// Regression guard for the reported bug: the markdown used to contain
// only the header + query, with zero symbols.
expect(result).toContain('로그인');
expect(result).toContain('### Entry Points');
});

// JSON output for a two-word query naming the fixture's class and its method.
// NOTE(review): the title says "class and method", but only the class name
// is asserted below — consider also asserting '생성하기' if methods are
// expected to appear in `nodes`; confirm against buildContext's JSON shape.
it('surfaces a Korean class and method from a multi-word Korean query', async () => {
const result = (await krCg.buildContext('사용자관리자 생성하기', {
format: 'json',
})) as string;
const parsed = JSON.parse(result);
const names = parsed.nodes.map((n: { name: string }) => n.name);

expect(names).toContain('사용자관리자');
});
});

describe('Edge cases', () => {
it('should handle empty query', async () => {
const result = await cg.buildContext('', { format: 'markdown' });
Expand Down
58 changes: 58 additions & 0 deletions __tests__/query-utils.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/**
* Search query-utils tests
*
* Focused coverage for term extraction, especially the non-ASCII path that
* keeps context working for Korean/CJK queries.
*/

import { describe, it, expect } from 'vitest';
import { extractSearchTerms } from '../src/search/query-utils';

// Contract for extractSearchTerms: the ASCII tokenisation rules stay exactly
// as they were, and non-ASCII (Korean) input now yields tokens as well.
describe('extractSearchTerms', () => {
  describe('ASCII behavior (must not regress)', () => {
    it('splits camelCase and keeps the compound', () => {
      const extracted = extractSearchTerms('getUserName');
      // The lower-cased compound plus the camelCase parts asserted here.
      for (const expected of ['getusername', 'user', 'name']) {
        expect(extracted).toContain(expected);
      }
    });

    it('drops <3-char ASCII tokens and stop words', () => {
      const extracted = extractSearchTerms('is a to ok payment');
      expect(extracted).toContain('payment');
      // 'ok' is only two characters long; 'is' and 'to' must also be absent.
      for (const rejected of ['is', 'ok', 'to']) {
        expect(extracted).not.toContain(rejected);
      }
    });

    it('splits snake_case and dot.notation', () => {
      const extracted = extractSearchTerms('user_service app.isPackaged');
      for (const expected of ['user_service', 'service', 'packaged']) {
        expect(extracted).toContain(expected);
      }
    });
  });

  describe('non-ASCII (Korean) extraction', () => {
    it('extracts a single Korean token', () => {
      const extracted = extractSearchTerms('로그인');
      expect(extracted).toContain('로그인');
    });

    it('splits a multi-word Korean query on whitespace', () => {
      const extracted = extractSearchTerms('사용자 로그인 처리');
      for (const word of ['사용자', '로그인', '처리']) {
        expect(extracted).toContain(word);
      }
    });

    it('keeps 2-char Korean tokens (lower floor than ASCII)', () => {
      const extracted = extractSearchTerms('인증');
      expect(extracted).toContain('인증');
    });

    it('handles mixed ASCII + Korean queries', () => {
      const extracted = extractSearchTerms('login 로그인 handler');
      for (const expected of ['login', '로그인']) {
        expect(extracted).toContain(expected);
      }
    });
  });
});
15 changes: 15 additions & 0 deletions src/context/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,21 @@ function extractSymbolsFromQuery(query: string): string[] {
}
}

// Extract non-ASCII identifier runs (Hangul, CJK, Cyrillic, Greek, …).
// Every pattern above relies on [a-zA-Z] and the ASCII word boundary \b, so
// a query like "로그인" or "认证" yields zero symbols and context comes back
// empty. Pull runs of Unicode letters, digits, and underscores and keep only
// those that actually contain a non-ASCII character — purely additive
// (ASCII-only tokens stay owned by the patterns above). FTS already
// indexes/prefix-matches these tokens (unicode61), so an extracted name flows
// straight into search.
const unicodePattern = /[\p{L}\p{N}_]+/gu;
while ((match = unicodePattern.exec(query)) !== null) {
const token = match[0];
if (token.length >= 2 && /[^\x00-\x7F]/.test(token)) {
symbols.add(token);
}
}

// Filter out common English words that aren't likely symbol names
const commonWords = new Set([
'the', 'and', 'for', 'with', 'from', 'this', 'that', 'have', 'been',
Expand Down
15 changes: 12 additions & 3 deletions src/search/query-utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -137,12 +137,21 @@ export function extractSearchTerms(query: string, options?: { stems?: boolean })
// Replace underscores and dots with spaces (snake_case, dot.notation)
const normalised = camelSplit.replace(/[_.]+/g, ' ');

// Split on any non-alphanumeric character
const words = normalised.split(/[^a-zA-Z0-9]+/).filter(Boolean);
// Split on separators while preserving Unicode letters/digits. The old
// /[^a-zA-Z0-9]+/ split treated every Hangul/CJK character as a separator,
// so a non-ASCII query (e.g. "로그인 처리") tokenized to nothing and text
// search ran with no terms. \p{L}\p{N} (u flag) keeps those runs intact;
// ASCII behavior is unchanged ([a-zA-Z0-9] ⊂ [\p{L}\p{N}], and every ASCII
// separator — space, punctuation, _, . — is still a separator).
const words = normalised.split(/[^\p{L}\p{N}]+/u).filter(Boolean);

for (const word of words) {
const lower = word.toLowerCase();
if (lower.length < 3) continue;
// Hangul/CJK pack a full syllable or morpheme into each character, so a 2-char
// non-ASCII token (e.g. "인증") is as meaningful as a longer English word.
// Keep the 3-char floor for ASCII to suppress 1-2 character noise like "ok"/"db".
const minLen = /[^\x00-\x7F]/.test(lower) ? 2 : 3;
if (lower.length < minLen) continue;
if (STOP_WORDS.has(lower)) continue;
tokens.add(lower);
}
Expand Down