Skip to content

Commit 3e9a309

Browse files
catlog22 authored and claude committed
refactor: 移除图索引功能,修复内存泄露,优化嵌入生成
主要更改: 1. 移除图索引功能 (graph indexing) - 删除 graph_analyzer.py 及相关迁移文件 - 移除 CLI 的 graph 命令和 --enrich 标志 - 清理 chain_search.py 中的图查询方法 (370行) - 删除相关测试文件 2. 修复嵌入生成内存问题 - 重构 generate_embeddings.py 使用流式批处理 - 改用 embedding_manager 的内存安全实现 - 文件从 548 行精简到 259 行 (52.7% 减少) 3. 修复内存泄露 - chain_search.py: quick_search 使用 with 语句管理 ChainSearchEngine - embedding_manager.py: 使用 with 语句管理 VectorStore - vector_store.py: 添加暴力搜索内存警告 4. 代码清理 - 移除 Symbol 模型的 token_count 和 symbol_type 字段 - 清理相关测试用例 测试: 760 passed, 7 skipped 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <[email protected]>
1 parent 15d5890 commit 3e9a309

19 files changed

+165
-3909
lines changed

codex-lens/scripts/generate_embeddings.py

Lines changed: 80 additions & 368 deletions
Large diffs are not rendered by default.

codex-lens/src/codexlens/cli/commands.py

Lines changed: 1 addition & 120 deletions
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,6 @@ def search(
268268
files_only: bool = typer.Option(False, "--files-only", "-f", help="Return only file paths without content snippets."),
269269
mode: str = typer.Option("auto", "--mode", "-m", help="Search mode: auto, exact, fuzzy, hybrid, vector, pure-vector."),
270270
weights: Optional[str] = typer.Option(None, "--weights", help="Custom RRF weights as 'exact,fuzzy,vector' (e.g., '0.5,0.3,0.2')."),
271-
enrich: bool = typer.Option(False, "--enrich", help="Enrich results with code graph relationships (calls, imports)."),
272271
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
273272
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
274273
) -> None:
@@ -423,30 +422,10 @@ def search(
423422
for r in result.results
424423
]
425424

426-
# Enrich results with relationship data if requested
427-
enriched = False
428-
if enrich:
429-
try:
430-
from codexlens.search.enrichment import RelationshipEnricher
431-
432-
# Find index path for the search path
433-
project_record = registry.find_by_source_path(str(search_path))
434-
if project_record:
435-
index_path = Path(project_record["index_root"]) / "_index.db"
436-
if index_path.exists():
437-
with RelationshipEnricher(index_path) as enricher:
438-
results_list = enricher.enrich(results_list, limit=limit)
439-
enriched = True
440-
except Exception as e:
441-
# Enrichment failure should not break search
442-
if verbose:
443-
console.print(f"[yellow]Warning: Enrichment failed: {e}[/yellow]")
444-
445425
payload = {
446426
"query": query,
447427
"mode": actual_mode,
448428
"count": len(results_list),
449-
"enriched": enriched,
450429
"results": results_list,
451430
"stats": {
452431
"dirs_searched": result.stats.dirs_searched,
@@ -458,8 +437,7 @@ def search(
458437
print_json(success=True, result=payload)
459438
else:
460439
render_search_results(result.results, verbose=verbose)
461-
enrich_status = " | [green]Enriched[/green]" if enriched else ""
462-
console.print(f"[dim]Mode: {actual_mode} | Searched {result.stats.dirs_searched} directories in {result.stats.time_ms:.1f}ms{enrich_status}[/dim]")
440+
console.print(f"[dim]Mode: {actual_mode} | Searched {result.stats.dirs_searched} directories in {result.stats.time_ms:.1f}ms[/dim]")
463441

464442
except SearchError as exc:
465443
if json_mode:
@@ -1376,103 +1354,6 @@ def clean(
13761354
raise typer.Exit(code=1)
13771355

13781356

1379-
@app.command()
1380-
def graph(
1381-
query_type: str = typer.Argument(..., help="Query type: callers, callees, or inheritance"),
1382-
symbol: str = typer.Argument(..., help="Symbol name to query"),
1383-
path: Path = typer.Option(Path("."), "--path", "-p", help="Directory to search from."),
1384-
limit: int = typer.Option(50, "--limit", "-n", min=1, max=500, help="Max results."),
1385-
depth: int = typer.Option(-1, "--depth", "-d", help="Search depth (-1 = unlimited)."),
1386-
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
1387-
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
1388-
) -> None:
1389-
"""Query semantic graph for code relationships.
1390-
1391-
Supported query types:
1392-
- callers: Find all functions/methods that call the given symbol
1393-
- callees: Find all functions/methods called by the given symbol
1394-
- inheritance: Find inheritance relationships for the given class
1395-
1396-
Examples:
1397-
codex-lens graph callers my_function
1398-
codex-lens graph callees MyClass.method --path src/
1399-
codex-lens graph inheritance BaseClass
1400-
"""
1401-
_configure_logging(verbose)
1402-
search_path = path.expanduser().resolve()
1403-
1404-
# Validate query type
1405-
valid_types = ["callers", "callees", "inheritance"]
1406-
if query_type not in valid_types:
1407-
if json_mode:
1408-
print_json(success=False, error=f"Invalid query type: {query_type}. Must be one of: {', '.join(valid_types)}")
1409-
else:
1410-
console.print(f"[red]Invalid query type:[/red] {query_type}")
1411-
console.print(f"[dim]Valid types: {', '.join(valid_types)}[/dim]")
1412-
raise typer.Exit(code=1)
1413-
1414-
registry: RegistryStore | None = None
1415-
try:
1416-
registry = RegistryStore()
1417-
registry.initialize()
1418-
mapper = PathMapper()
1419-
1420-
engine = ChainSearchEngine(registry, mapper)
1421-
options = SearchOptions(depth=depth, total_limit=limit)
1422-
1423-
# Execute graph query based on type
1424-
if query_type == "callers":
1425-
results = engine.search_callers(symbol, search_path, options=options)
1426-
result_type = "callers"
1427-
elif query_type == "callees":
1428-
results = engine.search_callees(symbol, search_path, options=options)
1429-
result_type = "callees"
1430-
else: # inheritance
1431-
results = engine.search_inheritance(symbol, search_path, options=options)
1432-
result_type = "inheritance"
1433-
1434-
payload = {
1435-
"query_type": query_type,
1436-
"symbol": symbol,
1437-
"count": len(results),
1438-
"relationships": results
1439-
}
1440-
1441-
if json_mode:
1442-
print_json(success=True, result=payload)
1443-
else:
1444-
from .output import render_graph_results
1445-
render_graph_results(results, query_type=query_type, symbol=symbol)
1446-
1447-
except SearchError as exc:
1448-
if json_mode:
1449-
print_json(success=False, error=f"Graph search error: {exc}")
1450-
else:
1451-
console.print(f"[red]Graph query failed (search):[/red] {exc}")
1452-
raise typer.Exit(code=1)
1453-
except StorageError as exc:
1454-
if json_mode:
1455-
print_json(success=False, error=f"Storage error: {exc}")
1456-
else:
1457-
console.print(f"[red]Graph query failed (storage):[/red] {exc}")
1458-
raise typer.Exit(code=1)
1459-
except CodexLensError as exc:
1460-
if json_mode:
1461-
print_json(success=False, error=str(exc))
1462-
else:
1463-
console.print(f"[red]Graph query failed:[/red] {exc}")
1464-
raise typer.Exit(code=1)
1465-
except Exception as exc:
1466-
if json_mode:
1467-
print_json(success=False, error=f"Unexpected error: {exc}")
1468-
else:
1469-
console.print(f"[red]Graph query failed (unexpected):[/red] {exc}")
1470-
raise typer.Exit(code=1)
1471-
finally:
1472-
if registry is not None:
1473-
registry.close()
1474-
1475-
14761357
@app.command("semantic-list")
14771358
def semantic_list(
14781359
path: Path = typer.Option(Path("."), "--path", "-p", help="Project path to list metadata from."),

codex-lens/src/codexlens/cli/embedding_manager.py

Lines changed: 73 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,6 @@ def generate_embeddings(
194194
try:
195195
# Use cached embedder (singleton) for performance
196196
embedder = get_embedder(profile=model_profile)
197-
vector_store = VectorStore(index_path)
198197
chunker = Chunker(config=ChunkConfig(max_chunk_size=chunk_size))
199198

200199
if progress_callback:
@@ -217,85 +216,86 @@ def generate_embeddings(
217216
EMBEDDING_BATCH_SIZE = 8 # jina-embeddings-v2-base-code needs small batches
218217

219218
try:
220-
with sqlite3.connect(index_path) as conn:
221-
conn.row_factory = sqlite3.Row
222-
path_column = _get_path_column(conn)
223-
224-
# Get total file count for progress reporting
225-
total_files = conn.execute("SELECT COUNT(*) FROM files").fetchone()[0]
226-
if total_files == 0:
227-
return {"success": False, "error": "No files found in index"}
228-
229-
if progress_callback:
230-
progress_callback(f"Processing {total_files} files in batches of {FILE_BATCH_SIZE}...")
231-
232-
cursor = conn.execute(f"SELECT {path_column}, content, language FROM files")
233-
batch_number = 0
219+
with VectorStore(index_path) as vector_store:
220+
with sqlite3.connect(index_path) as conn:
221+
conn.row_factory = sqlite3.Row
222+
path_column = _get_path_column(conn)
234223

235-
while True:
236-
# Fetch a batch of files (streaming, not fetchall)
237-
file_batch = cursor.fetchmany(FILE_BATCH_SIZE)
238-
if not file_batch:
239-
break
224+
# Get total file count for progress reporting
225+
total_files = conn.execute("SELECT COUNT(*) FROM files").fetchone()[0]
226+
if total_files == 0:
227+
return {"success": False, "error": "No files found in index"}
240228

241-
batch_number += 1
242-
batch_chunks_with_paths = []
243-
files_in_batch_with_chunks = set()
229+
if progress_callback:
230+
progress_callback(f"Processing {total_files} files in batches of {FILE_BATCH_SIZE}...")
231+
232+
cursor = conn.execute(f"SELECT {path_column}, content, language FROM files")
233+
batch_number = 0
234+
235+
while True:
236+
# Fetch a batch of files (streaming, not fetchall)
237+
file_batch = cursor.fetchmany(FILE_BATCH_SIZE)
238+
if not file_batch:
239+
break
240+
241+
batch_number += 1
242+
batch_chunks_with_paths = []
243+
files_in_batch_with_chunks = set()
244+
245+
# Step 1: Chunking for the current file batch
246+
for file_row in file_batch:
247+
file_path = file_row[path_column]
248+
content = file_row["content"]
249+
language = file_row["language"] or "python"
250+
251+
try:
252+
chunks = chunker.chunk_sliding_window(
253+
content,
254+
file_path=file_path,
255+
language=language
256+
)
257+
if chunks:
258+
for chunk in chunks:
259+
batch_chunks_with_paths.append((chunk, file_path))
260+
files_in_batch_with_chunks.add(file_path)
261+
except Exception as e:
262+
logger.error(f"Failed to chunk {file_path}: {e}")
263+
failed_files.append((file_path, str(e)))
264+
265+
if not batch_chunks_with_paths:
266+
continue
267+
268+
batch_chunk_count = len(batch_chunks_with_paths)
269+
if progress_callback:
270+
progress_callback(f" Batch {batch_number}: {len(file_batch)} files, {batch_chunk_count} chunks")
271+
272+
# Step 2: Generate embeddings for this batch
273+
batch_embeddings = []
274+
try:
275+
for i in range(0, batch_chunk_count, EMBEDDING_BATCH_SIZE):
276+
batch_end = min(i + EMBEDDING_BATCH_SIZE, batch_chunk_count)
277+
batch_contents = [chunk.content for chunk, _ in batch_chunks_with_paths[i:batch_end]]
278+
embeddings = embedder.embed(batch_contents)
279+
batch_embeddings.extend(embeddings)
280+
except Exception as e:
281+
logger.error(f"Failed to generate embeddings for batch {batch_number}: {str(e)}")
282+
failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
283+
continue
244284

245-
# Step 1: Chunking for the current file batch
246-
for file_row in file_batch:
247-
file_path = file_row[path_column]
248-
content = file_row["content"]
249-
language = file_row["language"] or "python"
285+
# Step 3: Assign embeddings to chunks
286+
for (chunk, _), embedding in zip(batch_chunks_with_paths, batch_embeddings):
287+
chunk.embedding = embedding
250288

289+
# Step 4: Store this batch to database immediately (releases memory)
251290
try:
252-
chunks = chunker.chunk_sliding_window(
253-
content,
254-
file_path=file_path,
255-
language=language
256-
)
257-
if chunks:
258-
for chunk in chunks:
259-
batch_chunks_with_paths.append((chunk, file_path))
260-
files_in_batch_with_chunks.add(file_path)
291+
vector_store.add_chunks_batch(batch_chunks_with_paths)
292+
total_chunks_created += batch_chunk_count
293+
total_files_processed += len(files_in_batch_with_chunks)
261294
except Exception as e:
262-
logger.error(f"Failed to chunk {file_path}: {e}")
263-
failed_files.append((file_path, str(e)))
295+
logger.error(f"Failed to store batch {batch_number}: {str(e)}")
296+
failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
264297

265-
if not batch_chunks_with_paths:
266-
continue
267-
268-
batch_chunk_count = len(batch_chunks_with_paths)
269-
if progress_callback:
270-
progress_callback(f" Batch {batch_number}: {len(file_batch)} files, {batch_chunk_count} chunks")
271-
272-
# Step 2: Generate embeddings for this batch
273-
batch_embeddings = []
274-
try:
275-
for i in range(0, batch_chunk_count, EMBEDDING_BATCH_SIZE):
276-
batch_end = min(i + EMBEDDING_BATCH_SIZE, batch_chunk_count)
277-
batch_contents = [chunk.content for chunk, _ in batch_chunks_with_paths[i:batch_end]]
278-
embeddings = embedder.embed(batch_contents)
279-
batch_embeddings.extend(embeddings)
280-
except Exception as e:
281-
logger.error(f"Failed to generate embeddings for batch {batch_number}: {str(e)}")
282-
failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
283-
continue
284-
285-
# Step 3: Assign embeddings to chunks
286-
for (chunk, _), embedding in zip(batch_chunks_with_paths, batch_embeddings):
287-
chunk.embedding = embedding
288-
289-
# Step 4: Store this batch to database immediately (releases memory)
290-
try:
291-
vector_store.add_chunks_batch(batch_chunks_with_paths)
292-
total_chunks_created += batch_chunk_count
293-
total_files_processed += len(files_in_batch_with_chunks)
294-
except Exception as e:
295-
logger.error(f"Failed to store batch {batch_number}: {str(e)}")
296-
failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
297-
298-
# Memory is released here as batch_chunks_with_paths and batch_embeddings go out of scope
298+
# Memory is released here as batch_chunks_with_paths and batch_embeddings go out of scope
299299

300300
except Exception as e:
301301
return {"success": False, "error": f"Failed to read or process files: {str(e)}"}

codex-lens/src/codexlens/cli/output.py

Lines changed: 0 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -122,68 +122,3 @@ def render_file_inspect(path: str, language: str, symbols: Iterable[Symbol]) ->
122122
console.print(header)
123123
render_symbols(list(symbols), title="Discovered Symbols")
124124

125-
126-
def render_graph_results(results: list[dict[str, Any]], *, query_type: str, symbol: str) -> None:
127-
"""Render semantic graph query results.
128-
129-
Args:
130-
results: List of relationship dicts
131-
query_type: Type of query (callers, callees, inheritance)
132-
symbol: Symbol name that was queried
133-
"""
134-
if not results:
135-
console.print(f"[yellow]No {query_type} found for symbol:[/yellow] {symbol}")
136-
return
137-
138-
title_map = {
139-
"callers": f"Callers of '{symbol}' ({len(results)} found)",
140-
"callees": f"Callees of '{symbol}' ({len(results)} found)",
141-
"inheritance": f"Inheritance relationships for '{symbol}' ({len(results)} found)"
142-
}
143-
144-
table = Table(title=title_map.get(query_type, f"Graph Results ({len(results)})"))
145-
146-
if query_type == "callers":
147-
table.add_column("Caller", style="green")
148-
table.add_column("File", style="cyan", no_wrap=False, max_width=40)
149-
table.add_column("Line", justify="right", style="yellow")
150-
table.add_column("Type", style="dim")
151-
152-
for rel in results:
153-
table.add_row(
154-
rel.get("source_symbol", "-"),
155-
rel.get("source_file", "-"),
156-
str(rel.get("source_line", "-")),
157-
rel.get("relationship_type", "-")
158-
)
159-
160-
elif query_type == "callees":
161-
table.add_column("Target", style="green")
162-
table.add_column("File", style="cyan", no_wrap=False, max_width=40)
163-
table.add_column("Line", justify="right", style="yellow")
164-
table.add_column("Type", style="dim")
165-
166-
for rel in results:
167-
table.add_row(
168-
rel.get("target_symbol", "-"),
169-
rel.get("target_file", "-") if rel.get("target_file") else rel.get("source_file", "-"),
170-
str(rel.get("source_line", "-")),
171-
rel.get("relationship_type", "-")
172-
)
173-
174-
else: # inheritance
175-
table.add_column("Derived Class", style="green")
176-
table.add_column("Base Class", style="magenta")
177-
table.add_column("File", style="cyan", no_wrap=False, max_width=40)
178-
table.add_column("Line", justify="right", style="yellow")
179-
180-
for rel in results:
181-
table.add_row(
182-
rel.get("source_symbol", "-"),
183-
rel.get("target_symbol", "-"),
184-
rel.get("source_file", "-"),
185-
str(rel.get("source_line", "-"))
186-
)
187-
188-
console.print(table)
189-

0 commit comments

Comments (0)