Skip to content

Commit 717676e

Browse files
author
Peter
committed
fix(ingestion): prevent stdlib contamination by re-resolving version in Tier 1 and validating rustdoc JSON

- Re-resolve (version, url) just before Tier 1 download
- Proper stdlib handling via construct_stdlib_url; clean fallback on 404
- Only set tier=rustdoc_json after content sanity check
- Add ingest re-exports (get_stdlib_url, normalize_item_type, resolve_parent_id)
- normalize_item_type: preserve unknown string kinds as-is
1 parent 9f5384c commit 717676e

File tree

6 files changed

+1406
-5
lines changed

6 files changed

+1406
-5
lines changed

.code/agents/352185e2-d05d-4cae-bd90-524c2c34cb87/result.txt

Lines changed: 371 additions & 0 deletions
Large diffs are not rendered by default.

.code/agents/d57ce572-3055-42b1-8971-299a9c8158a7/result.txt

Lines changed: 974 additions & 0 deletions
Large diffs are not rendered by default.

src/docsrs_mcp/ingest.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
batch_examples,
2121
# Rustdoc parsing
2222
build_module_hierarchy,
23+
normalize_item_type,
24+
resolve_parent_id,
2325
# Cache management
2426
calculate_cache_size,
2527
calculate_example_hash,
@@ -29,6 +31,8 @@
2931
cleanup_embedding_model,
3032
# Version resolution
3133
construct_stdlib_url,
34+
# Backward-compat alias expected by tests
35+
construct_stdlib_url as get_stdlib_url,
3236
decompress_content,
3337
download_rustdoc,
3438
evict_cache_if_needed,
@@ -88,6 +92,8 @@
8892
"download_rustdoc",
8993
"decompress_content",
9094
"construct_stdlib_url",
95+
# Backward-compat alias
96+
"get_stdlib_url",
9197
# Rustdoc parsing
9298
"parse_rustdoc_items_streaming",
9399
"parse_rustdoc_items",

src/docsrs_mcp/ingestion/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
build_module_hierarchy,
3838
parse_rustdoc_items,
3939
parse_rustdoc_items_streaming,
40+
normalize_item_type,
4041
resolve_parent_id,
4142
)
4243
from .signature_extractor import (
@@ -97,6 +98,7 @@
9798
"parse_rustdoc_items_streaming",
9899
"parse_rustdoc_items",
99100
"build_module_hierarchy",
101+
"normalize_item_type",
100102
"resolve_parent_id",
101103
# Signature extraction
102104
"format_signature",

src/docsrs_mcp/ingestion/ingest_orchestrator.py

Lines changed: 51 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
download_rustdoc,
5757
fetch_crate_info,
5858
is_stdlib_crate,
59+
construct_stdlib_url,
5960
resolve_stdlib_version,
6061
resolve_version,
6162
)
@@ -556,18 +557,64 @@ async def ingest_crate(crate_name: str, version: str | None = None) -> Path:
556557
try:
557558
await set_ingestion_status(db_path, crate_id, "downloading")
558559

560+
# Always ensure we have a concrete version and URL right before download
561+
tier1_url: str | None = None
562+
559563
if is_stdlib:
560-
# For stdlib, try docs.rs URL (expected to fail but worth trying)
561-
rustdoc_url = (
562-
f"https://docs.rs/{crate_name}/latest/{crate_name}.json"
564+
# docs.rs generally doesn't host stdlib JSON; construct a throwaway URL
565+
# so that RustdocVersionNotFoundError can drive clean fallback without
566+
# contaminating state.
567+
try:
568+
tier1_url = construct_stdlib_url(crate_name, resolved_version)
569+
except Exception:
570+
tier1_url = None
571+
else:
572+
# Use any previously resolved URL if available; otherwise resolve now
573+
try:
574+
prior_url = rustdoc_url # may be set earlier during initial resolve
575+
except NameError:
576+
prior_url = None
577+
578+
needs_resolution = (
579+
not resolved_version
580+
or str(resolved_version).strip().lower() in ("", "latest")
581+
or not prior_url
563582
)
564583

584+
if needs_resolution:
585+
try:
586+
resolved_version, tier1_url = await resolve_version(
587+
session, crate_name, version
588+
)
589+
except Exception as e:
590+
logger.info(
591+
f"Tier1: resolve_version retry skipped/failed for {crate_name}@{version}: {e}"
592+
)
593+
tier1_url = prior_url
594+
else:
595+
tier1_url = prior_url
596+
597+
# Final guards before attempting download
598+
if not isinstance(resolved_version, str) or not resolved_version:
599+
resolved_version = version or "latest"
600+
601+
logger.info(
602+
f"Tier1: attempting rustdoc download crate={crate_name} resolved_version={resolved_version} url={tier1_url} stdlib={is_stdlib}"
603+
)
604+
565605
raw_content, used_url = await download_rustdoc(
566-
session, crate_name, resolved_version, rustdoc_url
606+
session, crate_name, resolved_version, tier1_url
567607
)
568608

569609
json_content = await decompress_content(raw_content, used_url)
570610

611+
# Sanity check before marking tier success to avoid contamination
612+
if not json_content or (
613+
isinstance(json_content, str)
614+
and ('"index"' not in json_content and '"paths"' not in json_content)
615+
):
616+
raise Exception("Rustdoc JSON content appears invalid/empty")
617+
571618
ingestion_tier = IngestionTier.RUSTDOC_JSON
572619

573620
# Parse and store

src/docsrs_mcp/ingestion/rustdoc_parser.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -524,7 +524,8 @@ def normalize_item_type(kind: dict | str) -> str:
524524
if key in kind_str:
525525
return value
526526

527-
return "unknown"
527+
# Preserve unknown string kinds as-is; use "unknown" for dicts or empty
528+
return kind_str if not isinstance(kind, dict) else "unknown"
528529

529530

530531
def extract_item_path(item: dict) -> str:

0 commit comments

Comments
 (0)