Skip to content

Commit 0e04bf7

Browse files
committed
update generate category script to only output md format (will explore other formats in future ticket)
1 parent 5e4ff25 commit 0e04bf7

File tree

2 files changed

+4
-87
lines changed

2 files changed

+4
-87
lines changed

.ai/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ The scripts for LLM-related files generation are located in `polkadot-docs/scrip
4747
- **`generate_ai_pages.py`**: Creates one resolved Markdown file per documentation page and outputs them to the `/.ai/pages` directory.
4848
- **`generate_llms_txt.py`**: Creates the `llms.txt` site index file using the Markdown file URLs and outputs it to the `/polkadot-docs/` directory.
4949
- **`generate_site_index.py`**: Creates two full-site, content-related files:
50-
- `llms-full.jsonl`: This file contains the entire documentation site, enhanced with metadata for improved indexing and chunking, and replaces the previously used `llms-full.txt` file.
51-
- `site-index.json`: This lightweight version of the full documentation site uses content previews rather than full content bodies to allow for a smaller file size.
50+
- **`llms-full.jsonl`**: This file contains the entire documentation site, enhanced with metadata for improved indexing and chunking, and replaces the previously used `llms-full.txt` file.
51+
- **`site-index.json`**: This lightweight version of the full documentation site uses content previews rather than full content bodies to allow for a smaller file size.
5252
- **`generate_category_bundles.py`**: Bundles pages with the same category tag together, along with context via Basics and Reference categories, and outputs them to `/.ai/categories/` as Markdown files.
5353

5454
## FAQs

scripts/generate_category_bundles.py

Lines changed: 2 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -13,32 +13,6 @@
1313
1414
1515
Outputs (written under /.ai/categories/):
16-
- <category-slug>.manifest.json (when --format manifest/all)
17-
{
18-
"category": "Smart Contracts",
19-
"category_slug": "smart-contracts",
20-
"includes_base": true,
21-
"base_categories": ["Basics","Reference"],
22-
"count": 12,
23-
"estimated_token_count_total": 12345,
24-
"token_estimator": "heuristic-v1",
25-
"generated_at": "2025-09-02T12:34:56Z",
26-
"pages": [
27-
{
28-
"title": "Intro to XCM",
29-
"slug": "develop-interoperability-intro-to-xcm",
30-
"raw_md_url": "https://raw.githubusercontent.com/<org>/<repo>/<branch>/.ai/pages/<slug>.md",
31-
"html_url": "https://docs.example.com/develop/interoperability/intro-to-xcm/",
32-
"description": "Short lede…",
33-
"categories": ["Basics","Networks"],
34-
"estimated_token_count": 678
35-
}
36-
]
37-
}
38-
39-
- <category-slug>.bundle.jsonl (when --format jsonl/all)
40-
# One JSON object per line, INCLUDING full page content
41-
{"title":"…","slug":"…","html_url":"…","raw_md_url":"…","categories":[…],"description":"…","estimated_token_count":210,"token_estimator":"heuristic-v1","content":"<full markdown>"}
4216
4317
- <category-slug>.md (when --format md/all)
4418
# A single concatenated Markdown file with page boundaries and titles
@@ -233,58 +207,9 @@ def union_pages(sets: List[List[AiPage]]) -> List[AiPage]:
233207

234208

235209
# ----------------------------
236-
# Writers
210+
# Writer
237211
# ----------------------------
238212

239-
def write_manifest(out_path: Path, category: str, category_slug: str,
240-
includes_base: bool, base_categories: List[str],
241-
pages: List[AiPage], raw_base: str,
242-
estimator_label: str, page_tokens: Dict[str, int]) -> None:
243-
est_total = sum(page_tokens.get(p.slug, 0) for p in pages)
244-
record = {
245-
"category": category,
246-
"category_slug": category_slug,
247-
"includes_base": includes_base,
248-
"base_categories": base_categories,
249-
"count": len(pages),
250-
"estimated_token_count_total": est_total,
251-
"token_estimator": estimator_label,
252-
"generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
253-
"pages": [
254-
{
255-
"title": p.title,
256-
"slug": p.slug,
257-
"raw_md_url": f"{raw_base}/{p.slug}.md",
258-
"html_url": p.html_url,
259-
"description": p.description,
260-
"categories": p.categories,
261-
"estimated_token_count": page_tokens.get(p.slug, 0),
262-
}
263-
for p in pages
264-
],
265-
}
266-
out_path.parent.mkdir(parents=True, exist_ok=True)
267-
with open(out_path, "w", encoding="utf-8") as f:
268-
json.dump(record, f, indent=2, ensure_ascii=False)
269-
270-
def write_jsonl(out_path: Path, pages: List[AiPage], raw_base: str,
271-
estimator_label: str, page_tokens: Dict[str, int]) -> None:
272-
out_path.parent.mkdir(parents=True, exist_ok=True)
273-
with open(out_path, "w", encoding="utf-8") as f:
274-
for p in pages:
275-
obj = {
276-
"title": p.title,
277-
"slug": p.slug,
278-
"raw_md_url": f"{raw_base}/{p.slug}.md",
279-
"html_url": p.html_url,
280-
"categories": p.categories,
281-
"description": p.description,
282-
"estimated_token_count": page_tokens.get(p.slug, 0),
283-
"token_estimator": estimator_label,
284-
"content": p.body,
285-
}
286-
f.write(json.dumps(obj, ensure_ascii=False) + "\n")
287-
288213
def write_markdown(out_path: Path, category: str, includes_base: bool,
289214
base_categories: List[str], pages: List[AiPage], raw_base: str) -> None:
290215
"""
@@ -368,10 +293,6 @@ def build_category_bundles(config_path: str, fmt: str, dry_run: bool, limit: int
368293
print(f"[dry-run] base bundle: {cat} ({len(pages_out)} pages)")
369294
else:
370295
out_root.mkdir(parents=True, exist_ok=True)
371-
if fmt in ("manifest", "all"):
372-
write_manifest(out_root / f"{cat_slug}.manifest.json", cat, cat_slug, False, base_cats, pages_out, raw_base, token_estimator, page_tokens)
373-
if fmt in ("jsonl", "all"):
374-
write_jsonl(out_root / f"{cat_slug}.bundle.jsonl", pages_out, raw_base, token_estimator, page_tokens)
375296
if fmt in ("md", "all"):
376297
write_markdown(out_root / f"{cat_slug}.md", cat, False, base_cats, pages_out, raw_base)
377298
continue
@@ -385,10 +306,6 @@ def build_category_bundles(config_path: str, fmt: str, dry_run: bool, limit: int
385306
print(f"[dry-run] category bundle: {cat} (base+cat={len(pages_out)} pages; base={len(base_union)} cat_only={len(pages_cat)})")
386307
else:
387308
out_root.mkdir(parents=True, exist_ok=True)
388-
if fmt in ("manifest", "all"):
389-
write_manifest(out_root / f"{cat_slug}.manifest.json", cat, cat_slug, True, base_cats, pages_out, raw_base, token_estimator, page_tokens)
390-
if fmt in ("jsonl", "all"):
391-
write_jsonl(out_root / f"{cat_slug}.bundle.jsonl", pages_out, raw_base, token_estimator, page_tokens)
392309
if fmt in ("md", "all"):
393310
write_markdown(out_root / f"{cat_slug}.md", cat, True, base_cats, pages_out, raw_base)
394311

@@ -401,7 +318,7 @@ def build_category_bundles(config_path: str, fmt: str, dry_run: bool, limit: int
401318
def main():
402319
parser = argparse.ArgumentParser(description="Build category-based bundles from /.ai/pages/*.md")
403320
parser.add_argument("--config", default="llms_config.json", help="Path to llms_config.json (default: scripts/llms_config.json)")
404-
parser.add_argument("--format", choices=["manifest", "jsonl", "md", "all"], default="md",
321+
parser.add_argument("--format", choices=["md", "all"], default="md",
405322
help="Output format to generate (default: md)")
406323
parser.add_argument("--dry-run", action="store_true", help="Show what would be generated; do not write files")
407324
parser.add_argument("--limit", type=int, default=0, help="Limit pages loaded (0=all) for dry-run sanity")

0 commit comments

Comments
 (0)