Skip to content

Commit 214f586

Browse files
Committed
adds word count and token estimate to category files
1 parent: 5136bce · commit: 214f586

File tree

10 files changed

+34
-4
lines changed

10 files changed

+34
-4
lines changed

.ai/categories/basics.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
Begin New Bundle: Basics
2+
word_count: 57075
3+
estimated_tokens: 93024
24

35

46
---

.ai/categories/dapps.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
Begin New Bundle: dApps
22
Includes shared base categories: Basics, Reference
3+
word_count: 83174
4+
estimated_tokens: 142038
35

46

57
---

.ai/categories/infrastructure.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
Begin New Bundle: Infrastructure
22
Includes shared base categories: Basics, Reference
3+
word_count: 89525
4+
estimated_tokens: 150296
35

46

57
---

.ai/categories/networks.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
Begin New Bundle: Networks
22
Includes shared base categories: Basics, Reference
3+
word_count: 70170
4+
estimated_tokens: 119487
35

46

57
---

.ai/categories/parachains.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
Begin New Bundle: Parachains
22
Includes shared base categories: Basics, Reference
3+
word_count: 115001
4+
estimated_tokens: 197941
35

46

57
---

.ai/categories/polkadot-protocol.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
Begin New Bundle: Polkadot Protocol
22
Includes shared base categories: Basics, Reference
3+
word_count: 80127
4+
estimated_tokens: 133726
35

46

57
---

.ai/categories/reference.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
Begin New Bundle: Reference
2+
word_count: 13095
3+
estimated_tokens: 26463
24

35

46
---

.ai/categories/smart-contracts.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
Begin New Bundle: Smart Contracts
22
Includes shared base categories: Basics, Reference
3+
word_count: 90059
4+
estimated_tokens: 156139
35

46

57
---

.ai/categories/tooling.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
Begin New Bundle: Tooling
22
Includes shared base categories: Basics, Reference
3+
word_count: 130474
4+
estimated_tokens: 235609
35

46

57
---

scripts/generate_category_bundles.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ def load_all_pages(ai_dir: Path) -> List[AiPage]:
147147

148148

149149
# ----------------------------
150-
# Token estimation
150+
# Token estimation, word count
151151
# ----------------------------
152152

153153
def _heuristic_token_count(s: str) -> int:
@@ -179,6 +179,10 @@ def estimate_tokens(text: str, estimator: str = "heuristic-v1") -> int:
179179
return _heuristic_token_count(text)
180180

181181

182+
def word_count(text: str) -> int:
183+
return len(re.findall(r"\b\w+\b", text, flags=re.UNICODE))
184+
185+
182186
# ----------------------------
183187
# Category logic
184188
# ----------------------------
@@ -211,7 +215,8 @@ def union_pages(sets: List[List[AiPage]]) -> List[AiPage]:
211215
# ----------------------------
212216

213217
def write_markdown(out_path: Path, category: str, includes_base: bool,
214-
base_categories: List[str], pages: List[AiPage], raw_base: str) -> None:
218+
base_categories: List[str], pages: List[AiPage], raw_base: str,
219+
total_words: int, total_tokens: int) -> None:
215220
"""
216221
Concatenate pages into a single Markdown with clear boundaries.
217222
(Note: pages already contain headings; we avoid adding extra YAML to keep it simple.)
@@ -221,6 +226,8 @@ def write_markdown(out_path: Path, category: str, includes_base: bool,
221226
lines.append(f"Begin New Bundle: {category}")
222227
if includes_base:
223228
lines.append(f"Includes shared base categories: {', '.join(base_categories)}")
229+
lines.append(f"word_count: {total_words}")
230+
lines.append(f"estimated_tokens: {total_tokens}")
224231
lines.append("")
225232
for idx, p in enumerate(pages, 1):
226233
lines.append(f"\n---\n\nPage Title: {p.title}\n")
@@ -260,6 +267,7 @@ def build_category_bundles(config_path: str, fmt: str, dry_run: bool, limit: int
260267

261268
# Precompute token counts once per page
262269
page_tokens: Dict[str, int] = {p.slug: estimate_tokens(p.body, token_estimator) for p in pages}
270+
page_words: Dict[str, int] = {p.slug: word_count(p.body) for p in pages}
263271

264272
out_root = (repo_root / config.get("outputs", {}).get("public_root", "/.ai/").strip("/") / "categories").resolve()
265273

@@ -294,7 +302,9 @@ def build_category_bundles(config_path: str, fmt: str, dry_run: bool, limit: int
294302
else:
295303
out_root.mkdir(parents=True, exist_ok=True)
296304
if fmt in ("md", "all"):
297-
write_markdown(out_root / f"{cat_slug}.md", cat, False, base_cats, pages_out, raw_base)
305+
total_words = sum(page_words.get(p.slug, 0) for p in pages_out)
306+
total_tokens = sum(page_tokens.get(p.slug, 0) for p in pages_out)
307+
write_markdown(out_root / f"{cat_slug}.md", cat, False, base_cats, pages_out, raw_base, total_words, total_tokens)
298308
continue
299309

300310
# Non-base category: include base union + this category's pages (dedup)
@@ -307,7 +317,9 @@ def build_category_bundles(config_path: str, fmt: str, dry_run: bool, limit: int
307317
else:
308318
out_root.mkdir(parents=True, exist_ok=True)
309319
if fmt in ("md", "all"):
310-
write_markdown(out_root / f"{cat_slug}.md", cat, True, base_cats, pages_out, raw_base)
320+
total_words = sum(page_words.get(p.slug, 0) for p in pages_out)
321+
total_tokens = sum(page_tokens.get(p.slug, 0) for p in pages_out)
322+
write_markdown(out_root / f"{cat_slug}.md", cat, True, base_cats, pages_out, raw_base, total_words, total_tokens)
311323

312324
if dry_run:
313325
print("[dry-run] No files were written.")

0 commit comments

Comments (0)