@@ -147,7 +147,7 @@ def load_all_pages(ai_dir: Path) -> List[AiPage]:
147147
148148
149149# ----------------------------
150- # Token estimation
150+ # Token estimation, word count
151151# ----------------------------
152152
153153def _heuristic_token_count(s: str) -> int:
@@ -179,6 +179,10 @@ def estimate_tokens(text: str, estimator: str = "heuristic-v1") -> int:
179179 return _heuristic_token_count(text)
180180
181181
def word_count(text: str) -> int:
    """Return the number of words in *text*.

    A word is a maximal run of word characters (letters, digits and
    underscore, Unicode-aware for ``str`` input).  Any other character
    separates words, so ``"don't"`` and ``"a-b"`` each count as two words
    while ``"snake_case"`` counts as one.  Empty or whitespace-only text
    yields 0.
    """
    # A greedy word-character run already starts and ends on word boundaries,
    # so explicit boundary anchors are unnecessary; str patterns are
    # Unicode-aware by default in Python 3, so no flag is needed either.
    return len(re.findall(r"\w+", text))
184+
185+
182186# ----------------------------
183187# Category logic
184188# ----------------------------
@@ -211,7 +215,8 @@ def union_pages(sets: List[List[AiPage]]) -> List[AiPage]:
211215# ----------------------------
212216
213217def write_markdown(out_path: Path, category: str, includes_base: bool,
214- base_categories: List[str], pages: List[AiPage], raw_base: str) -> None:
218+ base_categories: List[str], pages: List[AiPage], raw_base: str,
219+ total_words: int, total_tokens: int) -> None:
215220 """
216221 Concatenate pages into a single Markdown with clear boundaries.
217222 (Note: pages already contain headings; we avoid adding extra YAML to keep it simple.)
@@ -221,6 +226,8 @@ def write_markdown(out_path: Path, category: str, includes_base: bool,
221226 lines.append(f"Begin New Bundle: {category}")
222227 if includes_base:
223228 lines.append(f"Includes shared base categories: {', '.join(base_categories)}")
229+ lines.append(f"word_count: {total_words}")
230+ lines.append(f"estimated_tokens: {total_tokens}")
224231 lines.append("")
225232 for idx, p in enumerate(pages, 1):
226233 lines.append(f"\n---\n\nPage Title: {p.title}\n")
@@ -260,6 +267,7 @@ def build_category_bundles(config_path: str, fmt: str, dry_run: bool, limit: int
260267
261268 # Precompute token counts once per page
262269 page_tokens: Dict[str, int] = {p.slug: estimate_tokens(p.body, token_estimator) for p in pages}
270+ page_words: Dict[str, int] = {p.slug: word_count(p.body) for p in pages}
263271
264272 out_root = (repo_root / config.get("outputs", {}).get("public_root", "/.ai/").strip("/") / "categories").resolve()
265273
@@ -294,7 +302,9 @@ def build_category_bundles(config_path: str, fmt: str, dry_run: bool, limit: int
294302 else:
295303 out_root.mkdir(parents=True, exist_ok=True)
296304 if fmt in ("md", "all"):
297- write_markdown(out_root / f"{cat_slug}.md", cat, False, base_cats, pages_out, raw_base)
305+ total_words = sum(page_words.get(p.slug, 0) for p in pages_out)
306+ total_tokens = sum(page_tokens.get(p.slug, 0) for p in pages_out)
307+ write_markdown(out_root / f"{cat_slug}.md", cat, False, base_cats, pages_out, raw_base, total_words, total_tokens)
298308 continue
299309
300310 # Non-base category: include base union + this category's pages (dedup)
@@ -307,7 +317,9 @@ def build_category_bundles(config_path: str, fmt: str, dry_run: bool, limit: int
307317 else:
308318 out_root.mkdir(parents=True, exist_ok=True)
309319 if fmt in ("md", "all"):
310- write_markdown(out_root / f"{cat_slug}.md", cat, True, base_cats, pages_out, raw_base)
320+ total_words = sum(page_words.get(p.slug, 0) for p in pages_out)
321+ total_tokens = sum(page_tokens.get(p.slug, 0) for p in pages_out)
322+ write_markdown(out_root / f"{cat_slug}.md", cat, True, base_cats, pages_out, raw_base, total_words, total_tokens)
311323
312324 if dry_run:
313325 print("[dry-run] No files were written.")
0 commit comments