Skip to content

Commit 214f586

Browse files
Committed
adds word count and token estimate to category files
1 parent: 5136bce · commit: 214f586

File tree

10 files changed

+34
-4
lines changed

10 files changed

+34
-4
lines changed

.ai/categories/basics.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
Begin New Bundle: Basics
2+
word_count: 57075
3+
estimated_tokens: 93024
24

35

46
---

.ai/categories/dapps.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
Begin New Bundle: dApps
22
Includes shared base categories: Basics, Reference
3+
word_count: 83174
4+
estimated_tokens: 142038
35

46

57
---

.ai/categories/infrastructure.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
Begin New Bundle: Infrastructure
22
Includes shared base categories: Basics, Reference
3+
word_count: 89525
4+
estimated_tokens: 150296
35

46

57
---

.ai/categories/networks.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
Begin New Bundle: Networks
22
Includes shared base categories: Basics, Reference
3+
word_count: 70170
4+
estimated_tokens: 119487
35

46

57
---

.ai/categories/parachains.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
Begin New Bundle: Parachains
22
Includes shared base categories: Basics, Reference
3+
word_count: 115001
4+
estimated_tokens: 197941
35

46

57
---

.ai/categories/polkadot-protocol.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
Begin New Bundle: Polkadot Protocol
22
Includes shared base categories: Basics, Reference
3+
word_count: 80127
4+
estimated_tokens: 133726
35

46

57
---

.ai/categories/reference.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
Begin New Bundle: Reference
2+
word_count: 13095
3+
estimated_tokens: 26463
24

35

46
---

.ai/categories/smart-contracts.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
Begin New Bundle: Smart Contracts
22
Includes shared base categories: Basics, Reference
3+
word_count: 90059
4+
estimated_tokens: 156139
35

46

57
---

.ai/categories/tooling.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
Begin New Bundle: Tooling
22
Includes shared base categories: Basics, Reference
3+
word_count: 130474
4+
estimated_tokens: 235609
35

46

57
---

scripts/generate_category_bundles.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ def load_all_pages(ai_dir: Path) -> List[AiPage]:
147147

148148

149149
# ----------------------------
150-
# Token estimation
150+
# Token estimation, word count
151151
# ----------------------------
152152

153153
def _heuristic_token_count(s: str) -> int:
@@ -179,6 +179,10 @@ def estimate_tokens(text: str, estimator: str = "heuristic-v1") -> int:
179179
return _heuristic_token_count(text)
180180

181181

182+
def word_count(text: str) -> int:
183+
return len(re.findall(r"\b\w+\b", text, flags=re.UNICODE))
184+
185+
182186
# ----------------------------
183187
# Category logic
184188
# ----------------------------
@@ -211,7 +215,8 @@ def union_pages(sets: List[List[AiPage]]) -> List[AiPage]:
211215
# ----------------------------
212216

213217
def write_markdown(out_path: Path, category: str, includes_base: bool,
214-
base_categories: List[str], pages: List[AiPage], raw_base: str) -> None:
218+
base_categories: List[str], pages: List[AiPage], raw_base: str,
219+
total_words: int, total_tokens: int) -> None:
215220
"""
216221
Concatenate pages into a single Markdown with clear boundaries.
217222
(Note: pages already contain headings; we avoid adding extra YAML to keep it simple.)
@@ -221,6 +226,8 @@ def write_markdown(out_path: Path, category: str, includes_base: bool,
221226
lines.append(f"Begin New Bundle: {category}")
222227
if includes_base:
223228
lines.append(f"Includes shared base categories: {', '.join(base_categories)}")
229+
lines.append(f"word_count: {total_words}")
230+
lines.append(f"estimated_tokens: {total_tokens}")
224231
lines.append("")
225232
for idx, p in enumerate(pages, 1):
226233
lines.append(f"\n---\n\nPage Title: {p.title}\n")
@@ -260,6 +267,7 @@ def build_category_bundles(config_path: str, fmt: str, dry_run: bool, limit: int
260267

261268
# Precompute token counts once per page
262269
page_tokens: Dict[str, int] = {p.slug: estimate_tokens(p.body, token_estimator) for p in pages}
270+
page_words: Dict[str, int] = {p.slug: word_count(p.body) for p in pages}
263271

264272
out_root = (repo_root / config.get("outputs", {}).get("public_root", "/.ai/").strip("/") / "categories").resolve()
265273

@@ -294,7 +302,9 @@ def build_category_bundles(config_path: str, fmt: str, dry_run: bool, limit: int
294302
else:
295303
out_root.mkdir(parents=True, exist_ok=True)
296304
if fmt in ("md", "all"):
297-
write_markdown(out_root / f"{cat_slug}.md", cat, False, base_cats, pages_out, raw_base)
305+
total_words = sum(page_words.get(p.slug, 0) for p in pages_out)
306+
total_tokens = sum(page_tokens.get(p.slug, 0) for p in pages_out)
307+
write_markdown(out_root / f"{cat_slug}.md", cat, False, base_cats, pages_out, raw_base, total_words, total_tokens)
298308
continue
299309

300310
# Non-base category: include base union + this category's pages (dedup)
@@ -307,7 +317,9 @@ def build_category_bundles(config_path: str, fmt: str, dry_run: bool, limit: int
307317
else:
308318
out_root.mkdir(parents=True, exist_ok=True)
309319
if fmt in ("md", "all"):
310-
write_markdown(out_root / f"{cat_slug}.md", cat, True, base_cats, pages_out, raw_base)
320+
total_words = sum(page_words.get(p.slug, 0) for p in pages_out)
321+
total_tokens = sum(page_tokens.get(p.slug, 0) for p in pages_out)
322+
write_markdown(out_root / f"{cat_slug}.md", cat, True, base_cats, pages_out, raw_base, total_words, total_tokens)
311323

312324
if dry_run:
313325
print("[dry-run] No files were written.")

0 commit comments

Comments (0)