Skip to content

Commit 83659a9

Browse files
committed
update token function per feedback from Copilot
1 parent 214f586 commit 83659a9

File tree

1 file changed

+0
-12
lines changed

1 file changed

+0
-12
lines changed

scripts/generate_category_bundles.py

Lines changed: 0 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -151,18 +151,9 @@ def load_all_pages(ai_dir: Path) -> List[AiPage]:
 # ----------------------------
 
 def _heuristic_token_count(s: str) -> int:
-    """
-    Dependency-free token estimate:
-    - counts words and standalone punctuation
-    - decent for prose and code; model-agnostic
-    """
     return len(re.findall(r"\w+|[^\s\w]", s, flags=re.UNICODE))
 
 def _cl100k_token_count(s: str) -> int:
-    """
-    Optional: if tiktoken is installed and estimator name is 'cl100k',
-    compute tokens via cl100k_base; otherwise fall back to heuristic.
-    """
     try:
         import tiktoken # type: ignore
         enc = tiktoken.get_encoding("cl100k_base")
@@ -171,11 +162,8 @@ def _cl100k_token_count(s: str) -> int:
         return _heuristic_token_count(s)
 
 def estimate_tokens(text: str, estimator: str = "heuristic-v1") -> int:
-    if estimator == "heuristic-v1":
-        return _heuristic_token_count(text)
     if estimator == "cl100k":
         return _cl100k_token_count(text)
-    # Unknown/custom estimator name → compute via heuristic but keep the label in outputs.
     return _heuristic_token_count(text)
 
 
0 commit comments

Comments
 (0)