Skip to content

Commit fa35ef8

Browse files
committed
fix
1 parent 6437b5b commit fa35ef8

File tree

1 file changed

+12
-1
lines changed

1 file changed

+12
-1
lines changed

scripts/translator.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,23 @@
1414

1515
MASTER_BRANCH = "master"
1616
VERBOSE = True
17-
MAX_TOKENS = 20000 #gpt-4-1106-preview
17+
# NOTE(review): presumably the token budget applied to prompts; the trailing
# comment ties the value to gpt-4-1106-preview -- confirm against the code
# that reads this constant.
MAX_TOKENS = 30000  # gpt-4-1106-preview
18+
# Literal marker text that _sanitize() removes from strings before they are
# tokenized.
DISALLOWED_SPECIAL = "<|endoftext|>"
19+
# Placeholder text that _sanitize() substitutes for DISALLOWED_SPECIAL.
REPLACEMENT_TOKEN = "<END_OF_TEXT>"
20+
21+
def _sanitize(text: str) -> str:
    """Return *text* with every DISALLOWED_SPECIAL occurrence swapped out.

    Each occurrence of the reserved marker is substituted with
    REPLACEMENT_TOKEN so the result can be handed to tiktoken's encoder or
    the OpenAI client.
    """
    # Splitting on the (non-empty) marker and re-joining with the placeholder
    # yields the same string as a left-to-right str.replace.
    pieces = text.split(DISALLOWED_SPECIAL)
    return REPLACEMENT_TOKEN.join(pieces)
1828

1929
def reportTokens(prompt, model):
    """Return the number of tokens *prompt* occupies for *model*.

    Args:
        prompt: Text to measure.
        model: Model name handed to ``tiktoken.encoding_for_model`` to select
            the encoding.

    Returns:
        Length of the token sequence tiktoken produces for the sanitized
        prompt.
    """
    encoding = tiktoken.encoding_for_model(model)
    prompt = _sanitize(prompt)
    # _sanitize() only neutralises "<|endoftext|>"; any other special-token
    # text in the prompt would still make encode() raise ValueError. An empty
    # disallowed_special turns that check off, so such text is counted as
    # ordinary text instead of aborting the run.
    return len(encoding.encode(prompt, disallowed_special=()))
2435

2536

0 commit comments

Comments
 (0)