Skip to content

Commit 06db547

Browse files
authored
Use a WASM-based tiktoken implementation (RooCodeInc#2859)
* Use a WASM-based tiktoken implementation
* Clean up imports
1 parent 7e76736 commit 06db547

File tree

5 files changed

+37
-26
lines changed

5 files changed

+37
-26
lines changed

esbuild.js

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,23 @@ const copyWasmFiles = {
2929
name: "copy-wasm-files",
3030
setup(build) {
3131
build.onEnd(() => {
32-
// tree sitter
33-
const sourceDir = path.join(__dirname, "node_modules", "web-tree-sitter")
34-
const targetDir = path.join(__dirname, "dist")
35-
36-
// Copy tree-sitter.wasm
37-
fs.copyFileSync(path.join(sourceDir, "tree-sitter.wasm"), path.join(targetDir, "tree-sitter.wasm"))
38-
39-
// Copy language-specific WASM files
40-
const languageWasmDir = path.join(__dirname, "node_modules", "tree-sitter-wasms", "out")
32+
const nodeModulesDir = path.join(__dirname, "node_modules")
33+
const distDir = path.join(__dirname, "dist")
34+
35+
// tiktoken
36+
fs.copyFileSync(
37+
path.join(nodeModulesDir, "tiktoken", "tiktoken_bg.wasm"),
38+
path.join(distDir, "tiktoken_bg.wasm"),
39+
)
40+
41+
// tree-sitter WASM
42+
fs.copyFileSync(
43+
path.join(nodeModulesDir, "web-tree-sitter", "tree-sitter.wasm"),
44+
path.join(distDir, "tree-sitter.wasm"),
45+
)
46+
47+
// language-specific tree-sitter WASMs
48+
const languageWasmDir = path.join(nodeModulesDir, "tree-sitter-wasms", "out")
4149
const languages = [
4250
"typescript",
4351
"tsx",
@@ -57,7 +65,7 @@ const copyWasmFiles = {
5765

5866
languages.forEach((lang) => {
5967
const filename = `tree-sitter-${lang}.wasm`
60-
fs.copyFileSync(path.join(languageWasmDir, filename), path.join(targetDir, filename))
68+
fs.copyFileSync(path.join(languageWasmDir, filename), path.join(distDir, filename))
6169
})
6270
})
6371
},

package-lock.json

Lines changed: 7 additions & 9 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -429,7 +429,6 @@
429429
"get-folder-size": "^5.0.0",
430430
"i18next": "^24.2.2",
431431
"isbinaryfile": "^5.0.2",
432-
"js-tiktoken": "^1.0.19",
433432
"mammoth": "^1.8.0",
434433
"monaco-vscode-textmate-theme-converter": "^0.1.7",
435434
"node-cache": "^5.1.2",
@@ -451,6 +450,7 @@
451450
"string-similarity": "^4.0.4",
452451
"strip-ansi": "^7.1.0",
453452
"strip-bom": "^5.0.0",
453+
"tiktoken": "^1.0.21",
454454
"tmp": "^0.2.3",
455455
"tree-sitter-wasms": "^0.1.11",
456456
"turndown": "^7.2.0",

src/api/providers/base-provider.ts

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@ import { Anthropic } from "@anthropic-ai/sdk"
22
import { ApiHandler } from ".."
33
import { ModelInfo } from "../../shared/api"
44
import { ApiStream } from "../transform/stream"
5-
import { Tiktoken } from "js-tiktoken/lite"
6-
import o200kBase from "js-tiktoken/ranks/o200k_base"
5+
import { Tiktoken } from "tiktoken/lite"
6+
import o200kBase from "tiktoken/encoders/o200k_base"
77

88
// Reuse the fudge factor used in the original code
99
const TOKEN_FUDGE_FACTOR = 1.5
@@ -34,21 +34,23 @@ export abstract class BaseProvider implements ApiHandler {
3434

3535
// Lazily create and cache the encoder if it doesn't exist
3636
if (!this.encoder) {
37-
this.encoder = new Tiktoken(o200kBase)
37+
this.encoder = new Tiktoken(o200kBase.bpe_ranks, o200kBase.special_tokens, o200kBase.pat_str)
3838
}
3939

4040
// Process each content block using the cached encoder
4141
for (const block of content) {
4242
if (block.type === "text") {
4343
// Use tiktoken for text token counting
4444
const text = block.text || ""
45+
4546
if (text.length > 0) {
4647
const tokens = this.encoder.encode(text)
4748
totalTokens += tokens.length
4849
}
4950
} else if (block.type === "image") {
5051
// For images, calculate based on data size
5152
const imageSource = block.source
53+
5254
if (imageSource && typeof imageSource === "object" && "data" in imageSource) {
5355
const base64Data = imageSource.data as string
5456
totalTokens += Math.ceil(Math.sqrt(base64Data.length))

src/core/sliding-window/__tests__/sliding-window.test.ts

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,13 @@
33
import { Anthropic } from "@anthropic-ai/sdk"
44

55
import { ModelInfo } from "../../../shared/api"
6-
import { ApiHandler } from "../../../api"
76
import { BaseProvider } from "../../../api/providers/base-provider"
8-
import { TOKEN_BUFFER_PERCENTAGE } from "../index"
9-
import { estimateTokenCount, truncateConversation, truncateConversationIfNeeded } from "../index"
7+
import {
8+
TOKEN_BUFFER_PERCENTAGE,
9+
estimateTokenCount,
10+
truncateConversation,
11+
truncateConversationIfNeeded,
12+
} from "../index"
1013

1114
// Create a mock ApiHandler for testing
1215
class MockApiHandler extends BaseProvider {

0 commit comments

Comments
 (0)