This repository was archived by the owner on Mar 6, 2024. It is now read-only.

Commit b6b5107

get tokens properly
1 parent 575bed8 commit b6b5107

4 files changed: 37 additions and 25 deletions (dist/index.js, src/bot.ts, src/review.ts, src/tokenizer.ts)

dist/index.js

Lines changed: 18 additions & 12 deletions
Some generated files are not rendered by default.

src/bot.ts

Lines changed: 4 additions & 2 deletions
@@ -8,6 +8,7 @@ import {
   SendMessageBrowserOptions,
   SendMessageOptions
 } from 'chatgpt'
+import * as tokenizer from './tokenizer'
 
 // define type to save parentMessageId and conversationId
 export type Ids = {
@@ -49,15 +50,16 @@ export class Bot {
   }
 
   chat = async (message: string, ids: Ids): Promise<[string, Ids]> => {
-    console.time(`chatgpt ${message.length} tokens cost`)
+    const tokens = tokenizer.get_token_count(message)
+    console.time(`chatgpt ${tokens} tokens cost`)
     let new_ids: Ids = {}
     let response = ''
     try {
       ;[response, new_ids] = await this.chat_(message, ids)
     } catch (e: any) {
       core.warning(`Failed to chat: ${e}, backtrace: ${e.stack}`)
     } finally {
-      console.timeEnd(`chatgpt ${message.length} tokens cost`)
+      console.timeEnd(`chatgpt ${tokens} tokens cost`)
       return [response, new_ids]
     }
   }
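
Note (not part of the commit): the change above swaps the character count (`message.length`) in the timer label for a real token count. Node's `console.time`/`console.timeEnd` pair timers by their exact label string, so both calls interpolate the same precomputed value; a minimal sketch of that pattern, with an illustrative literal in place of the computed count:

// Illustrative sketch: console.time and console.timeEnd match on the exact
// label string, so the same interpolated token count must appear in both.
const tokens = 42 // assumed to come from tokenizer.get_token_count(message)
console.time(`chatgpt ${tokens} tokens cost`)
// ... the ChatGPT request would run here ...
console.timeEnd(`chatgpt ${tokens} tokens cost`) // logs elapsed time for this label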

src/review.ts

Lines changed: 3 additions & 11 deletions
@@ -1,13 +1,10 @@
 import * as core from '@actions/core'
 import * as github from '@actions/github'
-import {get_encoding} from '@dqbd/tiktoken'
 import {Octokit} from '@octokit/action'
 import {Bot} from './bot.js'
 import {Commenter} from './commenter.js'
 import {Inputs, Options, Prompts} from './options.js'
-
-// TODO: make this configurable
-const tokenizer = get_encoding('cl100k_base')
+import * as tokenizer from './tokenizer'
 
 const token = core.getInput('token')
   ? core.getInput('token')
@@ -134,7 +131,7 @@ export const codeReview = async (
     next_review_ids = review_begin_ids
 
     if (file_content.length > 0) {
-      const file_content_tokens = await get_token_count(file_content)
+      const file_content_tokens = tokenizer.get_token_count(file_content)
       if (file_content_tokens < MAX_TOKENS_FOR_EXTRA_CONTENT) {
         // review file
         const [resp, review_file_ids] = await bot.chat(
@@ -154,7 +151,7 @@ export const codeReview = async (
      }
 
     if (file_diff.length > 0) {
-      const file_diff_tokens = await get_token_count(file_diff)
+      const file_diff_tokens = tokenizer.get_token_count(file_diff)
       if (file_diff_tokens < MAX_TOKENS_FOR_EXTRA_CONTENT) {
         // review diff
         const [resp, review_diff_ids] = await bot.chat(
@@ -328,8 +325,3 @@ const patch_comment_line = (patch: string): number => {
     return -1
   }
 }
-
-const get_token_count = async (text: string): Promise<number> => {
-  text = text.replace(/<\|endoftext\|>/g, '')
-  return tokenizer.encode(text).length
-}
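
The shared helper is synchronous, so the former `await get_token_count(...)` calls become plain function calls. A minimal, self-contained sketch of the gating pattern (the threshold value and sample content are assumptions for illustration; the real `MAX_TOKENS_FOR_EXTRA_CONTENT` is defined elsewhere in review.ts and not shown in this diff):

import * as tokenizer from './tokenizer'

// Hypothetical threshold for illustration only; the real constant lives in review.ts.
const MAX_TOKENS_FOR_EXTRA_CONTENT = 2500

const file_content = 'example file contents to be reviewed'
// No await: tokenizer.get_token_count is synchronous.
const file_content_tokens = tokenizer.get_token_count(file_content)
if (file_content_tokens < MAX_TOKENS_FOR_EXTRA_CONTENT) {
  // small enough to send the whole file as extra context for the review
}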

src/tokenizer.ts

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+import {get_encoding} from '@dqbd/tiktoken'
+
+const tokenizer = get_encoding('cl100k_base')
+
+export function encode(input: string): Uint32Array {
+  return tokenizer.encode(input)
+}
+
+export function get_token_count(input: string): number {
+  input = input.replace(/<\|endoftext\|>/g, '')
+  return encode(input).length
+}
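
A minimal usage sketch of the new module (the sample strings are illustrative; `encode` and `get_token_count` are the two exports added above):

import * as tokenizer from './tokenizer'

// get_token_count strips the <|endoftext|> special token before encoding,
// so only ordinary text contributes to the count.
const count = tokenizer.get_token_count('Review this change <|endoftext|> carefully.')
console.log(`token count: ${count}`)

// encode returns the raw cl100k_base token ids as a Uint32Array.
const ids = tokenizer.encode('hello world')
console.log(`first token id: ${ids[0]}, total tokens: ${ids.length}`)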

0 commit comments
