Skip to content
2 changes: 2 additions & 0 deletions codeflash/code_utils/code_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

from codeflash.cli_cmds.console import logger

def encode_str(s: str, ratio: float = 0.75) -> str:
    """Return a prefix of *s* whose length approximates *s*'s token count.

    This is a cheap stand-in for a real tokenizer: it assumes roughly
    ``ratio`` tokens per character, so callers can use
    ``len(encode_str(code))`` as an estimated token count without paying
    for a tiktoken encode.  The 0.75 default is presumably derived from
    measuring gpt-4o's tiktoken encoding on sample code — TODO confirm
    against the measurement in ``tests/test_code_utils.py``.

    Args:
        s: Text whose token count should be estimated.
        ratio: Assumed tokens-per-character ratio (default 0.75; keeping
            the default preserves the original behavior for all callers).

    Returns:
        The first ``int(ratio * len(s))`` characters of ``s``.
    """
    return s[: int(ratio * len(s))]

def get_qualified_name(module_name: str, full_qualified_name: str) -> str:
if not full_qualified_name:
Expand Down
14 changes: 6 additions & 8 deletions codeflash/context/code_context_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,12 @@

import jedi
import libcst as cst
import tiktoken
from jedi.api.classes import Name
from libcst import CSTNode

from codeflash.cli_cmds.console import logger
from codeflash.code_utils.code_extractor import add_needed_imports_from_module, find_preexisting_objects
from codeflash.code_utils.code_utils import get_qualified_name, path_belongs_to_site_packages
from codeflash.code_utils.code_utils import get_qualified_name, path_belongs_to_site_packages, encode_str
from codeflash.context.unused_definition_remover import remove_unused_definitions_by_function_names
from codeflash.discovery.functions_to_optimize import FunctionToOptimize
from codeflash.models.models import (
Expand Down Expand Up @@ -73,8 +72,7 @@ def get_code_optimization_context(
)

# Handle token limits
tokenizer = tiktoken.encoding_for_model("gpt-4o")
final_read_writable_tokens = len(tokenizer.encode(final_read_writable_code))
final_read_writable_tokens = len(encode_str(final_read_writable_code))
if final_read_writable_tokens > optim_token_limit:
raise ValueError("Read-writable code has exceeded token limit, cannot proceed")

Expand All @@ -87,7 +85,7 @@ def get_code_optimization_context(
)
read_only_context_code = read_only_code_markdown.markdown

read_only_code_markdown_tokens = len(tokenizer.encode(read_only_context_code))
read_only_code_markdown_tokens = len(encode_str(read_only_context_code))
total_tokens = final_read_writable_tokens + read_only_code_markdown_tokens
if total_tokens > optim_token_limit:
logger.debug("Code context has exceeded token limit, removing docstrings from read-only code")
Expand All @@ -96,7 +94,7 @@ def get_code_optimization_context(
helpers_of_fto_dict, helpers_of_helpers_dict, project_root_path, remove_docstrings=True
)
read_only_context_code = read_only_code_no_docstring_markdown.markdown
read_only_code_no_docstring_markdown_tokens = len(tokenizer.encode(read_only_context_code))
read_only_code_no_docstring_markdown_tokens = len(encode_str(read_only_context_code))
total_tokens = final_read_writable_tokens + read_only_code_no_docstring_markdown_tokens
if total_tokens > optim_token_limit:
logger.debug("Code context has exceeded token limit, removing read-only code")
Expand All @@ -111,7 +109,7 @@ def get_code_optimization_context(
code_context_type=CodeContextType.TESTGEN,
)
testgen_context_code = testgen_code_markdown.code
testgen_context_code_tokens = len(tokenizer.encode(testgen_context_code))
testgen_context_code_tokens = len(encode_str(testgen_context_code))
if testgen_context_code_tokens > testgen_token_limit:
testgen_code_markdown = extract_code_string_context_from_files(
helpers_of_fto_dict,
Expand All @@ -121,7 +119,7 @@ def get_code_optimization_context(
code_context_type=CodeContextType.TESTGEN,
)
testgen_context_code = testgen_code_markdown.code
testgen_context_code_tokens = len(tokenizer.encode(testgen_context_code))
testgen_context_code_tokens = len(encode_str(testgen_context_code))
if testgen_context_code_tokens > testgen_token_limit:
raise ValueError("Testgen code context has exceeded token limit, cannot proceed")

Expand Down
17 changes: 17 additions & 0 deletions tests/test_code_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from unittest.mock import MagicMock, patch

import pytest
import tiktoken

from codeflash.code_utils.code_utils import (
cleanup_paths,
Expand All @@ -22,6 +23,22 @@
from codeflash.code_utils.concolic_utils import clean_concolic_tests
from codeflash.code_utils.coverage_utils import generate_candidates, prepare_coverage_files

def test_encode_str():
    """Characterization run, not a strict unit test.

    Measures the actual tokens-per-character ratio of gpt-4o's tiktoken
    encoding over the sample codebases, to sanity-check the fixed 0.75
    ratio used by ``encode_str``.  Prints mean/min/max for inspection.
    """
    codebases_to_try = Path(Path(__file__).parent.resolve() / "../code_to_optimize/").glob("**/*.py")
    # Hoisted out of the loop: the encoding is loop-invariant and
    # encoding_for_model() is comparatively expensive to build.
    tokenizer = tiktoken.encoding_for_model("gpt-4o")
    ratios = []
    for code_fn in codebases_to_try:
        code_str = code_fn.read_text(encoding="utf-8")
        if not code_str or "__init__.py" in str(code_fn):
            continue
        ratios.append(len(tokenizer.encode(code_str)) / len(code_str))
    if not ratios:
        # No sample files in this checkout: nothing to measure, and the
        # mean below would raise ZeroDivisionError.
        return
    # Every non-empty file yields at least one token, so ratios are positive;
    # this also gives the test a real assertion instead of passing vacuously.
    assert all(r > 0 for r in ratios)
    print(sum(ratios) / len(ratios), min(ratios), max(ratios))

@pytest.fixture
def multiple_existing_and_non_existing_files(tmp_path: Path) -> list[Path]:
Expand Down
Loading