From ed1006e89a0e1e0a418c0789c7d65dc90040c33d Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Mon, 19 May 2025 14:42:23 -0400 Subject: [PATCH 01/10] remove tiktoken as a dependency --- codeflash/context/code_context_extractor.py | 4 +--- pyproject.toml | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/codeflash/context/code_context_extractor.py b/codeflash/context/code_context_extractor.py index ce54bb0e2..352d1fb9b 100644 --- a/codeflash/context/code_context_extractor.py +++ b/codeflash/context/code_context_extractor.py @@ -7,7 +7,6 @@ import jedi import libcst as cst -import tiktoken from jedi.api.classes import Name from libcst import CSTNode @@ -73,8 +72,7 @@ def get_code_optimization_context( ) # Handle token limits - tokenizer = tiktoken.encoding_for_model("gpt-4o") - final_read_writable_tokens = len(tokenizer.encode(final_read_writable_code)) + final_read_writable_tokens = len(final_read_writable_code)*0.75 if final_read_writable_tokens > optim_token_limit: raise ValueError("Read-writable code has exceeded token limit, cannot proceed") diff --git a/pyproject.toml b/pyproject.toml index 15dc01098..ee6fa9d6d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,7 +73,6 @@ pytest = ">=7.0.0,!=8.3.4" gitpython = ">=3.1.31" libcst = ">=1.0.1" jedi = ">=0.19.1" -tiktoken = ">=0.7.0" timeout-decorator = ">=0.5.0" pytest-timeout = ">=2.1.0" tomlkit = ">=0.11.7" From 663fa83dde5e05c37b583a2d79c3b586b0af610b Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Mon, 19 May 2025 14:46:01 -0400 Subject: [PATCH 02/10] another pr to remove tiktoken for passing tests --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index ee6fa9d6d..15dc01098 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,6 +73,7 @@ pytest = ">=7.0.0,!=8.3.4" gitpython = ">=3.1.31" libcst = ">=1.0.1" jedi = ">=0.19.1" +tiktoken = ">=0.7.0" timeout-decorator = ">=0.5.0" pytest-timeout = ">=2.1.0" tomlkit = ">=0.11.7" From 14de7b43126aa9ae1460ccd7829f1886e2d6b72d Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Mon, 19 May 2025 14:54:05 -0400 Subject: [PATCH 03/10] fixed now --- codeflash/context/code_context_extractor.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/codeflash/context/code_context_extractor.py b/codeflash/context/code_context_extractor.py index 352d1fb9b..d28b9347d 100644 --- a/codeflash/context/code_context_extractor.py +++ b/codeflash/context/code_context_extractor.py @@ -85,7 +85,7 @@ def get_code_optimization_context( ) read_only_context_code = read_only_code_markdown.markdown - read_only_code_markdown_tokens = len(tokenizer.encode(read_only_context_code)) + read_only_code_markdown_tokens = len(read_only_context_code)*0.75 total_tokens = final_read_writable_tokens + read_only_code_markdown_tokens if total_tokens > optim_token_limit: logger.debug("Code context has exceeded token limit, removing docstrings from read-only code") @@ -94,7 +94,7 @@ def get_code_optimization_context( helpers_of_fto_dict, helpers_of_helpers_dict, project_root_path, remove_docstrings=True ) read_only_context_code = read_only_code_no_docstring_markdown.markdown - read_only_code_no_docstring_markdown_tokens = len(tokenizer.encode(read_only_context_code)) + read_only_code_no_docstring_markdown_tokens = len(read_only_context_code)*0.75 total_tokens = final_read_writable_tokens + read_only_code_no_docstring_markdown_tokens if total_tokens > optim_token_limit: logger.debug("Code context has exceeded token limit, removing read-only code") @@ -109,7 +109,7 @@ def get_code_optimization_context( code_context_type=CodeContextType.TESTGEN, ) testgen_context_code = testgen_code_markdown.code - testgen_context_code_tokens = len(tokenizer.encode(testgen_context_code)) + testgen_context_code_tokens = len(testgen_context_code)*0.75 if testgen_context_code_tokens > testgen_token_limit: testgen_code_markdown = extract_code_string_context_from_files( helpers_of_fto_dict, @@ -119,7 +119,7 @@ def get_code_optimization_context( code_context_type=CodeContextType.TESTGEN, ) testgen_context_code = testgen_code_markdown.code - testgen_context_code_tokens = len(tokenizer.encode(testgen_context_code)) + testgen_context_code_tokens = len(testgen_context_code)*0.75 if testgen_context_code_tokens > testgen_token_limit: raise ValueError("Testgen code context has exceeded token limit, cannot proceed") From 33c8258d9070bcff771b360d13fe875e7e271954 Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Mon, 19 May 2025 15:44:34 -0400 Subject: [PATCH 04/10] encoding as a separate function, can be replaced later for more accurate calculation --- codeflash/code_utils/code_utils.py | 2 ++ codeflash/context/code_context_extractor.py | 12 ++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/codeflash/code_utils/code_utils.py b/codeflash/code_utils/code_utils.py index f63756d98..d0f206754 100644 --- a/codeflash/code_utils/code_utils.py +++ b/codeflash/code_utils/code_utils.py @@ -10,6 +10,8 @@ from codeflash.cli_cmds.console import logger +def encode_str(s: str) -> str: + return s[:int(0.75 * len(s))] def get_qualified_name(module_name: str, full_qualified_name: str) -> str: if not full_qualified_name: diff --git a/codeflash/context/code_context_extractor.py b/codeflash/context/code_context_extractor.py index d28b9347d..b1e0e72b6 100644 --- a/codeflash/context/code_context_extractor.py +++ b/codeflash/context/code_context_extractor.py @@ -12,7 +12,7 @@ from codeflash.cli_cmds.console import logger from codeflash.code_utils.code_extractor import add_needed_imports_from_module, find_preexisting_objects -from codeflash.code_utils.code_utils import get_qualified_name, path_belongs_to_site_packages +from codeflash.code_utils.code_utils import get_qualified_name, path_belongs_to_site_packages, encode_str from codeflash.context.unused_definition_remover import remove_unused_definitions_by_function_names from codeflash.discovery.functions_to_optimize import FunctionToOptimize from codeflash.models.models import ( @@ -72,7 +72,7 @@ def get_code_optimization_context( ) # Handle token limits - final_read_writable_tokens = len(final_read_writable_code)*0.75 + final_read_writable_tokens = len(encode_str(final_read_writable_code)) if final_read_writable_tokens > optim_token_limit: raise ValueError("Read-writable code has exceeded token limit, cannot proceed") @@ -85,7 +85,7 @@ def get_code_optimization_context( ) read_only_context_code = read_only_code_markdown.markdown - read_only_code_markdown_tokens = len(read_only_context_code)*0.75 + read_only_code_markdown_tokens = len(encode_str(read_only_context_code)) total_tokens = final_read_writable_tokens + read_only_code_markdown_tokens if total_tokens > optim_token_limit: logger.debug("Code context has exceeded token limit, removing docstrings from read-only code") @@ -94,7 +94,7 @@ def get_code_optimization_context( helpers_of_fto_dict, helpers_of_helpers_dict, project_root_path, remove_docstrings=True ) read_only_context_code = read_only_code_no_docstring_markdown.markdown - read_only_code_no_docstring_markdown_tokens = len(read_only_context_code)*0.75 + read_only_code_no_docstring_markdown_tokens = len(encode_str(read_only_context_code)) total_tokens = final_read_writable_tokens + read_only_code_no_docstring_markdown_tokens if total_tokens > optim_token_limit: logger.debug("Code context has exceeded token limit, removing read-only code") @@ -109,7 +109,7 @@ def get_code_optimization_context( code_context_type=CodeContextType.TESTGEN, ) testgen_context_code = testgen_code_markdown.code - testgen_context_code_tokens = len(testgen_context_code)*0.75 + testgen_context_code_tokens = len(encode_str(testgen_context_code)) if testgen_context_code_tokens > testgen_token_limit: testgen_code_markdown = extract_code_string_context_from_files( helpers_of_fto_dict, @@ -119,7 +119,7 @@ def get_code_optimization_context( code_context_type=CodeContextType.TESTGEN, ) testgen_context_code = testgen_code_markdown.code - testgen_context_code_tokens = len(testgen_context_code)*0.75 + testgen_context_code_tokens = len(encode_str(testgen_context_code)) if testgen_context_code_tokens > testgen_token_limit: raise ValueError("Testgen code context has exceeded token limit, cannot proceed") From f5bfcd9a5b61655cde9569b366f2f9cf1e97089c Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Mon, 19 May 2025 16:35:31 -0400 Subject: [PATCH 05/10] cleaning up --- tests/test_code_utils.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/test_code_utils.py b/tests/test_code_utils.py index a10f50a56..ba3643238 100644 --- a/tests/test_code_utils.py +++ b/tests/test_code_utils.py @@ -5,6 +5,7 @@ from unittest.mock import MagicMock, patch import pytest +import tiktoken from codeflash.code_utils.code_utils import ( cleanup_paths, @@ -22,6 +23,22 @@ from codeflash.code_utils.concolic_utils import clean_concolic_tests from codeflash.code_utils.coverage_utils import generate_candidates, prepare_coverage_files +def test_encode_str(): + #not testing anything, just analyzing the behavior of encoding + #print("\n") + codebases_to_try = Path(Path(__file__).parent.resolve() / "../code_to_optimize/").glob("**/*.py") + ave_ratio = [] + max_ratio_dict = dict() + for code_fn in codebases_to_try: + code_str = code_fn.read_text(encoding="utf-8") + if not len(code_str) or "__init__.py" in str(code_fn): + continue + tokenizer = tiktoken.encoding_for_model("gpt-4o") + tkt_encoded_str = tokenizer.encode(code_str) + code_len = len(code_str) + ave_ratio.append(len(tkt_encoded_str)/code_len) + max_ratio_dict[len(tkt_encoded_str)/code_len] = code_fn + print(sum(ave_ratio)/len(ave_ratio), min(ave_ratio), max(ave_ratio)) @pytest.fixture def multiple_existing_and_non_existing_files(tmp_path: Path) -> list[Path]: From e705363ea59eb5417404dde01a8cb03f506c2256 Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Mon, 19 May 2025 14:08:16 -0700 Subject: [PATCH 06/10] Update codeflash/code_utils/code_utils.py --- codeflash/code_utils/code_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codeflash/code_utils/code_utils.py b/codeflash/code_utils/code_utils.py index d0f206754..7e73be48e 100644 --- a/codeflash/code_utils/code_utils.py +++ b/codeflash/code_utils/code_utils.py @@ -11,7 +11,7 @@ from codeflash.cli_cmds.console import logger def encode_str(s: str) -> str: - return s[:int(0.75 * len(s))] + return s[:len(s)//2] def get_qualified_name(module_name: str, full_qualified_name: str) -> str: if not full_qualified_name: From c5dbcbfde9226adb491df3af057247f947844607 Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Mon, 19 May 2025 17:11:24 -0400 Subject: [PATCH 07/10] restoring tests --- tests/test_code_utils.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/tests/test_code_utils.py b/tests/test_code_utils.py index ba3643238..a10f50a56 100644 --- a/tests/test_code_utils.py +++ b/tests/test_code_utils.py @@ -5,7 +5,6 @@ from unittest.mock import MagicMock, patch import pytest -import tiktoken from codeflash.code_utils.code_utils import ( cleanup_paths, @@ -23,22 +22,6 @@ from codeflash.code_utils.concolic_utils import clean_concolic_tests from codeflash.code_utils.coverage_utils import generate_candidates, prepare_coverage_files -def test_encode_str(): - #not testing anything, just analyzing the behavior of encoding - #print("\n") - codebases_to_try = Path(Path(__file__).parent.resolve() / "../code_to_optimize/").glob("**/*.py") - ave_ratio = [] - max_ratio_dict = dict() - for code_fn in codebases_to_try: - code_str = code_fn.read_text(encoding="utf-8") - if not len(code_str) or "__init__.py" in str(code_fn): - continue - tokenizer = tiktoken.encoding_for_model("gpt-4o") - tkt_encoded_str = tokenizer.encode(code_str) - code_len = len(code_str) - ave_ratio.append(len(tkt_encoded_str)/code_len) - max_ratio_dict[len(tkt_encoded_str)/code_len] = code_fn - print(sum(ave_ratio)/len(ave_ratio), min(ave_ratio), max(ave_ratio)) @pytest.fixture def multiple_existing_and_non_existing_files(tmp_path: Path) -> list[Path]: From 5dfd15fdda528cee147b2932e782bc8691fb754c Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Mon, 19 May 2025 22:23:29 -0400 Subject: [PATCH 08/10] return length directly --- codeflash/code_utils/code_utils.py | 4 ++-- codeflash/context/code_context_extractor.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/codeflash/code_utils/code_utils.py b/codeflash/code_utils/code_utils.py index 7e73be48e..7d1690557 100644 --- a/codeflash/code_utils/code_utils.py +++ b/codeflash/code_utils/code_utils.py @@ -10,8 +10,8 @@ from codeflash.cli_cmds.console import logger -def encode_str(s: str) -> str: - return s[:len(s)//2] +def encode_str(s: str) -> int: + return len(s)//2 def get_qualified_name(module_name: str, full_qualified_name: str) -> str: if not full_qualified_name: diff --git a/codeflash/context/code_context_extractor.py b/codeflash/context/code_context_extractor.py index b1e0e72b6..7c685efc8 100644 --- a/codeflash/context/code_context_extractor.py +++ b/codeflash/context/code_context_extractor.py @@ -72,7 +72,7 @@ def get_code_optimization_context( ) # Handle token limits - final_read_writable_tokens = len(encode_str(final_read_writable_code)) + final_read_writable_tokens = encode_str(final_read_writable_code) if final_read_writable_tokens > optim_token_limit: raise ValueError("Read-writable code has exceeded token limit, cannot proceed") @@ -85,7 +85,7 @@ def get_code_optimization_context( ) read_only_context_code = read_only_code_markdown.markdown - read_only_code_markdown_tokens = len(encode_str(read_only_context_code)) + read_only_code_markdown_tokens = encode_str(read_only_context_code) total_tokens = final_read_writable_tokens + read_only_code_markdown_tokens if total_tokens > optim_token_limit: logger.debug("Code context has exceeded token limit, removing docstrings from read-only code") @@ -94,7 +94,7 @@ def get_code_optimization_context( helpers_of_fto_dict, helpers_of_helpers_dict, project_root_path, remove_docstrings=True ) read_only_context_code = read_only_code_no_docstring_markdown.markdown - read_only_code_no_docstring_markdown_tokens = len(encode_str(read_only_context_code)) + read_only_code_no_docstring_markdown_tokens = encode_str(read_only_context_code) total_tokens = final_read_writable_tokens + read_only_code_no_docstring_markdown_tokens if total_tokens > optim_token_limit: logger.debug("Code context has exceeded token limit, removing read-only code") @@ -109,7 +109,7 @@ def get_code_optimization_context( code_context_type=CodeContextType.TESTGEN, ) testgen_context_code = testgen_code_markdown.code - testgen_context_code_tokens = len(encode_str(testgen_context_code)) + testgen_context_code_tokens = encode_str(testgen_context_code) if testgen_context_code_tokens > testgen_token_limit: testgen_code_markdown = extract_code_string_context_from_files( helpers_of_fto_dict, @@ -119,7 +119,7 @@ def get_code_optimization_context( code_context_type=CodeContextType.TESTGEN, ) testgen_context_code = testgen_code_markdown.code - testgen_context_code_tokens = len(encode_str(testgen_context_code)) + testgen_context_code_tokens = encode_str(testgen_context_code) if testgen_context_code_tokens > testgen_token_limit: raise ValueError("Testgen code context has exceeded token limit, cannot proceed") From 252e927470a8aba6ba1ae148091473e925c1abb4 Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Mon, 19 May 2025 22:36:14 -0400 Subject: [PATCH 09/10] function intent --- codeflash/code_utils/code_utils.py | 4 +++- codeflash/context/code_context_extractor.py | 12 ++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/codeflash/code_utils/code_utils.py b/codeflash/code_utils/code_utils.py index 7d1690557..13a844015 100644 --- a/codeflash/code_utils/code_utils.py +++ b/codeflash/code_utils/code_utils.py @@ -10,7 +10,9 @@ from codeflash.cli_cmds.console import logger -def encode_str(s: str) -> int: +def encoded_tokens_len(s: str) -> int: + '''Function for returning the approximate length of the encoded tokens + It's an approximation of BPE encoding (https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)''' return len(s)//2 def get_qualified_name(module_name: str, full_qualified_name: str) -> str: diff --git a/codeflash/context/code_context_extractor.py b/codeflash/context/code_context_extractor.py index 7c685efc8..bf55c7575 100644 --- a/codeflash/context/code_context_extractor.py +++ b/codeflash/context/code_context_extractor.py @@ -12,7 +12,7 @@ from codeflash.cli_cmds.console import logger from codeflash.code_utils.code_extractor import add_needed_imports_from_module, find_preexisting_objects -from codeflash.code_utils.code_utils import get_qualified_name, path_belongs_to_site_packages, encode_str +from codeflash.code_utils.code_utils import get_qualified_name, path_belongs_to_site_packages, encoded_tokens_len from codeflash.context.unused_definition_remover import remove_unused_definitions_by_function_names from codeflash.discovery.functions_to_optimize import FunctionToOptimize from codeflash.models.models import ( @@ -72,7 +72,7 @@ def get_code_optimization_context( ) # Handle token limits - final_read_writable_tokens = encode_str(final_read_writable_code) + final_read_writable_tokens = encoded_tokens_len(final_read_writable_code) if final_read_writable_tokens > optim_token_limit: raise ValueError("Read-writable code has exceeded token limit, cannot proceed") @@ -85,7 +85,7 @@ def get_code_optimization_context( ) read_only_context_code = read_only_code_markdown.markdown - read_only_code_markdown_tokens = encode_str(read_only_context_code) + read_only_code_markdown_tokens = encoded_tokens_len(read_only_context_code) total_tokens = final_read_writable_tokens + read_only_code_markdown_tokens if total_tokens > optim_token_limit: logger.debug("Code context has exceeded token limit, removing docstrings from read-only code") @@ -94,7 +94,7 @@ def get_code_optimization_context( helpers_of_fto_dict, helpers_of_helpers_dict, project_root_path, remove_docstrings=True ) read_only_context_code = read_only_code_no_docstring_markdown.markdown - read_only_code_no_docstring_markdown_tokens = encode_str(read_only_context_code) + read_only_code_no_docstring_markdown_tokens = encoded_tokens_len(read_only_context_code) total_tokens = final_read_writable_tokens + read_only_code_no_docstring_markdown_tokens if total_tokens > optim_token_limit: logger.debug("Code context has exceeded token limit, removing read-only code") @@ -109,7 +109,7 @@ def get_code_optimization_context( code_context_type=CodeContextType.TESTGEN, ) testgen_context_code = testgen_code_markdown.code - testgen_context_code_tokens = encode_str(testgen_context_code) + testgen_context_code_tokens = encoded_tokens_len(testgen_context_code) if testgen_context_code_tokens > testgen_token_limit: testgen_code_markdown = extract_code_string_context_from_files( helpers_of_fto_dict, @@ -119,7 +119,7 @@ def get_code_optimization_context( code_context_type=CodeContextType.TESTGEN, ) testgen_context_code = testgen_code_markdown.code - testgen_context_code_tokens = encode_str(testgen_context_code) + testgen_context_code_tokens = encoded_tokens_len(testgen_context_code) if testgen_context_code_tokens > testgen_token_limit: raise ValueError("Testgen code context has exceeded token limit, cannot proceed") From e54a6cdaeafbc884d3e23666f355600b6dd48c30 Mon Sep 17 00:00:00 2001 From: Saurabh Misra Date: Mon, 19 May 2025 22:40:17 -0400 Subject: [PATCH 10/10] remove tiktoken --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 15dc01098..ee6fa9d6d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,7 +73,6 @@ pytest = ">=7.0.0,!=8.3.4" gitpython = ">=3.1.31" libcst = ">=1.0.1" jedi = ">=0.19.1" -tiktoken = ">=0.7.0" timeout-decorator = ">=0.5.0" pytest-timeout = ">=2.1.0" tomlkit = ">=0.11.7"