Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions code_to_optimize/few_formatting_errors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import os

class UnformattedExampleClass(object):
    """Plain record of a person's contact details.

    NOTE(review): the irregular spacing below (``age= None``, ``email= None``,
    ``self. address``) looks deliberate given the file name
    few_formatting_errors.py -- presumably a formatter fixture. Keep the
    spacing as-is unless the fixture's purpose changes.
    """

    def __init__(
        self,
        name,
        age= None,
        email= None,
        phone=None,
        address=None,
        city=None,
        state=None,
        zip_code=None,
    ):
        # Every argument is stored on a same-named attribute.
        self.name = name
        self.age = age
        self.email = email
        self.phone = phone
        self. address = address
        self.city = city
        self.state = state
        self.zip_code = zip_code
        # Only name, age and email are mirrored into the dict.
        self.data = {"name": name, "age": age, "email": email}

    def get_info(self):
        """Return a "Name: ..., Age: ..." summary string."""
        return f"Name: {self.name}, Age: {self.age}"

    def update_data(self, **kwargs):
        """Update existing attributes from ``kwargs`` and merge ``kwargs`` into ``data``."""
        for key, value in kwargs.items():
            # Keys that are not already attributes never create new ones.
            if hasattr(self, key):
                setattr(self, key, value)
        # NOTE(review): indentation was not visible in the reviewed copy; this
        # call is placed after the loop -- confirm against the real file.
        self.data.update(kwargs)


def process_data(
    data_list, filter_func=None, transform_func=None, sort_key=None, reverse=False
):
    """Filter, transform and sort ``data_list``; every step is optional.

    Returns a new empty list when ``data_list`` is falsy. When no step is
    requested the input object itself is returned.
    """
    if not data_list:
        return []
    if filter_func:
        # Keep only the items the predicate accepts.
        data_list = [ item for item in data_list if filter_func(item)]
    if transform_func:
        data_list = [transform_func(item) for item in data_list]
    if sort_key:
        # ``reverse`` only takes effect when a sort key is supplied.
        data_list = sorted(data_list, key=sort_key, reverse=reverse)
    return data_list

147 changes: 147 additions & 0 deletions code_to_optimize/many_formatting_errors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import os,sys,json,datetime,math,random;import requests;from collections import defaultdict,OrderedDict
from typing import List,Dict,Optional,Union,Tuple,Any;import numpy as np;import pandas as pd

# This is a poorly formatted Python file with many style violations

class UnformattedExampleClass( object ):
    """Plain record of a person's contact details.

    NOTE(review): the cramped one-line formatting is presumably deliberate
    (file name: many_formatting_errors.py); do not reformat by hand.
    """

    def __init__(self,name,age=None,email=None,phone=None,address=None,city=None,state=None,zip_code=None):
        # Every argument is stored on a same-named attribute.
        self.name=name;self.age=age;self.email=email;self.phone=phone
        self.address=address;self.city=city;self.state=state;self.zip_code=zip_code
        # Only name, age and email are mirrored into the dict.
        self.data={"name":name,"age":age,"email":email}

    def get_info(self ):
        """Return a "Name: ..., Age: ..." summary string."""
        return f"Name: {self.name}, Age: {self.age}"

    def update_data(self,**kwargs):
        """Update existing attributes from ``kwargs`` and merge ``kwargs`` into ``data``."""
        for key,value in kwargs.items():
            # Keys that are not already attributes never create new ones.
            if hasattr(self,key):setattr(self,key,value)
        # NOTE(review): indentation was not visible in the reviewed copy; this
        # call is placed after the loop -- confirm against the real file.
        self.data.update(kwargs)

def process_data(data_list,filter_func=None,transform_func=None,sort_key=None,reverse=False):
    """Filter, transform and sort ``data_list``; every step is optional.

    Returns a new empty list when ``data_list`` is falsy. When no step is
    requested the input object itself is returned.
    """
    if not data_list:return[]
    if filter_func:data_list=[item for item in data_list if filter_func(item)]
    if transform_func:data_list=[transform_func(item)for item in data_list]
    # ``reverse`` only takes effect when a sort key is supplied.
    if sort_key:data_list=sorted(data_list,key=sort_key,reverse=reverse)
    return data_list

def calculate_statistics(numbers):
    """Return mean, median, variance, std_dev, min and max of ``numbers``.

    Returns None for empty input. The variance divides by ``len(numbers)``
    (population variance), not ``len(numbers) - 1``.
    """
    if not numbers:return None
    # NOTE(review): for an even number of values this picks the upper-middle
    # element instead of averaging the two central values.
    mean=sum(numbers)/len(numbers); median=sorted(numbers)[len(numbers)//2]
    variance=sum((x-mean)**2 for x in numbers)/len(numbers);std_dev=math.sqrt(variance)
    return {"mean":mean,"median":median,"variance":variance,"std_dev":std_dev,"min":min(numbers),"max":max(numbers)}

def complex_nested_function(x,y,z):
    """Combine two nested helper computations into a single number.

    The result is ``inner_function_1(x, y)`` plus the sum of the list built
    by ``inner_function_2(x, y, z)``.
    """
    def inner_function_1(a,b):
        # ``deeply_nested`` closes over ``a`` and ``b`` of this call.
        def deeply_nested(c,d):
            return c*d+a*b
        return deeply_nested(a+1,b-1)+deeply_nested(a-1,b+1)
    def inner_function_2 (a,b,c):
        # One entry per (i, j, k) triple: the product when it is positive,
        # -1 for the all-zero triple, 0 otherwise.
        result=[]
        for i in range(a):
            for j in range(b):
                for k in range(c):
                    if i*j*k>0:result.append(i*j*k)
                    elif i+j+k==0:result.append(-1)
                    else :result.append(0)
        return result
    return inner_function_1(x,y)+sum(inner_function_2(x,y,z))

# Long lines and poor dictionary formatting
# NOTE(review): both e-mail values read "[email protected]" in the reviewed copy,
# which looks like address masking added by the page it was copied from --
# confirm against the real file.
user_data={"users":[{"id":1,"name":"John Doe","email":"[email protected]","preferences":{"theme":"dark","notifications":True,"language":"en"},"metadata":{"created_at":"2023-01-01","last_login":"2024-01-01","login_count":150}},{"id":2,"name":"Jane Smith","email":"[email protected]","preferences":{"theme":"light","notifications":False,"language":"es"},"metadata":{"created_at":"2023-02-15","last_login":"2024-01-15","login_count":89}}]}

# Poor list formatting (a single over-long line of string literals)
long_list_of_items=['item_1','item_2','item_3','item_4','item_5','item_6','item_7','item_8','item_9','item_10','item_11','item_12','item_13','item_14','item_15','item_16','item_17','item_18','item_19','item_20']

def generate_report(data,include_stats=True,include_charts=False,format_type='json',output_file=None):
    """Group ``data`` into a summary report and optionally write it to a file.

    Dict items are grouped per key, ints/floats go under 'numbers', and
    anything else is stringified under 'other'. ``format_type`` selects
    compact JSON ('json'), indented JSON ('pretty_json'), ``str(report)``
    ('string') or, for any other value, the raw report dict.

    Raises ValueError when ``data`` is falsy.

    NOTE(review): ``include_charts`` is accepted but never read in this body.
    """
    if not data:raise ValueError("Data cannot be empty")
    report={'timestamp':datetime.datetime.now().isoformat(),'data_count':len(data),'summary':{}}

    # Bad formatting in loops and conditionals
    # NOTE(review): the index ``i`` from enumerate() is never used.
    for i,item in enumerate(data):
        if isinstance(item,dict):
            for key,value in item.items():
                if key not in report['summary']:report['summary'][key]=[]
                report['summary'][key].append(value)
        elif isinstance(item,(int,float)):
            if 'numbers' not in report['summary']:report['summary']['numbers']=[]
            report['summary']['numbers'].append(item)
        else:
            if 'other' not in report['summary']:report['summary']['other']=[]
            report['summary']['other'].append(str(item))

    # Statistics are added only when at least one numeric item was collected.
    if include_stats and 'numbers' in report['summary']:
        numbers=report['summary']['numbers']
        report['statistics']=calculate_statistics(numbers)

    # Long conditional chain with poor formatting
    if format_type=='json':result=json.dumps(report,indent=None,separators=(',',':'))
    elif format_type=='pretty_json':result=json.dumps(report,indent=2)
    elif format_type=='string':result=str(report)
    else:result=report

    if output_file:
        # A non-string result (the raw dict) is serialised to JSON for the file only.
        with open(output_file,'w')as f:f.write(result if isinstance(result,str)else json.dumps(result))

    return result

class DataProcessor ( UnformattedExampleClass ) :
    """Load, validate and post-process records from a file path, list or single object."""

    def __init__(self,data_source,config=None,debug=False):
        # The parent record is always named "DataProcessor".
        super().__init__("DataProcessor")
        self.data_source=data_source;self.config=config or{};self.debug=debug
        self.processed_data=[];self.errors=[];self.warnings=[]

    def load_data ( self ) :
        """Return the records behind ``data_source``, or [] when loading fails.

        A string source is read as a file: '.json' via json.load, '.csv' via
        pandas. A list is returned as-is and any other object is wrapped in a
        one-element list.
        """
        try:
            if isinstance(self.data_source,str):
                if self.data_source.endswith('.json'):
                    with open(self.data_source,'r')as f:data=json.load(f)
                elif self.data_source.endswith('.csv'):data=pd.read_csv(self.data_source).to_dict('records')
                else:raise ValueError(f"Unsupported file type: {self.data_source}")
            elif isinstance(self.data_source,list):data=self.data_source
            else:data=[self.data_source]
            return data
        except Exception as e:
            # Any failure (including the ValueError above) is recorded, not raised.
            self.errors.append(str(e));return[]

    def validate_data(self,data):
        """Return the items that are dicts containing both 'id' and 'name'."""
        valid_items=[];invalid_items=[]
        for item in data:
            if isinstance(item,dict)and'id'in item and'name'in item:valid_items.append(item)
            else:invalid_items.append(item)
        # Rejected items are only counted in a warning; they are not kept.
        if invalid_items:self.warnings.append(f"Found {len(invalid_items)} invalid items")
        return valid_items

    def process(self):
        """Run load -> validate -> process_data and return a result dict.

        The dict has 'success' plus either 'error' or 'count' and 'data'.
        """
        data=self.load_data()
        # An empty load, including a load error recorded in self.errors,
        # is reported with this generic message.
        if not data:return{"success":False,"error":"No data loaded"}

        validated_data=self.validate_data(data)
        # Items count as active unless they carry a falsy 'active' value.
        processed_result=process_data(validated_data,
        filter_func=lambda x:x.get('active',True),
        transform_func=lambda x:{**x,'processed_at':datetime.datetime.now().isoformat()},
        sort_key=lambda x:x.get('name',''))

        self.processed_data=processed_result
        return{"success":True,"count":len(processed_result),"data":processed_result}
# Demo run: process three sample records, print a report preview and some
# statistics over random numbers.
if __name__=="__main__":
    sample_data=[{"id":1,"name":"Alice","active":True},{"id":2,"name":"Bob","active":False},{"id":3,"name":"Charlie","active":True}]

    processor=DataProcessor(sample_data,config={"debug":True})
    result=processor.process()

    if result["success"]:
        print(f"Successfully processed {result['count']} items")
        for item in result["data"][:3]:print(f"- {item['name']} (ID: {item['id']})")
    else:print(f"Processing failed: {result.get('error','Unknown error')}")

    # Generate report with poor formatting
    report=generate_report(sample_data,include_stats=True,format_type='pretty_json')
    # Only the first 100 characters of a longer report are printed.
    print("Generated report:",report[:100]+"..."if len(report)>100 else report)

    # Complex calculation with poor spacing
    numbers=[random.randint(1,100)for _ in range(50)]
    stats=calculate_statistics(numbers)
    complex_result=complex_nested_function(5,3,2)

    print(f"Statistics: mean={stats['mean']:.2f}, std_dev={stats['std_dev']:.2f}")
    print(f"Complex calculation result: {complex_result}")
71 changes: 71 additions & 0 deletions code_to_optimize/no_formatting_errors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import os, sys, json, datetime, math, random
import requests
from collections import defaultdict, OrderedDict
from typing import List, Dict, Optional, Union, Tuple, Any
import numpy as np
import pandas as pd

# Cleanly formatted counterpart of many_formatting_errors.py (no style violations expected)


class UnformattedExampleClass(object):
    """Contact record whose name, age and email are also mirrored in ``data``."""

    def __init__(
        self,
        name,
        age=None,
        email=None,
        phone=None,
        address=None,
        city=None,
        state=None,
        zip_code=None,
    ):
        # Identity fields; the same three values seed the ``data`` dict below.
        self.name, self.age, self.email = name, age, email
        # Contact and location details are kept as attributes only.
        self.phone, self.address = phone, address
        self.city, self.state, self.zip_code = city, state, zip_code
        self.data = {"name": name, "age": age, "email": email}

    def get_info(self):
        """Return a one-line ``Name: ..., Age: ...`` summary."""
        return f"Name: {self.name}, Age: {self.age}"

    def update_data(self, **kwargs):
        """Overwrite already-existing attributes and merge everything into ``data``."""
        for key in kwargs:
            if not hasattr(self, key):
                # Unknown names never become new attributes.
                continue
            setattr(self, key, kwargs[key])
        # Known or not, every keyword ends up in the dict.
        self.data.update(kwargs)


def process_data(
    data_list, filter_func=None, transform_func=None, sort_key=None, reverse=False
):
    """Run the optional filter, transform and sort stages over ``data_list``.

    A falsy ``data_list`` yields a new empty list. With no stage requested the
    input object itself is returned. ``reverse`` matters only with ``sort_key``.
    """
    if not data_list:
        return []

    items = data_list
    if filter_func:
        items = list(filter(filter_func, items))
    if transform_func:
        items = list(map(transform_func, items))
    if sort_key:
        # Sort a copy so the caller's sequence is never reordered in place.
        items = list(items)
        items.sort(key=sort_key, reverse=reverse)
    return items


def calculate_statistics(numbers):
    """Return basic descriptive statistics for ``numbers``.

    Args:
        numbers: A sized sequence of real numbers.

    Returns:
        None when ``numbers`` is empty, otherwise a dict with the keys
        "mean", "median", "variance" (population variance, divided by the
        number of values), "std_dev", "min" and "max".
    """
    if not numbers:
        return None
    count = len(numbers)
    ordered = sorted(numbers)
    middle = count // 2
    if count % 2:
        median = ordered[middle]
    else:
        # Even count: there is no single middle value, so average the two
        # central ones instead of silently picking the upper one.
        median = (ordered[middle - 1] + ordered[middle]) / 2
    mean = sum(numbers) / count
    variance = sum((x - mean) ** 2 for x in numbers) / count
    std_dev = math.sqrt(variance)
    return {
        "mean": mean,
        "median": median,
        "variance": variance,
        "std_dev": std_dev,
        "min": min(numbers),
        "max": max(numbers),
    }
47 changes: 43 additions & 4 deletions codeflash/code_utils/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
import shlex
import subprocess
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Optional

import isort

Expand All @@ -13,14 +13,53 @@
from pathlib import Path


def get_diff_output_by_black(filepath: str, unformatted_content: str) -> Optional[str]:
    """Return the unified diff black would apply to ``unformatted_content``.

    Args:
        filepath: Name used for both sides of the diff header.
        unformatted_content: Source text to run through black.

    Returns:
        The diff text, an empty string when black reports that nothing would
        change, or None when black cannot be imported.
    """
    # Import separately from the formatting call: the old combined
    # ``except (ImportError, report.NothingChanged)`` had to evaluate
    # ``report`` even when this import was what failed, which raised
    # UnboundLocalError instead of returning None.
    try:
        from black import Mode, format_file_contents, output, report
    except ImportError:
        return None

    try:
        formatted_content = format_file_contents(src_contents=unformatted_content, fast=True, mode=Mode())
    except report.NothingChanged:
        # Already formatted: report an empty diff so callers can tell this
        # apart from "black is not installed" (None).
        return ""
    return output.diff(unformatted_content, formatted_content, a_name=filepath, b_name=filepath)


def get_diff_lines_count(diff_output: str) -> int:
    """Count the added and removed lines in a unified diff.

    A line counts when it starts with ``+`` or ``-``; anything starting with
    ``+++`` or ``---`` (the file headers) is ignored.
    """
    changed = 0
    for entry in diff_output.split("\n"):
        if entry[:3] in ("+++", "---"):
            continue
        # Tuple membership so that an empty line ("") is never counted.
        if entry[:1] in ("+", "-"):
            changed += 1
    return changed


Comment on lines +27 to +35
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚡️Codeflash found 16% (0.16x) speedup for get_diff_lines_count in codeflash/code_utils/formatter.py

⏱️ Runtime : 2.65 milliseconds 2.29 milliseconds (best of 263 runs)

📝 Explanation and details

Here's a faster version of your program. The main bottleneck is the list comprehension calling a nested function for every line, leading to relatively high overhead. Avoiding the inner function, only iterating once, and not building an intermediate list (since we only need the count) can significantly improve performance.

You do not need to build the filtered list of diff lines (the input is still split into lines once); simply scan and count qualifying lines in a single pass.

Optimized version.

Changes made:

  • Inlined the is_diff_line logic for better performance (function call overhead avoided).
  • Used a running integer (count) instead of a list, so memory use and processing are reduced.
  • Avoided creating an unnecessary list and removed the nested function.

This is as fast and memory-efficient as this logic gets in idiomatic Python.

Correctness verification report:

Test Status
⚙️ Existing Unit Tests 🔘 None Found
🌀 Generated Regression Tests 61 Passed
⏪ Replay Tests 🔘 None Found
🔎 Concolic Coverage Tests 1 Passed
📊 Tests Coverage 100.0%
🌀 Generated Regression Tests Details
from __future__ import annotations

# imports
import pytest  # used for our unit tests
from codeflash.code_utils.formatter import get_diff_lines_count

# unit tests

# ------------------------
# Basic Test Cases
# ------------------------

def test_empty_string_returns_zero():
    # No lines at all
    codeflash_output = get_diff_lines_count("")

def test_no_diff_lines():
    # No lines starting with + or -
    diff = " context line 1\n context line 2"
    codeflash_output = get_diff_lines_count(diff)

def test_single_addition_line():
    # One line added
    diff = "+added line"
    codeflash_output = get_diff_lines_count(diff)

def test_single_deletion_line():
    # One line deleted
    diff = "-deleted line"
    codeflash_output = get_diff_lines_count(diff)

def test_mixed_add_and_delete():
    # Mixed + and - lines
    diff = "+added\n-context\n+another add\n-context2\n-deleted"
    codeflash_output = get_diff_lines_count(diff)  # 2 additions, 3 deletions

def test_ignores_diff_headers():
    # Ignore lines like --- and +++
    diff = "--- a/file.txt\n+++ b/file.txt\n+added\n-context\n-deleted"
    codeflash_output = get_diff_lines_count(diff)  # +added, -context, -deleted

def test_only_diff_headers():
    # Only diff headers, no actual changes
    diff = "--- a/file.txt\n+++ b/file.txt"
    codeflash_output = get_diff_lines_count(diff)

def test_context_lines_with_leading_spaces():
    # Lines with spaces before + or - are not counted
    diff = " +not counted\n -not counted\n+counted\n-counted"
    codeflash_output = get_diff_lines_count(diff)

def test_multiple_changes_and_headers():
    # Headers and multiple diff lines
    diff = (
        "--- a/foo.txt\n"
        "+++ b/foo.txt\n"
        "@@ -1,3 +1,4 @@\n"
        " line 1\n"
        "+added 1\n"
        " line 2\n"
        "-deleted 1\n"
        " line 3\n"
        "+added 2\n"
        "-deleted 2"
    )
    codeflash_output = get_diff_lines_count(diff)  # +added 1, -deleted 1, +added 2, -deleted 2

# ------------------------
# Edge Test Cases
# ------------------------

def test_lines_with_plus_and_minus_in_middle():
    # Lines with + or - not at start should not be counted
    diff = "context + not counted\ncontext - not counted"
    codeflash_output = get_diff_lines_count(diff)

def test_lines_with_only_plus_and_minus():
    # Lines that are just "+" or "-" are counted
    diff = "+\n-\n---\n+++"
    codeflash_output = get_diff_lines_count(diff)

def test_lines_with_trailing_spaces():
    # Lines with trailing spaces after + or - are counted
    diff = "+added \n-deleted \n context"
    codeflash_output = get_diff_lines_count(diff)

def test_lines_with_multiple_plus_or_minus():
    # Lines starting with ++ or -- but not +++ or ---
    diff = "++double plus\n--double minus\n+++triple plus\n---triple minus"
    codeflash_output = get_diff_lines_count(diff)  # only ++double plus, --double minus

def test_lines_with_tabs():
    # Lines with tabs after + or - are counted
    diff = "+\tadded with tab\n-\tdeleted with tab"
    codeflash_output = get_diff_lines_count(diff)

def test_lines_with_unicode_characters():
    # Lines with unicode characters
    diff = "+añadido\n-удалено\n context"
    codeflash_output = get_diff_lines_count(diff)

def test_lines_with_empty_lines_between():
    # Empty lines should not affect the count
    diff = "+add1\n\n-add2\n\n\n+add3"
    codeflash_output = get_diff_lines_count(diff)

def test_lines_with_only_newlines():
    # Only newlines, no diff lines
    diff = "\n\n\n"
    codeflash_output = get_diff_lines_count(diff)

def test_lines_with_mixed_line_endings():
    # Test with mixed \n and \r\n endings
    diff = "+add1\r\n-context\r\n-add2\n+add3"
    codeflash_output = get_diff_lines_count(diff)

def test_lines_with_leading_and_trailing_whitespace():
    # Only lines that start with + or - (no spaces before) are counted
    diff = " +not counted\n- counted\n+counted\n-counted"
    codeflash_output = get_diff_lines_count(diff)

# ------------------------
# Large Scale Test Cases
# ------------------------

def test_large_number_of_additions():
    # 1000 addition lines
    diff = "\n".join(["+line{}".format(i) for i in range(1000)])
    codeflash_output = get_diff_lines_count(diff)

def test_large_number_of_deletions():
    # 1000 deletion lines
    diff = "\n".join(["-line{}".format(i) for i in range(1000)])
    codeflash_output = get_diff_lines_count(diff)

def test_large_mixed_diff():
    # 500 additions, 500 deletions, 500 context lines, 2 headers
    additions = ["+add{}".format(i) for i in range(500)]
    deletions = ["-del{}".format(i) for i in range(500)]
    context = [" context{}".format(i) for i in range(500)]
    headers = ["--- a/file.txt", "+++ b/file.txt"]
    diff = "\n".join(headers + additions + deletions + context)
    codeflash_output = get_diff_lines_count(diff)

def test_large_diff_with_headers_and_noise():
    # 400 additions, 400 deletions, 100 context, 100 lines starting with +++ or ---
    additions = ["+add{}".format(i) for i in range(400)]
    deletions = ["-del{}".format(i) for i in range(400)]
    context = [" context{}".format(i) for i in range(100)]
    noise = ["+++ noise{}".format(i) for i in range(50)] + ["--- noise{}".format(i) for i in range(50)]
    diff = "\n".join(noise + additions + deletions + context)
    codeflash_output = get_diff_lines_count(diff)

def test_large_diff_with_varied_line_types():
    # 300 additions, 300 deletions, 200 context, 100 lines with leading spaces
    additions = ["+add{}".format(i) for i in range(300)]
    deletions = ["-del{}".format(i) for i in range(300)]
    context = [" context{}".format(i) for i in range(200)]
    leading_spaces = [" +notcounted{}".format(i) for i in range(50)] + [" -notcounted{}".format(i) for i in range(50)]
    diff = "\n".join(additions + deletions + context + leading_spaces)
    codeflash_output = get_diff_lines_count(diff)

# ------------------------
# Additional Edge Cases for Robustness
# ------------------------

def test_diff_with_only_headers_and_context():
    # Only headers and context lines
    diff = "--- a/file.txt\n+++ b/file.txt\n context1\n context2"
    codeflash_output = get_diff_lines_count(diff)

def test_diff_with_plus_and_minus_in_middle_of_line():
    # + or - in the middle should not count
    diff = "context + not counted\ncontext - not counted\n+counted\n-counted"
    codeflash_output = get_diff_lines_count(diff)

def test_diff_with_empty_lines_and_whitespace():
    # Empty lines and whitespace-only lines
    diff = "\n \n\t\n+add\n-del\n"
    codeflash_output = get_diff_lines_count(diff)

def test_diff_with_non_ascii_and_binary_like_lines():
    # Non-ASCII and binary-like content
    diff = "+\x00\x01\n-\x02\x03\n context"
    codeflash_output = get_diff_lines_count(diff)

def test_diff_with_at_at_lines():
    # Lines starting with @@ should not be counted
    diff = "@@ -1,3 +1,4 @@\n+add\n-del"
    codeflash_output = get_diff_lines_count(diff)

def test_diff_with_tricky_headers():
    # Lines like ---- or ++++ still start with --- / +++, so they are skipped like headers
    diff = "---- not header\n++++ not header\n+++ header\n--- header"
    codeflash_output = get_diff_lines_count(diff)  # nothing is counted

def test_diff_with_mixed_case_headers():
    # Anything starting with +++ or --- is skipped, so +++A and ---B are not counted
    diff = "+++A\n---B\n+add\n-del"
    codeflash_output = get_diff_lines_count(diff)

def test_diff_with_windows_line_endings():
    # Windows CRLF endings
    diff = "+add1\r\n-context\r\n-add2\r\n+add3\r\n"
    codeflash_output = get_diff_lines_count(diff)

def test_diff_with_leading_newlines():
    # Leading newlines before diff lines
    diff = "\n\n+add\n-del"
    codeflash_output = get_diff_lines_count(diff)

def test_diff_with_trailing_newlines():
    # Trailing newlines after diff lines
    diff = "+add\n-del\n\n\n"
    codeflash_output = get_diff_lines_count(diff)
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.

from __future__ import annotations

# imports
import pytest  # used for our unit tests
from codeflash.code_utils.formatter import get_diff_lines_count

# unit tests

# -------------------- BASIC TEST CASES --------------------

def test_empty_string_returns_zero():
    # No diff lines in empty input
    codeflash_output = get_diff_lines_count("")

def test_no_diff_lines():
    # Input contains lines but none are diff lines
    diff = "context line\nanother context line"
    codeflash_output = get_diff_lines_count(diff)

def test_single_added_line():
    # Single added line
    diff = "+added line"
    codeflash_output = get_diff_lines_count(diff)

def test_single_removed_line():
    # Single removed line
    diff = "-removed line"
    codeflash_output = get_diff_lines_count(diff)

def test_mixed_diff_lines():
    # Mixed added and removed lines
    diff = "+added\n-removed\n context"
    codeflash_output = get_diff_lines_count(diff)

def test_ignores_diff_headers():
    # Should not count diff headers like '+++', '---'
    diff = "--- a/file.txt\n+++ b/file.txt\n+added\n-removed\n context"
    codeflash_output = get_diff_lines_count(diff)

def test_multiple_diff_lines_with_context():
    # Multiple diff lines interleaved with context
    diff = (
        " context1\n"
        "+added1\n"
        " context2\n"
        "-removed1\n"
        "+added2\n"
        " context3\n"
        "-removed2"
    )
    codeflash_output = get_diff_lines_count(diff)

def test_diff_lines_with_leading_spaces():
    # Only lines starting *exactly* with '+' or '-' count
    diff = " +not counted\n-added\n -not counted\n+added"
    codeflash_output = get_diff_lines_count(diff)

# -------------------- EDGE TEST CASES --------------------

def test_only_diff_headers():
    # Only diff headers, should return 0
    diff = "--- a/file\n+++ b/file"
    codeflash_output = get_diff_lines_count(diff)

def test_lines_with_multiple_plus_minus():
    # Lines starting with multiple '+' or '-' but not '+++' or '---'
    diff = "++not header\n--not header\n+++ header\n--- header"
    # '++not header' and '--not header' should count, '+++ header' and '--- header' should not
    codeflash_output = get_diff_lines_count(diff)

def test_lines_with_only_plus_minus():
    # Lines that are just '+' or '-'
    diff = "+\n-\n context"
    codeflash_output = get_diff_lines_count(diff)

def test_lines_with_tabs_and_spaces():
    # Tabs or spaces after + or - are fine, but not before
    diff = "+\tadded with tab\n- removed with space"
    codeflash_output = get_diff_lines_count(diff)

def test_lines_with_unicode_characters():
    # Unicode should not affect counting
    diff = "+äöü\n-你好\n context"
    codeflash_output = get_diff_lines_count(diff)

def test_lines_with_trailing_newlines():
    # Trailing newlines should not affect count
    diff = "+added\n-removed\n"
    codeflash_output = get_diff_lines_count(diff)

def test_lines_with_only_newlines():
    # Only newlines, no diff lines
    diff = "\n\n\n"
    codeflash_output = get_diff_lines_count(diff)

def test_lines_with_mixed_line_endings():
    # Mixed \n and \r\n endings
    diff = "+added\r\n-removed\n context\r\n"
    codeflash_output = get_diff_lines_count(diff.replace('\r\n', '\n'))

def test_lines_with_just_plus_or_minus_and_spaces():
    # Lines like '+ ' or '- ' should count
    diff = "+ \n- \n context"
    codeflash_output = get_diff_lines_count(diff)

def test_lines_with_plus_minus_in_middle():
    # Only lines starting with + or - count
    diff = "context +added\ncontext -removed"
    codeflash_output = get_diff_lines_count(diff)

def test_lines_with_plus_minus_but_not_at_start():
    # Should not count if + or - is not at the start
    diff = " context+added\n context-removed"
    codeflash_output = get_diff_lines_count(diff)

def test_lines_with_plus_minus_but_too_short_for_header():
    # Lines like '++', '--' should count, but '+++' and '---' should not
    diff = "++\n--\n+++\n---"
    codeflash_output = get_diff_lines_count(diff)

def test_lines_with_plus_minus_and_numbers():
    # Should count as diff lines
    diff = "+1\n-2\n context"
    codeflash_output = get_diff_lines_count(diff)

# -------------------- LARGE SCALE TEST CASES --------------------

def test_large_number_of_added_lines():
    # 1000 added lines
    diff = "\n".join(["+line{}".format(i) for i in range(1000)])
    codeflash_output = get_diff_lines_count(diff)

def test_large_number_of_removed_lines():
    # 1000 removed lines
    diff = "\n".join(["-line{}".format(i) for i in range(1000)])
    codeflash_output = get_diff_lines_count(diff)

def test_large_mixed_diff_and_context():
    # 500 added, 500 removed, 500 context lines
    added = ["+a{}".format(i) for i in range(500)]
    removed = ["-r{}".format(i) for i in range(500)]
    context = [" context{}".format(i) for i in range(500)]
    # Interleave them
    lines = []
    for i in range(500):
        lines.append(added[i])
        lines.append(removed[i])
        lines.append(context[i])
    diff = "\n".join(lines)
    codeflash_output = get_diff_lines_count(diff)

def test_large_with_headers_and_diff_lines():
    # 100 diff headers, 800 diff lines, 100 context lines
    headers = ["--- a/file{}".format(i) for i in range(50)] + ["+++ b/file{}".format(i) for i in range(50)]
    diff_lines = ["+add{}".format(i) for i in range(400)] + ["-rem{}".format(i) for i in range(400)]
    context = [" context{}".format(i) for i in range(100)]
    lines = headers + diff_lines + context
    diff = "\n".join(lines)
    codeflash_output = get_diff_lines_count(diff)

def test_large_all_headers():
    # 1000 header lines, should return 0
    headers = ["+++ file{}".format(i) for i in range(500)] + ["--- file{}".format(i) for i in range(500)]
    diff = "\n".join(headers)
    codeflash_output = get_diff_lines_count(diff)

def test_large_randomized_diff_lines():
    # Mix of diff, context, and header lines
    lines = []
    for i in range(333):
        lines.append("+add{}".format(i))
        lines.append(" context{}".format(i))
        lines.append("--- file{}".format(i))
        lines.append("-rem{}".format(i))
        lines.append("+++ file{}".format(i))
    diff = "\n".join(lines)
    # Only +add and -rem lines count, so 333*2 = 666
    codeflash_output = get_diff_lines_count(diff)
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.

from codeflash.code_utils.formatter import get_diff_lines_count

def test_get_diff_lines_count():
    get_diff_lines_count('')

To test or edit this optimization locally git merge codeflash/optimize-pr284-2025-06-04T20.10.23

Suggested change
lines = diff_output.split("\n")
def is_diff_line(line: str) -> bool:
return line.startswith(("+", "-")) and not line.startswith(("+++", "---"))
diff_lines = [line for line in lines if is_diff_line(line)]
return len(diff_lines)
# Count only the diff lines as per is_diff_line logic, but avoid unnecessary function call and list allocation.
count = 0
for line in diff_output.split("\n"):
if line and line[0] in "+-" and not (line.startswith("+++") or line.startswith("---")):
count += 1
return count

def is_safe_to_format(filepath: str, content: str, max_diff_lines: int = 100) -> bool:
    """Decide whether formatting ``content`` would change few enough lines.

    Args:
        filepath: Path used to label the diff and the log message.
        content: Current text of the file.
        max_diff_lines: Largest number of added plus removed lines accepted.

    Returns:
        True when the black diff has at most ``max_diff_lines`` changed lines.
        False when it has more, or when no diff could be obtained.
    """
    black_diff = get_diff_output_by_black(filepath, unformatted_content=content)
    if black_diff is None:
        # No diff available: treated as "black is missing", so do not format.
        logger.warning("Looks like black formatter not found, make sure it is installed.")
        return False

    diff_lines_count = get_diff_lines_count(black_diff)
    if diff_lines_count <= max_diff_lines:
        return True

    logger.debug(f"Skipping formatting {filepath}: {diff_lines_count} lines would change (max: {max_diff_lines})")
    return False


def format_code(formatter_cmds: list[str], path: Path, print_status: bool = True) -> str: # noqa
# TODO: Only allow a particular whitelist of formatters here to prevent arbitrary code execution
formatter_name = formatter_cmds[0].lower()
if not path.exists():
msg = f"File {path} does not exist. Cannot format the file."
raise FileNotFoundError(msg)
if formatter_name == "disabled":
return path.read_text(encoding="utf8")
file_content = path.read_text(encoding="utf8")
if formatter_name == "disabled" or not is_safe_to_format(filepath=str(path), content=file_content):
return file_content

file_token = "$file" # noqa: S105
for command in formatter_cmds:
formatter_cmd_list = shlex.split(command, posix=os.name != "nt")
Expand All @@ -29,7 +68,7 @@ def format_code(formatter_cmds: list[str], path: Path, print_status: bool = True
result = subprocess.run(formatter_cmd_list, capture_output=True, check=False)
if result.returncode == 0:
if print_status:
console.rule(f"Formatted Successfully with: {formatter_name.replace('$file', path.name)}")
console.rule(f"Formatted Successfully with: {command.replace('$file', path.name)}")
else:
logger.error(f"Failed to format code with {' '.join(formatter_cmd_list)}")
except FileNotFoundError as e:
Expand Down
4 changes: 3 additions & 1 deletion codeflash/discovery/functions_to_optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,12 +476,14 @@ def filter_functions(
if blocklist_funcs:
functions_tmp = []
for function in _functions:
if not (
if (
function.file_path.name in blocklist_funcs
and function.qualified_name in blocklist_funcs[function.file_path.name]
):
# This function is in blocklist, we can skip it
blocklist_funcs_removed_count += 1
continue
# This function is NOT in blocklist. we can keep it
functions_tmp.append(function)
_functions = functions_tmp

Expand Down
Loading
Loading