From 4d11be8161a3632e245c05aaf61fd6f354be99de Mon Sep 17 00:00:00 2001
From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com>
Date: Wed, 30 Jul 2025 03:27:09 +0000
Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Speed=20up=20function=20`p?=
 =?UTF-8?q?ivot=5Ftable`=20by=202,181%?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The optimization achieves a **2,181% speedup** by eliminating the most expensive operation in the original code: repeatedly calling `df.iloc[i]` to access DataFrame rows.

**Key Optimization: Vectorized Column Extraction**

The critical change replaces the inefficient row-by-row DataFrame access:

```python
# Original: Expensive row access (71.1% of total time)
for i in range(len(df)):
    row = df.iloc[i]  # This line alone took 244ms out of 344ms total
    index_val = row[index]
    column_val = row[columns]
    value = row[values]
```

with direct NumPy array extraction and zip iteration:

```python
# Optimized: Extract entire columns as arrays once
index_arr = df[index].values      # 2.4ms
columns_arr = df[columns].values  # 1.3ms
values_arr = df[values].values    # 1.3ms

# Then iterate over the arrays directly
for index_val, column_val, value in zip(index_arr, columns_arr, values_arr):
```

**Why This Works**

1. **`DataFrame.iloc[i]` is extremely slow** - it creates a new Series object for every row access and carries significant pandas indexing overhead
2. **Array access is fast** - NumPy arrays provide direct memory access with minimal overhead
3. **Bulk extraction is efficient** - getting entire columns at once leverages pandas' optimized column operations

**Performance Impact by Test Case**

The optimization excels across all test scenarios:

- **Large-scale tests see massive gains**: 3543-6406% speedup for datasets with 1000+ rows
- **Medium datasets (100-900 rows)**: 1560-5350% speedup
- **Small datasets**: 57-129% speedup
- **Edge cases**: generally 19-92% faster, though very small datasets (single row, empty) show minimal or slightly negative impact due to the fixed overhead of array extraction

The optimization is particularly effective for scenarios with many rows, since it eliminates the O(n) DataFrame row-access overhead and makes the algorithm scale much better with dataset size.
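
The gap is easy to reproduce outside the test suite. The snippet below is a minimal timing sketch, not part of this patch: the `region`/`product`/`sales` column names and the 10,000-row size are made up for illustration, and it only mimics the access patterns compared above.

```python
# Standalone timing sketch -- column names and sizes are hypothetical,
# chosen only to mimic the (index, columns, values) shape pivot_table expects.
import time

import numpy as np
import pandas as pd

n = 10_000
df = pd.DataFrame(
    {
        "region": np.random.choice(["north", "south", "east", "west"], size=n),
        "product": np.random.choice(["a", "b", "c"], size=n),
        "sales": np.random.rand(n),
    }
)

# Old pattern: one Series object materialized per row
start = time.perf_counter()
for i in range(len(df)):
    row = df.iloc[i]
    _ = (row["region"], row["product"], row["sales"])
t_iloc = time.perf_counter() - start

# New pattern: extract each column once, then iterate plain scalars
start = time.perf_counter()
region_arr = df["region"].values
product_arr = df["product"].values
sales_arr = df["sales"].values
for triple in zip(region_arr, product_arr, sales_arr):
    _ = triple
t_zip = time.perf_counter() - start

print(f".iloc loop: {t_iloc:.3f}s, zip over .values: {t_zip:.3f}s")
```

On most machines the `.iloc` loop should come out dramatically slower, in line with the profile numbers quoted above.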
---
 src/numpy_pandas/dataframe_operations.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/numpy_pandas/dataframe_operations.py b/src/numpy_pandas/dataframe_operations.py
index cb4cda2..3fe6d8e 100644
--- a/src/numpy_pandas/dataframe_operations.py
+++ b/src/numpy_pandas/dataframe_operations.py
@@ -62,31 +62,39 @@ def pivot_table(
     df: pd.DataFrame, index: str, columns: str, values: str, aggfunc: str = "mean"
 ) -> dict[Any, dict[Any, float]]:
     result = {}
+    # Define aggregation function
     if aggfunc == "mean":
         def agg_func(values):
             return sum(values) / len(values)
+
     elif aggfunc == "sum":
         def agg_func(values):
             return sum(values)
+
     elif aggfunc == "count":
         def agg_func(values):
             return len(values)
+
     else:
         raise ValueError(f"Unsupported aggregation function: {aggfunc}")
+
+    # Vectorized extraction of columns for faster row iteration
+    index_arr = df[index].values
+    columns_arr = df[columns].values
+    values_arr = df[values].values
+
+    # Populate grouped_data directly using arrays, avoiding DataFrame row objects
     grouped_data = {}
-    for i in range(len(df)):
-        row = df.iloc[i]
-        index_val = row[index]
-        column_val = row[columns]
-        value = row[values]
+    for index_val, column_val, value in zip(index_arr, columns_arr, values_arr):
         if index_val not in grouped_data:
             grouped_data[index_val] = {}
         if column_val not in grouped_data[index_val]:
             grouped_data[index_val][column_val] = []
         grouped_data[index_val][column_val].append(value)
+
     for index_val in grouped_data:
         result[index_val] = {}
         for column_val in grouped_data[index_val]: