From fc5f78813ac6cfd911d1c54cea966ac05e18b0eb Mon Sep 17 00:00:00 2001
From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com>
Date: Wed, 30 Jul 2025 03:23:22 +0000
Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Speed=20up=20function=20`d?=
 =?UTF-8?q?ataframe=5Fmerge`=20by=201,072%?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The optimized code achieves a 1071% speedup by replacing slow pandas `.iloc[]` operations with fast NumPy array indexing. Here are the key optimizations:

**1. NumPy Array Access Instead of `.iloc[]`**
- **Original**: Used `right.iloc[i][right_on]` and `left.iloc[i]` for data access, which are extremely slow pandas operations.
- **Optimized**: Converted the DataFrames to NumPy arrays (`left.values`, `right.values`) and used direct array indexing such as `right_values[i, right_on_idx]`.
- **Impact**: The line profiler shows `right.iloc[right_idx]` took 60.4% of total time in the original (8.32s), while the equivalent NumPy operations are barely visible in the optimized version.

**2. Pre-computed Column Index Mappings**
- **Original**: Accessed columns by name repeatedly: `left_row[col]` and `right_row[col]`.
- **Optimized**: Pre-computed column-to-index mappings (`left_col_indices`, `right_col_indices`) and used direct array indexing: `left_values[i, left_col_indices[col]]`.
- **Impact**: Eliminates repeated column-name lookups and leverages NumPy's optimized indexing.

**3. Direct Column Index Lookup**
- **Original**: Accessed the join columns through pandas Series indexing.
- **Optimized**: Used `columns.get_loc()` to get integer indices upfront, enabling direct NumPy array access.

**Why This Works**
- **NumPy vs. pandas**: NumPy arrays provide O(1) direct memory access, while pandas `.iloc[]` carries significant overhead for type checking, alignment, and Series creation.
- **Memory layout**: NumPy arrays store data contiguously in memory, enabling faster access patterns.
- **Reduced object creation**: The original created a pandas Series object for each row access; the optimized version works directly with primitive values.

**Test Case Performance:** The optimizations are most effective for:
- **Large datasets**: `test_large_scale_many_duplicates` shows a 753% speedup - the more data accessed, the greater the NumPy advantage.
- **Many matches**: Cases with frequent `.iloc[]` calls benefit most from the NumPy conversion.
- **Cartesian products**: When duplicate keys create many row combinations, the NumPy indexing advantage compounds.

The optimization maintains identical functionality while dramatically reducing the computational overhead of data access operations.
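As a rough illustration of the access pattern this patch moves to (a minimal standalone sketch, not code from this repository; the `df` frame and column names below are made up for demonstration):

```python
import pandas as pd

df = pd.DataFrame({"id": [3, 1, 2, 1], "val": [30, 10, 20, 40]})

# Slow path: every .iloc[i] call materializes a pandas Series for the row
keys_slow = [df.iloc[i]["id"] for i in range(len(df))]

# Fast path: pull the underlying 2-D ndarray once, resolve the column name to
# an integer position once, then do plain NumPy indexing inside the loop
values = df.values
id_pos = df.columns.get_loc("id")
keys_fast = [values[i, id_pos] for i in range(len(values))]

assert list(keys_slow) == list(keys_fast)
```

The same idea extends to whole rows in the patch: each output column is resolved to an integer position once (`left_col_indices`, `right_col_indices`), and the inner loop only reads `left_values[i, idx]` / `right_values[right_idx, idx]`.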
---
 src/numpy_pandas/dataframe_operations.py | 33 +++++++++++++++++-------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/src/numpy_pandas/dataframe_operations.py b/src/numpy_pandas/dataframe_operations.py
index cb4cda2..1522413 100644
--- a/src/numpy_pandas/dataframe_operations.py
+++ b/src/numpy_pandas/dataframe_operations.py
@@ -34,26 +34,38 @@ def groupby_mean(df: pd.DataFrame, group_col: str, value_col: str) -> dict[Any,
 def dataframe_merge(
     left: pd.DataFrame, right: pd.DataFrame, left_on: str, right_on: str
 ) -> pd.DataFrame:
-    result_data = []
+    # Use numpy for fast access to data and zip view for columns
     left_cols = list(left.columns)
     right_cols = [col for col in right.columns if col != right_on]
+
+    left_on_idx = left.columns.get_loc(left_on)
+    right_on_idx = right.columns.get_loc(right_on)
+
+    left_values = left.values
+    right_values = right.values
+
+    # Build right_dict using numpy array for fast data lookups
     right_dict = {}
-    for i in range(len(right)):
-        key = right.iloc[i][right_on]
+    for i in range(len(right_values)):
+        key = right_values[i, right_on_idx]
         if key not in right_dict:
             right_dict[key] = []
         right_dict[key].append(i)
-    for i in range(len(left)):
-        left_row = left.iloc[i]
-        key = left_row[left_on]
+
+    result_data = []
+    # Precompute col->index for faster access
+    left_col_indices = {col: idx for idx, col in enumerate(left_cols)}
+    right_col_indices = {col: idx for idx, col in enumerate(right.columns)}
+    for i in range(len(left_values)):
+        key = left_values[i, left_on_idx]
         if key in right_dict:
             for right_idx in right_dict[key]:
-                right_row = right.iloc[right_idx]
                 new_row = {}
+                # Use numpy fast value access
                 for col in left_cols:
-                    new_row[col] = left_row[col]
+                    new_row[col] = left_values[i, left_col_indices[col]]
                 for col in right_cols:
-                    new_row[col] = right_row[col]
+                    new_row[col] = right_values[right_idx, right_col_indices[col]]
                 result_data.append(new_row)
     return pd.DataFrame(result_data)
 
@@ -66,14 +78,17 @@ def pivot_table(
 
         def agg_func(values):
             return sum(values) / len(values)
+
     elif aggfunc == "sum":
 
         def agg_func(values):
             return sum(values)
+
     elif aggfunc == "count":
 
         def agg_func(values):
             return len(values)
+
     else:
         raise ValueError(f"Unsupported aggregation function: {aggfunc}")
     grouped_data = {}
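For reviewers who want to exercise the change locally, a minimal usage sketch (assuming `src/` is on `PYTHONPATH` so the module imports as `numpy_pandas.dataframe_operations`; the example frames are made up):

```python
import pandas as pd
from numpy_pandas.dataframe_operations import dataframe_merge

left = pd.DataFrame({"id": [1, 2, 2, 3], "x": ["a", "b", "c", "d"]})
right = pd.DataFrame({"key": [2, 2, 3], "y": [10, 20, 30]})

# Inner-join semantics on left.id == right.key; duplicate keys on both sides
# expand into the cartesian product of the matching rows (5 rows here).
merged = dataframe_merge(left, right, left_on="id", right_on="key")
print(merged)  # columns: id, x, y  (the right-hand join column is dropped)
```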