From fc5f78813ac6cfd911d1c54cea966ac05e18b0eb Mon Sep 17 00:00:00 2001
From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com>
Date: Wed, 30 Jul 2025 03:23:22 +0000
Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Speed=20up=20function=20`d?=
 =?UTF-8?q?ataframe=5Fmerge`=20by=201,072%?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The optimized code achieves a 1071% speedup by replacing slow pandas `.iloc[]` operations with fast NumPy array indexing. Here are the key optimizations:

**1. NumPy Array Access Instead of `.iloc[]`**
- **Original**: Used `right.iloc[i][right_on]` and `left.iloc[i]` for data access, which are extremely slow pandas operations.
- **Optimized**: Converted the DataFrames to NumPy arrays (`left.values`, `right.values`) and used direct array indexing such as `right_values[i, right_on_idx]`.
- **Impact**: The line profiler shows `right.iloc[right_idx]` took 60.4% of total time in the original (8.32s), while the equivalent NumPy operations are barely visible in the optimized version.

**2. Pre-computed Column Index Mappings**
- **Original**: Accessed columns by name repeatedly: `left_row[col]` and `right_row[col]`.
- **Optimized**: Pre-computed column-to-index mappings (`left_col_indices`, `right_col_indices`) and used direct array indexing: `left_values[i, left_col_indices[col]]`.
- **Impact**: Eliminates repeated column-name lookups and leverages NumPy's optimized indexing.

**3. Direct Column Index Lookup**
- **Original**: Accessed the join columns through pandas Series indexing.
- **Optimized**: Used `columns.get_loc()` to get integer indices upfront, enabling direct NumPy array access.

**Why This Works**
- **NumPy vs. pandas**: NumPy arrays provide O(1) direct memory access, while pandas `.iloc[]` carries significant overhead for type checking, alignment, and Series creation.
- **Memory layout**: NumPy arrays store data contiguously in memory, enabling faster access patterns.
- **Reduced object creation**: The original created a pandas Series object for each row access; the optimized version works directly with primitive values.

**Test Case Performance:** The optimizations are most effective for:
- **Large datasets**: `test_large_scale_many_duplicates` shows a 753% speedup - the more data accessed, the greater the NumPy advantage.
- **Many matches**: Cases with frequent `.iloc[]` calls benefit most from the NumPy conversion.
- **Cartesian products**: When duplicate keys create many row combinations, the NumPy indexing advantage compounds.

The optimization maintains identical functionality while dramatically reducing the computational overhead of data access operations.
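As a rough illustration of the access pattern this patch moves to (a minimal standalone sketch, not code from this repository; the `df` frame and column names below are made up for demonstration):

```python
import pandas as pd

df = pd.DataFrame({"id": [3, 1, 2, 1], "val": [30, 10, 20, 40]})

# Slow path: every .iloc[i] call materializes a pandas Series for the row
keys_slow = [df.iloc[i]["id"] for i in range(len(df))]

# Fast path: pull the underlying 2-D ndarray once, resolve the column name to
# an integer position once, then do plain NumPy indexing inside the loop
values = df.values
id_pos = df.columns.get_loc("id")
keys_fast = [values[i, id_pos] for i in range(len(values))]

assert list(keys_slow) == list(keys_fast)
```

The same idea extends to whole rows in the patch: each output column is resolved to an integer position once (`left_col_indices`, `right_col_indices`), and the inner loop only reads `left_values[i, idx]` / `right_values[right_idx, idx]`.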
---
 src/numpy_pandas/dataframe_operations.py | 33 +++++++++++++++++-------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/src/numpy_pandas/dataframe_operations.py b/src/numpy_pandas/dataframe_operations.py
index cb4cda2..1522413 100644
--- a/src/numpy_pandas/dataframe_operations.py
+++ b/src/numpy_pandas/dataframe_operations.py
@@ -34,26 +34,38 @@ def groupby_mean(df: pd.DataFrame, group_col: str, value_col: str) -> dict[Any,
 def dataframe_merge(
     left: pd.DataFrame, right: pd.DataFrame, left_on: str, right_on: str
 ) -> pd.DataFrame:
-    result_data = []
+    # Use numpy for fast access to data and zip view for columns
     left_cols = list(left.columns)
     right_cols = [col for col in right.columns if col != right_on]
+
+    left_on_idx = left.columns.get_loc(left_on)
+    right_on_idx = right.columns.get_loc(right_on)
+
+    left_values = left.values
+    right_values = right.values
+
+    # Build right_dict using numpy array for fast data lookups
     right_dict = {}
-    for i in range(len(right)):
-        key = right.iloc[i][right_on]
+    for i in range(len(right_values)):
+        key = right_values[i, right_on_idx]
         if key not in right_dict:
             right_dict[key] = []
         right_dict[key].append(i)
-    for i in range(len(left)):
-        left_row = left.iloc[i]
-        key = left_row[left_on]
+
+    result_data = []
+    # Precompute col->index for faster access
+    left_col_indices = {col: idx for idx, col in enumerate(left_cols)}
+    right_col_indices = {col: idx for idx, col in enumerate(right.columns)}
+    for i in range(len(left_values)):
+        key = left_values[i, left_on_idx]
         if key in right_dict:
             for right_idx in right_dict[key]:
-                right_row = right.iloc[right_idx]
                 new_row = {}
+                # Use numpy fast value access
                 for col in left_cols:
-                    new_row[col] = left_row[col]
+                    new_row[col] = left_values[i, left_col_indices[col]]
                 for col in right_cols:
-                    new_row[col] = right_row[col]
+                    new_row[col] = right_values[right_idx, right_col_indices[col]]
                 result_data.append(new_row)
     return pd.DataFrame(result_data)
 
@@ -66,14 +78,17 @@ def pivot_table(
 
         def agg_func(values):
             return sum(values) / len(values)
+
     elif aggfunc == "sum":
 
         def agg_func(values):
             return sum(values)
+
     elif aggfunc == "count":
 
         def agg_func(values):
             return len(values)
+
     else:
         raise ValueError(f"Unsupported aggregation function: {aggfunc}")
     grouped_data = {}
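For reviewers who want to exercise the change locally, a minimal usage sketch (assuming `src/` is on `PYTHONPATH` so the module imports as `numpy_pandas.dataframe_operations`; the example frames are made up):

```python
import pandas as pd
from numpy_pandas.dataframe_operations import dataframe_merge

left = pd.DataFrame({"id": [1, 2, 2, 3], "x": ["a", "b", "c", "d"]})
right = pd.DataFrame({"key": [2, 2, 3], "y": [10, 20, 30]})

# Inner-join semantics on left.id == right.key; duplicate keys on both sides
# expand into the cartesian product of the matching rows (5 rows here).
merged = dataframe_merge(left, right, left_on="id", right_on="key")
print(merged)  # columns: id, x, y  (the right-hand join column is dropped)
```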