diff --git a/src/numpy_pandas/dataframe_operations.py b/src/numpy_pandas/dataframe_operations.py index cb4cda2..b433a47 100644 --- a/src/numpy_pandas/dataframe_operations.py +++ b/src/numpy_pandas/dataframe_operations.py @@ -34,26 +34,39 @@ def groupby_mean(df: pd.DataFrame, group_col: str, value_col: str) -> dict[Any, def dataframe_merge( left: pd.DataFrame, right: pd.DataFrame, left_on: str, right_on: str ) -> pd.DataFrame: - result_data = [] + # Cache column orders and indices left_cols = list(left.columns) right_cols = [col for col in right.columns if col != right_on] + left_on_idx = left.columns.get_loc(left_on) + right_on_idx = right.columns.get_loc(right_on) + right_indices = {col: right.columns.get_loc(col) for col in right_cols} + left_indices = {col: left.columns.get_loc(col) for col in left_cols} + + # Get underlying numpy arrays for much faster row access + left_values = left.values + right_values = right.values + + # Build right_dict mapping key value to lists of *row indices* in right frame right_dict = {} - for i in range(len(right)): - key = right.iloc[i][right_on] - if key not in right_dict: - right_dict[key] = [] - right_dict[key].append(i) - for i in range(len(left)): - left_row = left.iloc[i] - key = left_row[left_on] + for i, key in enumerate(right_values[:, right_on_idx]): + right_dict.setdefault(key, []).append(i) + + # Pre-allocate result_data (as list of lists for perf, then handle dict creation at the end) + result_data = [] + + for i in range(left_values.shape[0]): + left_row = left_values[i] + key = left_row[left_on_idx] if key in right_dict: for right_idx in right_dict[key]: - right_row = right.iloc[right_idx] + right_row = right_values[right_idx] + # Compose result row as dictionary (preserving behavior) new_row = {} for col in left_cols: - new_row[col] = left_row[col] + new_row[col] = left_row[left_indices[col]] + # Exclude right_on for right frame for col in right_cols: - new_row[col] = right_row[col] + new_row[col] = right_row[right_indices[col]] result_data.append(new_row) return pd.DataFrame(result_data) @@ -66,14 +79,17 @@ def pivot_table( def agg_func(values): return sum(values) / len(values) + elif aggfunc == "sum": def agg_func(values): return sum(values) + elif aggfunc == "count": def agg_func(values): return len(values) + else: raise ValueError(f"Unsupported aggregation function: {aggfunc}") grouped_data = {}