From b29b688b6c7d4ea283e9495cd6e1c30d7d8bc9b6 Mon Sep 17 00:00:00 2001
From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com>
Date: Sun, 20 Apr 2025 10:47:21 +0000
Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Speed=20up=20function=20`d?=
 =?UTF-8?q?rop=5Fduplicates`=20by=20485%=20To=20improve=20the=20performanc?=
 =?UTF-8?q?e=20of=20this=20code,=20we=20can=20leverage=20the=20built-in=20?=
 =?UTF-8?q?`drop=5Fduplicates`=20method=20provided=20by=20pandas,=20which?=
 =?UTF-8?q?=20is=20optimized=20for=20such=20operations.=20Using=20this=20b?=
 =?UTF-8?q?uilt-in=20method=20is=20both=20faster=20and=20more=20concise.?=
 =?UTF-8?q?=20Here=20is=20the=20optimized=20version=20of=20the=20function.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This optimized version leverages the efficient internal implementation of `drop_duplicates` provided by pandas, significantly improving the runtime.
---
 src/numpy_pandas/dataframe_operations.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/src/numpy_pandas/dataframe_operations.py b/src/numpy_pandas/dataframe_operations.py
index cb4cda2..3261d67 100644
--- a/src/numpy_pandas/dataframe_operations.py
+++ b/src/numpy_pandas/dataframe_operations.py
@@ -113,16 +113,11 @@ def fillna(df: pd.DataFrame, column: str, value: Any) -> pd.DataFrame:
 
 
 def drop_duplicates(df: pd.DataFrame, subset: List[str] = None) -> pd.DataFrame:
-    if subset is None:
-        subset = df.columns.tolist()
-    seen = set()
-    keep_indices = []
-    for i in range(len(df)):
-        values = tuple(df.iloc[i][col] for col in subset)
-        if values not in seen:
-            seen.add(values)
-            keep_indices.append(i)
-    return df.iloc[keep_indices].reset_index(drop=True)
+    """
+    Drops duplicate rows from the DataFrame based on the provided subset of columns.
+    """
+    # Use pandas built-in drop_duplicates, which is optimized for performance
+    return df.drop_duplicates(subset=subset).reset_index(drop=True)
 
 
 def sort_values(df: pd.DataFrame, by: str, ascending: bool = True) -> pd.DataFrame: