
Commit 5da6e49

leestott and Copilot authored
Update 2-Working-With-Data/08-data-preparation/notebook.ipynb
Co-authored-by: Copilot <[email protected]>
1 parent 1863271 commit 5da6e49

1 file changed: +41 −38 lines changed

2-Working-With-Data/08-data-preparation/notebook.ipynb

Lines changed: 41 additions & 38 deletions
@@ -3997,49 +3997,52 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "from rapidfuzz import process, fuzz\n",
+ "try:\n",
+ "    from rapidfuzz import process, fuzz\n",
  "\n",
- "# Function to find potential duplicates\n",
- "def find_near_duplicates(df, column, threshold=90):\n",
- "    \"\"\"\n",
- "    Find near-duplicate entries in a column using fuzzy matching.\n",
- "    \n",
- "    Parameters:\n",
- "    - df: DataFrame\n",
- "    - column: Column name to check for duplicates\n",
- "    - threshold: Similarity threshold (0-100)\n",
- "    \n",
- "    Returns: List of potential duplicate groups\n",
- "    \"\"\"\n",
- "    values = df[column].unique()\n",
- "    duplicate_groups = []\n",
- "    checked = set()\n",
- "    \n",
- "    for value in values:\n",
- "        if value in checked:\n",
- "            continue\n",
+ "    # Function to find potential duplicates\n",
+ "    def find_near_duplicates(df, column, threshold=90):\n",
+ "        \"\"\"\n",
+ "        Find near-duplicate entries in a column using fuzzy matching.\n",
+ "        \n",
+ "        Parameters:\n",
+ "        - df: DataFrame\n",
+ "        - column: Column name to check for duplicates\n",
+ "        - threshold: Similarity threshold (0-100)\n",
+ "        \n",
+ "        Returns: List of potential duplicate groups\n",
+ "        \"\"\"\n",
+ "        values = df[column].unique()\n",
+ "        duplicate_groups = []\n",
+ "        checked = set()\n",
+ "        \n",
+ "        for value in values:\n",
+ "            if value in checked:\n",
+ "                continue\n",
+ "            \n",
+ "            # Find similar values\n",
+ "            matches = process.extract(value, values, scorer=fuzz.ratio, limit=len(values))\n",
+ "            similar = [m[0] for m in matches if m[1] >= threshold]\n",
  "        \n",
- "        # Find similar values\n",
- "        matches = process.extract(value, values, scorer=fuzz.ratio, limit=len(values))\n",
- "        similar = [m[0] for m in matches if m[1] >= threshold]\n",
+ "            if len(similar) > 1:\n",
+ "                duplicate_groups.append(similar)\n",
+ "                checked.update(similar)\n",
  "        \n",
- "        if len(similar) > 1:\n",
- "            duplicate_groups.append(similar)\n",
- "            checked.update(similar)\n",
- "    \n",
- "    return duplicate_groups\n",
+ "        return duplicate_groups\n",
  "\n",
- "# Find near-duplicate names\n",
- "duplicate_groups = find_near_duplicates(dirty_data, 'name', threshold=90)\n",
+ "    # Find near-duplicate names\n",
+ "    duplicate_groups = find_near_duplicates(dirty_data, 'name', threshold=90)\n",
  "\n",
- "print(\"Potential duplicate groups:\")\n",
- "for i, group in enumerate(duplicate_groups, 1):\n",
- "    print(f\"\\nGroup {i}:\")\n",
- "    for name in group:\n",
- "        matching_rows = dirty_data[dirty_data['name'] == name]\n",
- "        print(f\"  '{name}': {len(matching_rows)} occurrence(s)\")\n",
- "        for _, row in matching_rows.iterrows():\n",
- "            print(f\"    - Customer {row['customer_id']}: age={row['age']}, country={row['country']}\")"
+ "    print(\"Potential duplicate groups:\")\n",
+ "    for i, group in enumerate(duplicate_groups, 1):\n",
+ "        print(f\"\\nGroup {i}:\")\n",
+ "        for name in group:\n",
+ "            matching_rows = dirty_data[dirty_data['name'] == name]\n",
+ "            print(f\"  '{name}': {len(matching_rows)} occurrence(s)\")\n",
+ "            for _, row in matching_rows.iterrows():\n",
+ "                print(f\"    - Customer {row['customer_id']}: age={row['age']}, country={row['country']}\")\n",
+ "except ImportError:\n",
+ "    print(\"rapidfuzz is not installed. Skipping fuzzy matching for near-duplicates.\")"
  ]
 },
 {
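In effect, the commit wraps the entire cell body in a guarded import so the notebook degrades gracefully when the optional rapidfuzz dependency is missing, instead of raising ImportError. Below is a minimal, self-contained sketch of that pattern; the `sample_names` list is a hypothetical stand-in for the notebook's `dirty_data['name'].unique()` values.

# Minimal sketch of the guarded-import pattern this commit applies.
# `sample_names` is a hypothetical stand-in for dirty_data['name'].unique().
try:
    from rapidfuzz import process, fuzz

    sample_names = ["John Smith", "Jon Smith", "Jane Doe"]
    for name in sample_names:
        # Score `name` against every candidate; keep matches scoring >= 90.
        matches = process.extract(name, sample_names, scorer=fuzz.ratio, limit=len(sample_names))
        similar = [m[0] for m in matches if m[1] >= 90]
        if len(similar) > 1:
            print(f"{name!r} groups with: {similar}")
except ImportError:
    # Without rapidfuzz, the cell prints a notice instead of raising.
    print("rapidfuzz is not installed. Skipping fuzzy matching for near-duplicates.")

Keeping the whole cell inside the try block (rather than guarding only the import) means none of the rapidfuzz-dependent code runs when the library is absent, at the cost of one extra indentation level throughout.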
