|
3997 | 3997 | "metadata": {}, |
3998 | 3998 | "outputs": [], |
3999 | 3999 | "source": [ |
4000 | | - "from rapidfuzz import process, fuzz\n", |
| 4000 | + "try:\n", |
| 4001 | + " from rapidfuzz import process, fuzz\n", |
4001 | 4002 | "\n", |
4002 | | - "# Function to find potential duplicates\n", |
4003 | | - "def find_near_duplicates(df, column, threshold=90):\n", |
4004 | | - " \"\"\"\n", |
4005 | | - " Find near-duplicate entries in a column using fuzzy matching.\n", |
4006 | | - " \n", |
4007 | | - " Parameters:\n", |
4008 | | - " - df: DataFrame\n", |
4009 | | - " - column: Column name to check for duplicates\n", |
4010 | | - " - threshold: Similarity threshold (0-100)\n", |
4011 | | - " \n", |
4012 | | - " Returns: List of potential duplicate groups\n", |
4013 | | - " \"\"\"\n", |
4014 | | - " values = df[column].unique()\n", |
4015 | | - " duplicate_groups = []\n", |
4016 | | - " checked = set()\n", |
4017 | | - " \n", |
4018 | | - " for value in values:\n", |
4019 | | - " if value in checked:\n", |
4020 | | - " continue\n", |
| 4003 | + " # Function to find potential duplicates\n", |
| 4004 | + " def find_near_duplicates(df, column, threshold=90):\n", |
| 4005 | + " \"\"\"\n", |
| 4006 | + " Find near-duplicate entries in a column using fuzzy matching.\n", |
| 4007 | + " \n", |
| 4008 | + " Parameters:\n", |
| 4009 | + " - df: DataFrame\n", |
| 4010 | + " - column: Column name to check for duplicates\n", |
| 4011 | + " - threshold: Similarity threshold (0-100)\n", |
| 4012 | + " \n", |
| 4013 | + " Returns: List of potential duplicate groups\n", |
| 4014 | + " \"\"\"\n", |
| 4015 | + " values = df[column].unique()\n", |
| 4016 | + " duplicate_groups = []\n", |
| 4017 | + " checked = set()\n", |
| 4018 | + " \n", |
| 4019 | + " for value in values:\n", |
| 4020 | + " if value in checked:\n", |
| 4021 | + " continue\n", |
| 4022 | + " \n", |
| 4023 | + " # Find similar values\n", |
| 4024 | + " matches = process.extract(value, values, scorer=fuzz.ratio, limit=len(values))\n", |
| 4025 | + " similar = [m[0] for m in matches if m[1] >= threshold]\n", |
4021 | 4026 | " \n", |
4022 | | - " # Find similar values\n", |
4023 | | - " matches = process.extract(value, values, scorer=fuzz.ratio, limit=len(values))\n", |
4024 | | - " similar = [m[0] for m in matches if m[1] >= threshold]\n", |
| 4027 | + " if len(similar) > 1:\n", |
| 4028 | + " duplicate_groups.append(similar)\n", |
| 4029 | + " checked.update(similar)\n", |
4025 | 4030 | " \n", |
4026 | | - " if len(similar) > 1:\n", |
4027 | | - " duplicate_groups.append(similar)\n", |
4028 | | - " checked.update(similar)\n", |
4029 | | - " \n", |
4030 | | - " return duplicate_groups\n", |
| 4031 | + " return duplicate_groups\n", |
4031 | 4032 | "\n", |
4032 | | - "# Find near-duplicate names\n", |
4033 | | - "duplicate_groups = find_near_duplicates(dirty_data, 'name', threshold=90)\n", |
| 4033 | + " # Find near-duplicate names\n", |
| 4034 | + " duplicate_groups = find_near_duplicates(dirty_data, 'name', threshold=90)\n", |
4034 | 4035 | "\n", |
4035 | | - "print(\"Potential duplicate groups:\")\n", |
4036 | | - "for i, group in enumerate(duplicate_groups, 1):\n", |
4037 | | - " print(f\"\\nGroup {i}:\")\n", |
4038 | | - " for name in group:\n", |
4039 | | - " matching_rows = dirty_data[dirty_data['name'] == name]\n", |
4040 | | - " print(f\" '{name}': {len(matching_rows)} occurrence(s)\")\n", |
4041 | | - " for _, row in matching_rows.iterrows():\n", |
4042 | | - " print(f\" - Customer {row['customer_id']}: age={row['age']}, country={row['country']}\")" |
| 4036 | + " print(\"Potential duplicate groups:\")\n", |
| 4037 | + " for i, group in enumerate(duplicate_groups, 1):\n", |
| 4038 | + " print(f\"\\nGroup {i}:\")\n", |
| 4039 | + " for name in group:\n", |
| 4040 | + " matching_rows = dirty_data[dirty_data['name'] == name]\n", |
| 4041 | + " print(f\" '{name}': {len(matching_rows)} occurrence(s)\")\n", |
| 4042 | + " for _, row in matching_rows.iterrows():\n", |
| 4043 | + " print(f\" - Customer {row['customer_id']}: age={row['age']}, country={row['country']}\")\n", |
| 4044 | + "except ImportError:\n", |
| 4045 | + " print(\"rapidfuzz is not installed. Skipping fuzzy matching for near-duplicates.\")" |
4043 | 4046 | ] |
4044 | 4047 | }, |
4045 | 4048 | { |
|
0 commit comments