|
3815 | 3815 | "metadata": {}, |
3816 | 3816 | "outputs": [], |
3817 | 3817 | "source": [ |
3818 | | - "# rapidfuzz was already imported in an earlier cell\n", |
3819 | | - "from rapidfuzz import process, fuzz\n", |
3820 | | - "\n", |
3821 | | - "\n", |
| 3818 | + "try:\n", |
| 3819 | + " from rapidfuzz import process, fuzz\n", |
| 3820 | + "except ImportError:\n", |
| 3821 | + " print(\"rapidfuzz is not installed. Please install it with 'pip install rapidfuzz' to use fuzzy matching.\")\n", |
| 3822 | + " process = None\n", |
| 3823 | + " fuzz = None\n", |
3822 | 3824 | "\n", |
3823 | 3825 | "# Get unique countries\n", |
3824 | 3826 | "unique_countries = dirty_data['country'].unique()\n", |
3825 | 3827 | "\n", |
3826 | 3828 | "# For each country, find similar matches\n", |
3827 | | - "print(\"Finding similar country names (similarity > 70%):\")\n", |
3828 | | - "for country in unique_countries:\n", |
3829 | | - " matches = process.extract(country, unique_countries, scorer=fuzz.ratio, limit=3)\n", |
3830 | | - " # Filter matches with similarity > 70 and not identical\n", |
3831 | | - " similar = [m for m in matches if m[1] > 70 and m[0] != country]\n", |
3832 | | - " if similar:\n", |
3833 | | - " print(f\"\\n'{country}' is similar to:\")\n", |
3834 | | - " for match, score, _ in similar:\n", |
3835 | | - " print(f\" - '{match}' (similarity: {score}%)\")" |
| 3829 | + "if process is not None and fuzz is not None:\n", |
| 3830 | + " print(\"Finding similar country names (similarity > 70%):\")\n", |
| 3831 | + " for country in unique_countries:\n", |
| 3832 | + " matches = process.extract(country, unique_countries, scorer=fuzz.ratio, limit=3)\n", |
| 3833 | + " # Filter matches with similarity > 70 and not identical\n", |
| 3834 | + " similar = [m for m in matches if m[1] > 70 and m[0] != country]\n", |
| 3835 | + " if similar:\n", |
| 3836 | + " print(f\"\\n'{country}' is similar to:\")\n", |
| 3837 | + " for match, score, _ in similar:\n", |
| 3838 | + " print(f\" - '{match}' (similarity: {score}%)\")\n", |
| 3839 | + "else:\n", |
| 3840 | + " print(\"Skipping fuzzy matching because rapidfuzz is not available.\")" |
3836 | 3841 | ] |
3837 | 3842 | }, |
3838 | 3843 | { |
|
0 commit comments