diff --git a/README.md b/README.md index dfe6c795..349b06c4 100644 --- a/README.md +++ b/README.md @@ -104,3 +104,4 @@ Disclaimer: Examples contributed by the community and partners do not represent | [Build a bank support agent with Pydantic AI and Mistral AI](third_party/PydanticAI/pydantic_bank_support_agent.ipynb)| Agent | Pydantic | | [Mistral and MLflow Tracing](third_party/MLflow/mistral-mlflow-tracing.ipynb) | Tracing, Observability | MLflow | | [Mistral OCR with Gradio](third_party/gradio/MistralOCR.md) | OCR | Gradio | +| [European Company Data Enrichment with Mistral AI and Serper](mistral/data_enrichment/european_company_enrichment_serper.ipynb) | Data Enrichment, Web Search | Serper | diff --git a/mistral/data_enrichment/.env.example b/mistral/data_enrichment/.env.example new file mode 100644 index 00000000..31412e97 --- /dev/null +++ b/mistral/data_enrichment/.env.example @@ -0,0 +1,2 @@ +MISTRAL_API_KEY=your_mistral_api_key_here +SERPER_API_KEY=your_serper_api_key_here \ No newline at end of file diff --git a/mistral/data_enrichment/european_company_enrichment_serper.ipynb b/mistral/data_enrichment/european_company_enrichment_serper.ipynb new file mode 100644 index 00000000..5eea74c2 --- /dev/null +++ b/mistral/data_enrichment/european_company_enrichment_serper.ipynb @@ -0,0 +1,513 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# European Company Data Enrichment with Mistral AI & Serper\n", + "\n", + "Author: Sascha Seniuk\n", + "GitHub: @saschaseniuk\n", + "Affiliation: AIscream - Peisker & Seniuk GbR\n", + "\n", + "A streamlined tool for enriching European company data using Mistral AI and Serper.dev for reliable web search."
# %% [markdown]
# ## Setup and Installation

# %%
# Notebook magic (shown commented, as in jupytext percent format).
# The pin matters: this notebook uses the legacy 0.x SDK surface
# (`MistralClient`, `ChatMessage`, `client.chat(...)`), which the 1.x
# SDK replaced with `Mistral` / `client.chat.complete(...)`.
# %pip install -q "mistralai<1.0" requests beautifulsoup4 trafilatura pydantic python-dotenv

# %%
import os
import json
import requests
from datetime import datetime
from getpass import getpass
from typing import List, Optional, Dict, Any, Literal
from pydantic import BaseModel, Field
from bs4 import BeautifulSoup
import trafilatura

from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

from dotenv import load_dotenv

# Colab exposes notebook secrets through `google.colab.userdata`; everywhere
# else the import fails and we rely on the environment / a local .env file.
try:
    from google.colab import userdata as _colab_userdata
except ImportError:
    _colab_userdata = None


def _load_api_key(name: str, prompt: str) -> str:
    """Resolve one API key: Colab secret -> environment/.env -> hidden prompt.

    Args:
        name: Secret / environment variable name, e.g. ``"SERPER_API_KEY"``.
        prompt: Text shown if the key has to be typed in interactively.

    Returns:
        The key as entered or found (may be empty if the user enters nothing).
    """
    key = None
    if _colab_userdata is not None:
        try:
            key = _colab_userdata.get(name)
        except Exception as exc:
            # Missing secret or notebook access not granted: fall through to
            # the other sources instead of aborting the whole setup cell.
            print(f"Colab secret {name} unavailable ({type(exc).__name__}); trying environment")
    key = key or os.getenv(name)
    if not key:
        # getpass keeps the key out of the saved cell output (input() echoes it).
        key = getpass(prompt)
    return key


load_dotenv()  # no-op when there is no .env file
MISTRAL_API_KEY = _load_api_key("MISTRAL_API_KEY", "Enter MISTRAL_API_KEY: ")
SERPER_API_KEY = _load_api_key(
    "SERPER_API_KEY", "Enter SERPER_API_KEY (get from https://serper.dev): "
)

mistral_client = MistralClient(api_key=MISTRAL_API_KEY)

# %% [markdown]
# ## Data Models

# %%
class CompanyProfile(BaseModel):
    """Enriched company record assembled from one or more scraped pages."""

    legal_name: str
    country: str
    vat_id: Optional[str] = None
    registration_number: Optional[str] = None
    address: Optional[str] = None
    phone: Optional[str] = None
    email: Optional[str] = None
    website: Optional[str] = None
    industry: Optional[str] = None
    description: Optional[str] = None
    # Share of the "critical" fields that were found (0.0 - 1.0); it measures
    # completeness, not verified correctness.
    confidence_score: float = 0.0
    # URLs of the pages an extraction succeeded on.
    sources: List[str] = Field(default_factory=list)


class SearchRequest(BaseModel):
    """Input for an enrichment run."""

    company_name: str
    country: Optional[str] = None  # DE, FR, ES, IT, etc.

# %% [markdown]
# ## Search and Scraping Functions

# %%
SERPER_SEARCH_URL = "https://google.serper.dev/search"

# European search query templates, keyed by upper-case country code.
SEARCH_TEMPLATES = {
    "DE": [
        '{company} impressum',
        '{company} kontakt adresse',
        '{company} offizielle website'
    ],
    "FR": [
        '{company} mentions légales',
        '{company} contact adresse',
        '{company} site officiel'
    ],
    "ES": [
        '{company} aviso legal',
        '{company} contacto dirección',
        '{company} sitio oficial'
    ],
    "IT": [
        '{company} note legali',
        '{company} contatti indirizzo',
        '{company} sito ufficiale'
    ],
    "default": [
        '{company} legal information',
        '{company} contact address',
        '{company} official website'
    ]
}


def search_company_urls(
    company_name: str,
    country: Optional[str] = None,
    max_queries: int = 2,
    results_per_query: int = 5,
    max_urls: int = 5,
) -> List[str]:
    """Search for candidate company pages with the Serper API.

    Args:
        company_name: Name substituted into the query templates.
        country: Country code such as ``"DE"``; matched case-insensitively.
            Unknown or missing codes use the ``"default"`` templates, and a
            missing code searches with ``gl="us"``.
        max_queries: How many templates to run (each one is a billed search).
        results_per_query: ``num`` value sent to Serper.
        max_urls: Upper bound on the number of URLs returned.

    Returns:
        De-duplicated result URLs in the order they were found. Failed
        searches are reported on stdout and skipped, so the list may be empty.
    """
    country_code = country.strip().upper() if country else None
    templates = SEARCH_TEMPLATES.get(country_code, SEARCH_TEMPLATES["default"])

    headers = {
        "X-API-KEY": SERPER_API_KEY,
        "Content-Type": "application/json"
    }

    found_urls = []
    for template in templates[:max_queries]:
        query = template.format(company=company_name)
        payload = {
            "q": query,
            "num": results_per_query,
            "gl": country_code.lower() if country_code else "us"
        }

        try:
            response = requests.post(
                SERPER_SEARCH_URL,
                json=payload,
                headers=headers,
                timeout=10
            )
            if response.status_code != 200:
                print(f"Search failed for '{query}': {response.status_code}")
                continue
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on older requests versions.
            print(f"Error searching '{query}': {e}")
            continue

        organic = data.get("organic", []) if isinstance(data, dict) else []
        # Skip malformed entries rather than losing the whole result page.
        links = [
            item["link"]
            for item in organic
            if isinstance(item, dict) and item.get("link")
        ]
        found_urls.extend(links)
        print(f"Found {len(links)} results for: {query}")

    # dict.fromkeys removes duplicates while preserving first-seen order.
    unique_urls = list(dict.fromkeys(found_urls))[:max_urls]
    print(f"Total unique URLs found: {len(unique_urls)}")
    return unique_urls


def scrape_url(url: str, max_chars: int = 5000) -> Dict[str, Any]:
    """Download a page and return its readable text.

    Args:
        url: Page to fetch.
        max_chars: Maximum number of characters of text to keep.

    Returns:
        ``{"url", "text", "success"}`` on success. On any failure the dict has
        ``success=False``, an empty ``text`` and an ``error`` message; this
        function never raises.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        content_type = response.headers.get("Content-Type", "").lower()
        if content_type and not any(
            kind in content_type for kind in ("html", "xml", "text/plain")
        ):
            # PDFs, images etc. would otherwise be decoded as text and sent
            # on to the LLM as garbage.
            return {
                "url": url,
                "text": "",
                "success": False,
                "error": f"Unsupported content type: {content_type}"
            }

        if "charset" not in content_type:
            # Without a declared charset requests assumes ISO-8859-1 for
            # text/*, which garbles umlauts and accents on UTF-8 pages.
            response.encoding = response.apparent_encoding
        html = response.text

        # Use trafilatura for clean text extraction
        text = trafilatura.extract(html, include_formatting=False)

        if not text:
            # Fallback to BeautifulSoup
            soup = BeautifulSoup(html, 'html.parser')
            for tag in soup(["script", "style"]):
                tag.decompose()
            text = soup.get_text(separator='\n', strip=True)

        return {
            "url": url,
            "text": text[:max_chars] if text else "",
            "success": True
        }

    except Exception as e:
        # Deliberately broad: network, HTTP and parser errors all become a
        # failure record so one bad page cannot stop an enrichment run.
        return {
            "url": url,
            "text": "",
            "success": False,
            "error": str(e)
        }
# %% [markdown]
# ## Mistral AI Data Extraction

# %%
MISTRAL_MODEL = "mistral-large-latest"

# Function-calling schema; built once because it is the same for every page.
_EXTRACTION_TOOL = {
    "type": "function",
    "function": {
        "name": "extract_company_info",
        "description": "Extract European company information from text",
        "parameters": {
            "type": "object",
            "properties": {
                "legal_name": {"type": "string", "description": "Official company name"},
                "vat_id": {"type": "string", "description": "VAT number (USt-IdNr, TVA, IVA, etc.)"},
                "registration_number": {"type": "string", "description": "Registration number (HRB, SIRET, etc.)"},
                "address": {"type": "string", "description": "Full business address"},
                "phone": {"type": "string", "description": "Phone number"},
                "email": {"type": "string", "description": "Email address"},
                "website": {"type": "string", "description": "Website URL"},
                "industry": {"type": "string", "description": "Industry sector"},
                "description": {"type": "string", "description": "Company description"}
            },
            "required": ["legal_name"]
        }
    }
}


def extract_company_data(
    text: str, company_name: str, source_url: str, max_chars: int = 2000
) -> Dict[str, Any]:
    """Extract company data from page text using Mistral AI function calling.

    Args:
        text: Scraped page text.
        company_name: Expected company, given to the model as a hint.
        source_url: URL the text came from; echoed back in the result.
        max_chars: How much of ``text`` is sent to the model.

    Returns:
        ``{"success": True, "data": {...}, "source": url}`` when the model
        returned a tool call with a JSON object, otherwise
        ``{"success": False, "data": {}, "source": url, "error": msg}``.
        This function never raises.
    """
    if not text or not text.strip():
        # Nothing to extract; avoid spending an API call on an empty page.
        return {"success": False, "data": {}, "source": source_url, "error": "Empty text"}

    system_prompt = f"""Extract company information from the text. The company is likely '{company_name}'.
Focus on legal registration data, VAT numbers, and contact information.
Only extract data explicitly stated in the text."""

    user_prompt = f"""Extract company information from this text:\n\n{text[:max_chars]}\n\nSource: {source_url}"""

    try:
        messages = [
            ChatMessage(role="system", content=system_prompt),
            ChatMessage(role="user", content=user_prompt)
        ]

        response = mistral_client.chat(
            model=MISTRAL_MODEL,
            messages=messages,
            tools=[_EXTRACTION_TOOL],
            tool_choice="any"
        )

        tool_calls = response.choices[0].message.tool_calls
        if not tool_calls:
            return {"success": False, "data": {}, "source": source_url, "error": "No extraction"}

        extracted_data = json.loads(tool_calls[0].function.arguments)
        if not isinstance(extracted_data, dict):
            return {
                "success": False,
                "data": {},
                "source": source_url,
                "error": "Tool arguments were not a JSON object"
            }
        return {"success": True, "data": extracted_data, "source": source_url}

    except Exception as e:
        # Deliberately broad: API, network and JSON errors all become a
        # failure record so the caller can continue with other pages.
        return {"success": False, "data": {}, "source": source_url, "error": str(e)}

# %% [markdown]
# ## Main Enrichment Function

# %%
# Fields whose presence drives the completeness-based confidence score.
_CRITICAL_FIELDS = ("legal_name", "vat_id", "address", "website")
_OPTIONAL_PROFILE_FIELDS = (
    "vat_id", "registration_number", "address", "phone",
    "email", "website", "industry", "description",
)


def _empty_profile(company_name: str, country: Optional[str]) -> "CompanyProfile":
    """Profile returned when nothing could be found for the company."""
    return CompanyProfile(
        legal_name=company_name,
        country=country or "Unknown",
        confidence_score=0.0
    )


def _normalize_value(value: Any) -> Optional[str]:
    """Turn one extracted value into a clean string, or None if it is empty."""
    if not value:
        return None
    if isinstance(value, str):
        return value.strip() or None
    if isinstance(value, (dict, list)):
        # The schema asks for strings, but models sometimes return objects.
        return json.dumps(value, ensure_ascii=False)
    return str(value)


def _merge_extractions(extractions: List[Dict[str, Any]]) -> tuple:
    """Merge per-page extractions; the first non-empty value per field wins.

    Returns a ``(merged_fields, source_urls)`` pair.
    """
    merged_data = {}
    sources = []
    for extraction in extractions:
        if extraction["source"] not in sources:
            sources.append(extraction["source"])
        for field, value in extraction["data"].items():
            cleaned = _normalize_value(value)
            if cleaned and field not in merged_data:
                merged_data[field] = cleaned
    return merged_data, sources


def enrich_company(company_name: str, country: str = None) -> CompanyProfile:
    """Search, scrape and extract to build a profile for one company.

    Args:
        company_name: Company to look up.
        country: Optional country code (e.g. ``"DE"``) used to pick search
            templates; stored on the profile, or ``"Unknown"`` if missing.

    Returns:
        A ``CompanyProfile``. If no URLs or no page text were found, the
        profile holds only the given name and country with confidence 0.0.
        The confidence is the share of critical fields (legal name, VAT ID,
        address, website) that were filled.
    """
    print(f"🔍 Enriching: {company_name} ({country or 'No country specified'})")

    # Step 1: Search for URLs
    urls = search_company_urls(company_name, country)
    if not urls:
        print("❌ No URLs found")
        return _empty_profile(company_name, country)

    # Step 2: Scrape content
    print(f"🕷️ Scraping {len(urls)} URLs...")
    scraped_data = []
    for url in urls:
        page = scrape_url(url)
        if page["success"] and page["text"]:
            scraped_data.append(page)
            print(f"  ✅ {url} - {len(page['text'])} chars")
        else:
            print(f"  ❌ {url} - {page.get('error', 'No content')}")

    if not scraped_data:
        print("❌ No content scraped")
        return _empty_profile(company_name, country)

    # Step 3: Extract data with Mistral AI
    print(f"🤖 Extracting data from {len(scraped_data)} pages...")
    extractions = []
    for item in scraped_data:
        outcome = extract_company_data(item["text"], company_name, item["url"])
        if outcome["success"]:
            extractions.append(outcome)
            print(f"  ✅ Extracted from {item['url']}")
        else:
            # Report failures so an API or auth problem is not mistaken for
            # "no data found".
            print(f"  ❌ {item['url']} - {outcome.get('error', 'Extraction failed')}")

    # Step 4: Merge and create profile
    merged_data, sources = _merge_extractions(extractions)

    # Calculate confidence based on data completeness
    filled = sum(1 for field in _CRITICAL_FIELDS if merged_data.get(field))
    confidence = filled / len(_CRITICAL_FIELDS)

    profile = CompanyProfile(
        legal_name=merged_data.get("legal_name") or company_name,
        country=country or "Unknown",
        confidence_score=confidence,
        sources=sources,
        **{field: merged_data.get(field) for field in _OPTIONAL_PROFILE_FIELDS}
    )

    print(f"✨ Enrichment complete! Confidence: {confidence:.1%}")
    return profile

# %% [markdown]
# ## Examples and Usage

# %%
def print_profile(profile: CompanyProfile) -> None:
    """Print a readable summary of an enriched profile."""
    print("\n" + "="*50)
    print(f"Company: {profile.legal_name}")
    print(f"Country: {profile.country}")
    print(f"VAT ID: {profile.vat_id}")
    print(f"Registration: {profile.registration_number}")
    print(f"Address: {profile.address}")
    print(f"Website: {profile.website}")
    print(f"Industry: {profile.industry}")
    print(f"Description: {profile.description}")
    print(f"Confidence: {profile.confidence_score:.1%}")
    print(f"Sources: {len(profile.sources)} pages")

# %%
# Example 1: German company
result = enrich_company("Siemens", "DE")
print_profile(result)

# %%
# Example 2: French company
result_fr = enrich_company("Mistral AI", "FR")
print_profile(result_fr)

# %%
# Batch processing example
companies = [
    {"name": "BMW", "country": "DE"},
    {"name": "LVMH", "country": "FR"},
    {"name": "Telefonica", "country": "ES"}
]

results = []
for company in companies:
    try:
        profile = enrich_company(company["name"], company["country"])
        results.append({
            "name": profile.legal_name,
            "country": profile.country,
            "vat_id": profile.vat_id,
            "website": profile.website,
            "confidence": f"{profile.confidence_score:.1%}"
        })
        print(f"✅ {company['name']} - {profile.confidence_score:.1%}")
    except Exception as e:
        # Keep the batch going; record the failure next to the successes.
        print(f"❌ {company['name']} - Error: {e}")
        results.append({"name": company["name"], "error": str(e)})

# Display results
print("\n" + "="*60)
print("BATCH RESULTS:")
print(json.dumps(results, indent=2, ensure_ascii=False))

# %% [markdown]
# ## Setup Instructions
#
# 1. **Get Serper API Key**: Go to https://serper.dev and sign up for a free account
# 2. **Create .env file** with:
#    ```
#    MISTRAL_API_KEY=your_mistral_key
#    SERPER_API_KEY=your_serper_key
#    ```
# 3. **Run the notebook** - it should work reliably!
#
# ## Key Features
#
# - ✅ **Reliable Search** with Serper.dev API (no Google blocking)
# - ✅ **European Language Support** (DE, FR, ES, IT)
# - ✅ **Mistral AI Function Calling** for structured extraction
# - ✅ **Clean Text Extraction** with trafilatura
# - ✅ **Confidence Scoring** based on data completeness
# - ✅ **Batch Processing** for multiple companies
# - ✅ **Slim & Fast** - under 200 lines of code!
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}