Commit 114372d

feat: refactoring of the dependencies
1 parent fa2a4aa · commit 114372d

8 files changed: 71 additions, 14 deletions

.agent/system/project_architecture.md

Lines changed: 16 additions & 4 deletions
@@ -85,7 +85,14 @@ scrapegraph-sdk/
 - **aiohttp** 3.10+ - Async HTTP client
 - **pydantic** 2.10.2+ - Data validation and modeling
 - **python-dotenv** 1.0.1+ - Environment variable management
-- **beautifulsoup4** 4.12.3+ - HTML parsing (for pagination)
+
+**Optional Dependencies:**
+- **beautifulsoup4** 4.12.3+ - HTML parsing (for HTML validation when using `website_html`)
+  - Install with: `pip install scrapegraph-py[html]`
+- **langchain** 0.3.0+ - Langchain integration for AI workflows
+- **langchain-community** 0.2.11+ - Community integrations for Langchain
+- **langchain-scrapegraph** 0.1.0+ - ScrapeGraph integration for Langchain
+  - Install with: `pip install scrapegraph-py[langchain]`

 **Development Tools:**
 - **pytest** 7.4.0+ - Testing framework

@@ -879,12 +886,17 @@ npm publish

 ### Python SDK Dependencies

-**Runtime:**
+**Core Runtime:**
 - **requests**: Sync HTTP client
 - **aiohttp**: Async HTTP client
 - **pydantic**: Data validation
 - **python-dotenv**: Environment variables
-- **beautifulsoup4**: HTML parsing
+
+**Optional Runtime (install with extras):**
+- **beautifulsoup4**: HTML parsing (required when using `website_html`)
+  - Install with: `pip install scrapegraph-py[html]`
+- **langchain, langchain-community, langchain-scrapegraph**: Langchain integration
+  - Install with: `pip install scrapegraph-py[langchain]`

 **Development:**
 - **pytest & plugins**: Testing framework

@@ -918,7 +930,7 @@ Both SDKs depend on the ScrapeGraph AI API:
 | **Architecture** | Class-based (Client, AsyncClient) | Function-based |
 | **Async Support** | ✅ Separate AsyncClient | ✅ All functions async |
 | **Type Safety** | ✅ Pydantic models, mypy | ⚠️ JSDoc comments |
-| **Dependencies** | 5 runtime deps | 0 runtime deps |
+| **Dependencies** | 4 core + 2 optional extras | 0 runtime deps |
 | **Testing** | pytest with mocking | Manual tests |
 | **Documentation** | MkDocs auto-generated | README examples |
 | **Package Size** | ~50KB | ~20KB |
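The comparison row above claims "4 core + 2 optional extras". A quick, non-authoritative way to sanity-check that split against the installed package metadata is to list the requirement strings and separate those gated behind an `extra ==` marker; a minimal sketch (not part of this commit; both distribution-name spellings used in this repo are tried):

```python
# Sketch: inspect which declared requirements are core and which belong to an
# optional extra (requirement strings carry an `extra == "..."` marker).
from importlib import metadata

reqs = []
for dist_name in ("scrapegraph-py", "scrapegraph_py"):  # both spellings appear in this repo
    try:
        reqs = metadata.requires(dist_name) or []
        break
    except metadata.PackageNotFoundError:
        continue

core = [r for r in reqs if "extra ==" not in r]
optional = [r for r in reqs if "extra ==" in r]
print("core:", core)          # per the doc above: requests, aiohttp, pydantic, python-dotenv
print("optional:", optional)  # beautifulsoup4 under [html]; langchain* under [langchain]
```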

.github/workflows/python-publish.yml

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ jobs:
           python -m pip install --upgrade pip
           pip install pytest pytest-asyncio responses
           cd scrapegraph-py
-          pip install -e .
+          pip install -e ".[html]"

       - name: Run mocked tests with coverage
         run: |

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ jobs:
           python -m pip install --upgrade pip
           pip install pytest pytest-asyncio responses
           cd scrapegraph-py
-          pip install -e .
+          pip install -e ".[html]"

       - name: Run mocked tests with coverage
         run: |

CLAUDE.md

Lines changed: 4 additions & 1 deletion
@@ -44,7 +44,10 @@ scrapegraph-sdk/
 ### Python SDK
 - **Language**: Python 3.10+
 - **Package Manager**: uv (recommended) or pip
-- **Dependencies**: requests, pydantic, python-dotenv, aiohttp, beautifulsoup4
+- **Core Dependencies**: requests, pydantic, python-dotenv, aiohttp
+- **Optional Dependencies**:
+  - `html`: beautifulsoup4 (for HTML validation when using `website_html`)
+  - `langchain`: langchain, langchain-community, langchain-scrapegraph (for Langchain integrations)
 - **Testing**: pytest, pytest-asyncio, pytest-mock, aioresponses
 - **Code Quality**: black, isort, ruff, mypy, pre-commit
 - **Documentation**: mkdocs, mkdocs-material

scrapegraph-py/README.md

Lines changed: 24 additions & 0 deletions
@@ -14,10 +14,33 @@ Official [Python SDK ](https://scrapegraphai.com) for the ScrapeGraph API - Smar

 ## 📦 Installation

+### Basic Installation
+
 ```bash
 pip install scrapegraph-py
 ```

+This installs the core SDK with minimal dependencies. The SDK is fully functional with just the core dependencies.
+
+### Optional Dependencies
+
+For specific use cases, you can install optional extras:
+
+**HTML Validation** (required when using `website_html` parameter):
+```bash
+pip install scrapegraph-py[html]
+```
+
+**Langchain Integration** (for using with Langchain/Langgraph):
+```bash
+pip install scrapegraph-py[langchain]
+```
+
+**All Optional Dependencies**:
+```bash
+pip install scrapegraph-py[html,langchain]
+```
+
 ## 🚀 Features

 - 🤖 AI-powered web scraping and search

@@ -58,6 +81,7 @@ response = client.smartscraper(
 )

 # Or using HTML content
+# Note: Using website_html requires the [html] extra: pip install scrapegraph-py[html]
 html_content = """
 <html>
 <body>

scrapegraph-py/TESTING.md

Lines changed: 4 additions & 2 deletions
@@ -39,9 +39,11 @@ Install test dependencies:
 ```bash
 cd scrapegraph-py
 pip install -r requirements-test.txt
-pip install -e .
+pip install -e ".[html]"
 ```

+**Note**: Tests require the `html` extra to be installed because they test HTML validation features. The `[html]` extra includes `beautifulsoup4` which is used for HTML validation in `SmartScraperRequest`.
+
 ### Basic Test Execution

 ```bash

@@ -255,7 +257,7 @@ The `pytest.ini` file configures:

 1. **Import Errors**
    ```bash
-   pip install -e .
+   pip install -e ".[html]"
    ```

 2. **Missing Dependencies**
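The note above can also be checked from the other direction: a hypothetical test sketch (not from the test suite) that simulates a missing `[html]` extra by flipping the module-level `HAS_BS4` flag introduced in this commit. The `user_prompt` field name and the assumption that the `ImportError` propagates out of the pydantic after-validator are assumptions, not guarantees.

```python
# Hypothetical test sketch: simulate an environment without beautifulsoup4 by
# monkeypatching the HAS_BS4 flag added in scrapegraph_py/models/smartscraper.py.
import pytest

from scrapegraph_py.models import smartscraper


def test_website_html_without_bs4_raises(monkeypatch):
    monkeypatch.setattr(smartscraper, "HAS_BS4", False)
    with pytest.raises(ImportError, match="beautifulsoup4 is required"):
        smartscraper.SmartScraperRequest(
            user_prompt="Extract the page title",  # assumed field name
            website_html="<html><body><h1>Hi</h1></body></html>",
        )
```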

scrapegraph-py/pyproject.toml

Lines changed: 8 additions & 4 deletions
@@ -3,8 +3,8 @@ name = "scrapegraph_py"
 version = "1.12.2"
 description = "ScrapeGraph Python SDK for API"
 authors = [
-    { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" },
-    { name = "Lorenzo Padoan", email = "lorenzo.padoan977@gmail.com" }
+    { name = "Marco Vinciguerra", email = "marco@scrapegraphai.com" },
+    { name = "Lorenzo Padoan", email = "lorenzo@scrapegraphai.com" }
 ]


@@ -41,11 +41,15 @@ dependencies = [
     "pydantic>=2.10.2",
     "python-dotenv>=1.0.1",
     "aiohttp>=3.10",
-    "requests>=2.32.3",
-    "beautifulsoup4>=4.12.3",
 ]

 [project.optional-dependencies]
+html = ["beautifulsoup4>=4.12.3"]
+langchain = [
+    "langchain>=0.3.0",
+    "langchain-community>=0.2.11",
+    "langchain-scrapegraph>=0.1.0",
+]
 docs = ["sphinx==6.0", "furo==2024.5.6"]

 [tool.uv]

scrapegraph-py/scrapegraph_py/models/smartscraper.py

Lines changed: 13 additions & 1 deletion
@@ -15,7 +15,12 @@
 from typing import Dict, Optional, Type
 from uuid import UUID

-from bs4 import BeautifulSoup
+try:
+    from bs4 import BeautifulSoup
+    HAS_BS4 = True
+except ImportError:
+    HAS_BS4 = False
+
 from pydantic import BaseModel, Field, conint, model_validator


@@ -122,11 +127,18 @@ def validate_url_and_html(self) -> "SmartScraperRequest":
         if self.website_html is not None:
             if len(self.website_html.encode("utf-8")) > 2 * 1024 * 1024:
                 raise ValueError("Website HTML content exceeds maximum size of 2MB")
+            if not HAS_BS4:
+                raise ImportError(
+                    "beautifulsoup4 is required for HTML validation. "
+                    "Install it with: pip install scrapegraph-py[html] or pip install beautifulsoup4"
+                )
             try:
                 soup = BeautifulSoup(self.website_html, "html.parser")
                 if not soup.find():
                     raise ValueError("Invalid HTML - no parseable content found")
             except Exception as e:
+                if isinstance(e, ImportError):
+                    raise
                 raise ValueError(f"Invalid HTML structure: {str(e)}")

         # Validate URL
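From a caller's point of view, the guard above turns a hard failure at import time into an actionable error only when `website_html` is actually used. A minimal usage sketch of that behaviour (the `user_prompt` field name is an assumption; exact output depends on which extras are installed):

```python
# Sketch of the caller-facing behaviour when the [html] extra is not installed.
from scrapegraph_py.models.smartscraper import SmartScraperRequest

try:
    SmartScraperRequest(
        user_prompt="Extract the main heading",  # assumed field name
        website_html="<html><body><h1>Hello</h1></body></html>",
    )
except ImportError as exc:
    # Raised only when beautifulsoup4 is missing; the message points at the fix:
    # pip install scrapegraph-py[html]
    print(exc)
```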

0 commit comments

Comments
 (0)