diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 8431e3e..46232ae 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -8,8 +8,53 @@ on: types: [published] jobs: - deploy: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Cache pip dependencies + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('**/pyproject.toml') }} + restore-keys: | + ${{ runner.os }}-pip-${{ matrix.python-version }}- + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest pytest-asyncio responses + cd scrapegraph-py + pip install -e . + + - name: Run tests with coverage + run: | + cd scrapegraph-py + python -m pytest tests/ -v --cov=scrapegraph_py --cov-report=xml --cov-report=term-missing + + - name: Run real API tests (if API key available) + run: | + cd scrapegraph-py + if [ -n "$SGAI_API_KEY" ]; then + python -m pytest tests/test_real_apis.py -v --tb=short + else + echo "SGAI_API_KEY not set, skipping real API tests" + fi + env: + SGAI_API_KEY: ${{ secrets.SGAI_API_KEY }} + deploy: + needs: test runs-on: ubuntu-latest steps: @@ -21,12 +66,13 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install setuptools wheel twine + pip install setuptools wheel twine build - name: Build and publish env: TWINE_USERNAME: mvincig11 TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | git fetch --all --tags - python setup.py sdist bdist_wheel + cd scrapegraph-py + python -m build twine upload dist/* \ No newline at end of file diff --git a/scrapegraph-py/tests/test_async_client.py b/scrapegraph-py/tests/test_async_client.py deleted file mode 100644 index f5e1dcc..0000000 --- a/scrapegraph-py/tests/test_async_client.py +++ /dev/null @@ -1,495 +0,0 @@ -from uuid import uuid4 - -import pytest -from aioresponses import aioresponses -from pydantic import BaseModel - -from scrapegraph_py.async_client import AsyncClient -from scrapegraph_py.exceptions import APIError -from tests.utils import generate_mock_api_key - - -@pytest.fixture -def mock_api_key(): - return generate_mock_api_key() - - -@pytest.fixture -def mock_uuid(): - return str(uuid4()) - - -@pytest.mark.asyncio -async def test_smartscraper_with_url(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"description": "Example domain."}, - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_url="https://example.com", user_prompt="Describe this page." - ) - assert response["status"] == "completed" - assert "description" in response["result"] - - -@pytest.mark.asyncio -async def test_smartscraper_with_html(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"description": "Test content."}, - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_html="
<html><body><p>Test content</p></body></html>
", - user_prompt="Extract info", - ) - assert response["status"] == "completed" - assert "description" in response["result"] - - -@pytest.mark.asyncio -async def test_smartscraper_with_headers(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"description": "Example domain."}, - }, - ) - - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_url="https://example.com", - user_prompt="Describe this page.", - headers=headers, - ) - assert response["status"] == "completed" - assert "description" in response["result"] - - -@pytest.mark.asyncio -async def test_get_credits(mock_api_key): - with aioresponses() as mocked: - mocked.get( - "https://api.scrapegraphai.com/v1/credits", - payload={"remaining_credits": 100, "total_credits_used": 50}, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_credits() - assert response["remaining_credits"] == 100 - assert response["total_credits_used"] == 50 - - -@pytest.mark.asyncio -async def test_submit_feedback(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/feedback", payload={"status": "success"} - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.submit_feedback( - request_id=str(uuid4()), rating=5, feedback_text="Great service!" - ) - assert response["status"] == "success" - - -@pytest.mark.asyncio -async def test_get_smartscraper(mock_api_key, mock_uuid): - with aioresponses() as mocked: - mocked.get( - f"https://api.scrapegraphai.com/v1/smartscraper/{mock_uuid}", - payload={ - "request_id": mock_uuid, - "status": "completed", - "result": {"data": "test"}, - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_smartscraper(mock_uuid) - assert response["status"] == "completed" - assert response["request_id"] == mock_uuid - - -@pytest.mark.asyncio -async def test_smartscraper_with_pagination(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": { - "products": [ - {"name": "Product 1", "price": "$10"}, - {"name": "Product 2", "price": "$20"}, - {"name": "Product 3", "price": "$30"}, - ] - }, - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=3 - ) - assert response["status"] == "completed" - assert "products" in response["result"] - assert len(response["result"]["products"]) == 3 - - -@pytest.mark.asyncio -async def test_smartscraper_with_pagination_and_scrolls(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": { - "products": [ - {"name": "Product 1", "price": "$10"}, - {"name": "Product 2", "price": "$20"}, - {"name": "Product 3", "price": "$30"}, - {"name": "Product 4", "price": "$40"}, - {"name": "Product 5", "price": "$50"}, - ] - }, - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - 
website_url="https://example.com/products", - user_prompt="Extract product information from paginated results", - total_pages=5, - number_of_scrolls=10 - ) - assert response["status"] == "completed" - assert "products" in response["result"] - assert len(response["result"]["products"]) == 5 - - -@pytest.mark.asyncio -async def test_smartscraper_with_pagination_and_all_features(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": { - "products": [ - {"name": "Product 1", "price": "$10", "rating": 4.5}, - {"name": "Product 2", "price": "$20", "rating": 4.0}, - ] - }, - }, - ) - - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - - class ProductSchema(BaseModel): - name: str - price: str - rating: float - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_url="https://example.com/products", - user_prompt="Extract product information with ratings", - headers=headers, - output_schema=ProductSchema, - number_of_scrolls=5, - total_pages=2 - ) - assert response["status"] == "completed" - assert "products" in response["result"] - - -@pytest.mark.asyncio -async def test_api_error(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - status=400, - payload={"error": "Bad request"}, - exception=APIError("Bad request", status_code=400), - ) - - async with AsyncClient(api_key=mock_api_key) as client: - with pytest.raises(APIError) as exc_info: - await client.smartscraper( - website_url="https://example.com", user_prompt="Describe this page." - ) - assert exc_info.value.status_code == 400 - assert "Bad request" in str(exc_info.value) - - -@pytest.mark.asyncio -async def test_markdownify(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/markdownify", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": "# Example Page\n\nThis is markdown content.", - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.markdownify(website_url="https://example.com") - assert response["status"] == "completed" - assert "# Example Page" in response["result"] - - -@pytest.mark.asyncio -async def test_markdownify_with_headers(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/markdownify", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": "# Example Page\n\nThis is markdown content.", - }, - ) - - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.markdownify( - website_url="https://example.com", headers=headers - ) - assert response["status"] == "completed" - assert "# Example Page" in response["result"] - - -@pytest.mark.asyncio -async def test_get_markdownify(mock_api_key, mock_uuid): - with aioresponses() as mocked: - mocked.get( - f"https://api.scrapegraphai.com/v1/markdownify/{mock_uuid}", - payload={ - "request_id": mock_uuid, - "status": "completed", - "result": "# Example Page\n\nThis is markdown content.", - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_markdownify(mock_uuid) - assert response["status"] == "completed" - assert response["request_id"] == mock_uuid - - -@pytest.mark.asyncio -async 
def test_searchscraper(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/searchscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"answer": "Python 3.12 is the latest version."}, - "reference_urls": ["https://www.python.org/downloads/"], - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.searchscraper( - user_prompt="What is the latest version of Python?" - ) - assert response["status"] == "completed" - assert "answer" in response["result"] - assert "reference_urls" in response - assert isinstance(response["reference_urls"], list) - - -@pytest.mark.asyncio -async def test_searchscraper_with_headers(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/searchscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"answer": "Python 3.12 is the latest version."}, - "reference_urls": ["https://www.python.org/downloads/"], - }, - ) - - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.searchscraper( - user_prompt="What is the latest version of Python?", - headers=headers, - ) - assert response["status"] == "completed" - assert "answer" in response["result"] - assert "reference_urls" in response - assert isinstance(response["reference_urls"], list) - - -@pytest.mark.asyncio -async def test_get_searchscraper(mock_api_key, mock_uuid): - with aioresponses() as mocked: - mocked.get( - f"https://api.scrapegraphai.com/v1/searchscraper/{mock_uuid}", - payload={ - "request_id": mock_uuid, - "status": "completed", - "result": {"answer": "Python 3.12 is the latest version."}, - "reference_urls": ["https://www.python.org/downloads/"], - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_searchscraper(mock_uuid) - assert response["status"] == "completed" - assert response["request_id"] == mock_uuid - assert "answer" in response["result"] - assert "reference_urls" in response - assert isinstance(response["reference_urls"], list) - - -@pytest.mark.asyncio -async def test_crawl(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/crawl", - payload={ - "id": str(uuid4()), - "status": "processing", - "message": "Crawl job started", - }, - ) - - schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "Test Schema", - "type": "object", - "properties": { - "name": {"type": "string"}, - "age": {"type": "integer"}, - }, - "required": ["name"], - } - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.crawl( - url="https://example.com", - prompt="Extract company information", - schema=schema, - cache_website=True, - depth=2, - max_pages=5, - same_domain_only=True, - batch_size=1, - ) - assert response["status"] == "processing" - assert "id" in response - - -@pytest.mark.asyncio -async def test_crawl_with_minimal_params(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/crawl", - payload={ - "id": str(uuid4()), - "status": "processing", - "message": "Crawl job started", - }, - ) - - schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "Test Schema", - "type": "object", - "properties": { - "name": {"type": "string"}, - }, - "required": ["name"], - } - - async with 
AsyncClient(api_key=mock_api_key) as client: - response = await client.crawl( - url="https://example.com", - prompt="Extract company information", - schema=schema, - ) - assert response["status"] == "processing" - assert "id" in response - - -@pytest.mark.asyncio -async def test_get_crawl(mock_api_key, mock_uuid): - with aioresponses() as mocked: - mocked.get( - f"https://api.scrapegraphai.com/v1/crawl/{mock_uuid}", - payload={ - "id": mock_uuid, - "status": "completed", - "result": { - "llm_result": { - "company": { - "name": "Example Corp", - "description": "A technology company", - }, - "services": [ - { - "service_name": "Web Development", - "description": "Custom web solutions", - } - ], - "legal": { - "privacy_policy": "Privacy policy content", - "terms_of_service": "Terms of service content", - }, - } - }, - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_crawl(mock_uuid) - assert response["status"] == "completed" - assert response["id"] == mock_uuid - assert "result" in response - assert "llm_result" in response["result"] diff --git a/scrapegraph-py/tests/test_client.py b/scrapegraph-py/tests/test_client.py deleted file mode 100644 index c7ad078..0000000 --- a/scrapegraph-py/tests/test_client.py +++ /dev/null @@ -1,498 +0,0 @@ -from uuid import uuid4 - -import pytest -import responses -from pydantic import BaseModel - -from scrapegraph_py.client import Client -from tests.utils import generate_mock_api_key - - -@pytest.fixture -def mock_api_key(): - return generate_mock_api_key() - - -@pytest.fixture -def mock_uuid(): - return str(uuid4()) - - -@responses.activate -def test_smartscraper_with_url(mock_api_key): - # Mock the API response - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"description": "Example domain."}, - }, - ) - - with Client(api_key=mock_api_key) as client: - response = client.smartscraper( - website_url="https://example.com", user_prompt="Describe this page." - ) - assert response["status"] == "completed" - - -@responses.activate -def test_smartscraper_with_html(mock_api_key): - # Mock the API response - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"description": "Test content."}, - }, - ) - - with Client(api_key=mock_api_key) as client: - response = client.smartscraper( - website_html="<html><body><p>Test content</p></body></html>
", - user_prompt="Extract info", - ) - assert response["status"] == "completed" - - -@responses.activate -def test_smartscraper_with_headers(mock_api_key): - # Mock the API response - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"description": "Example domain."}, - }, - ) - - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - - with Client(api_key=mock_api_key) as client: - response = client.smartscraper( - website_url="https://example.com", - user_prompt="Describe this page.", - headers=headers, - ) - assert response["status"] == "completed" - - -@responses.activate -def test_get_smartscraper(mock_api_key, mock_uuid): - responses.add( - responses.GET, - f"https://api.scrapegraphai.com/v1/smartscraper/{mock_uuid}", - json={ - "request_id": mock_uuid, - "status": "completed", - "result": {"data": "test"}, - }, - ) - - with Client(api_key=mock_api_key) as client: - response = client.get_smartscraper(mock_uuid) - assert response["status"] == "completed" - assert response["request_id"] == mock_uuid - - -@responses.activate -def test_smartscraper_with_pagination(mock_api_key): - # Mock the API response for pagination request - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "request_id": str(uuid4()), - "status": "completed", - "result": { - "products": [ - {"name": "Product 1", "price": "$10"}, - {"name": "Product 2", "price": "$20"}, - {"name": "Product 3", "price": "$30"}, - ] - }, - }, - ) - - with Client(api_key=mock_api_key) as client: - response = client.smartscraper( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=3 - ) - assert response["status"] == "completed" - assert "products" in response["result"] - assert len(response["result"]["products"]) == 3 - - -@responses.activate -def test_smartscraper_with_pagination_and_scrolls(mock_api_key): - # Mock the API response for pagination with scrolls - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "request_id": str(uuid4()), - "status": "completed", - "result": { - "products": [ - {"name": "Product 1", "price": "$10"}, - {"name": "Product 2", "price": "$20"}, - {"name": "Product 3", "price": "$30"}, - {"name": "Product 4", "price": "$40"}, - {"name": "Product 5", "price": "$50"}, - ] - }, - }, - ) - - with Client(api_key=mock_api_key) as client: - response = client.smartscraper( - website_url="https://example.com/products", - user_prompt="Extract product information from paginated results", - total_pages=5, - number_of_scrolls=10 - ) - assert response["status"] == "completed" - assert "products" in response["result"] - assert len(response["result"]["products"]) == 5 - - -@responses.activate -def test_smartscraper_with_pagination_and_all_features(mock_api_key): - # Mock the API response for pagination with all features - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "request_id": str(uuid4()), - "status": "completed", - "result": { - "products": [ - {"name": "Product 1", "price": "$10", "rating": 4.5}, - {"name": "Product 2", "price": "$20", "rating": 4.0}, - ] - }, - }, - ) - - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - - class ProductSchema(BaseModel): - name: str - price: str - rating: float - - with Client(api_key=mock_api_key) as client: - response = 
client.smartscraper( - website_url="https://example.com/products", - user_prompt="Extract product information with ratings", - headers=headers, - output_schema=ProductSchema, - number_of_scrolls=5, - total_pages=2 - ) - assert response["status"] == "completed" - assert "products" in response["result"] - - -@responses.activate -def test_get_credits(mock_api_key): - responses.add( - responses.GET, - "https://api.scrapegraphai.com/v1/credits", - json={"remaining_credits": 100, "total_credits_used": 50}, - ) - - with Client(api_key=mock_api_key) as client: - response = client.get_credits() - assert response["remaining_credits"] == 100 - assert response["total_credits_used"] == 50 - - -@responses.activate -def test_submit_feedback(mock_api_key): - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/feedback", - json={"status": "success"}, - ) - - with Client(api_key=mock_api_key) as client: - response = client.submit_feedback( - request_id=str(uuid4()), rating=5, feedback_text="Great service!" - ) - assert response["status"] == "success" - - -@responses.activate -def test_network_error(mock_api_key): - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - body=ConnectionError("Network error"), - ) - - with Client(api_key=mock_api_key) as client: - with pytest.raises(ConnectionError): - client.smartscraper( - website_url="https://example.com", user_prompt="Describe this page." - ) - - -@responses.activate -def test_markdownify(mock_api_key): - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/markdownify", - json={ - "request_id": str(uuid4()), - "status": "completed", - "result": "# Example Page\n\nThis is markdown content.", - }, - ) - - with Client(api_key=mock_api_key) as client: - response = client.markdownify(website_url="https://example.com") - assert response["status"] == "completed" - assert "# Example Page" in response["result"] - - -@responses.activate -def test_markdownify_with_headers(mock_api_key): - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/markdownify", - json={ - "request_id": str(uuid4()), - "status": "completed", - "result": "# Example Page\n\nThis is markdown content.", - }, - ) - - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - - with Client(api_key=mock_api_key) as client: - response = client.markdownify( - website_url="https://example.com", headers=headers - ) - assert response["status"] == "completed" - assert "# Example Page" in response["result"] - - -@responses.activate -def test_get_markdownify(mock_api_key, mock_uuid): - responses.add( - responses.GET, - f"https://api.scrapegraphai.com/v1/markdownify/{mock_uuid}", - json={ - "request_id": mock_uuid, - "status": "completed", - "result": "# Example Page\n\nThis is markdown content.", - }, - ) - - with Client(api_key=mock_api_key) as client: - response = client.get_markdownify(mock_uuid) - assert response["status"] == "completed" - assert response["request_id"] == mock_uuid - - -@responses.activate -def test_searchscraper(mock_api_key): - # Mock the API response - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/searchscraper", - json={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"answer": "Python 3.12 is the latest version."}, - "reference_urls": ["https://www.python.org/downloads/"], - }, - ) - - with Client(api_key=mock_api_key) as client: - response = client.searchscraper( - user_prompt="What is the latest version of Python?" 
- ) - assert response["status"] == "completed" - assert "answer" in response["result"] - assert "reference_urls" in response - assert isinstance(response["reference_urls"], list) - - -@responses.activate -def test_searchscraper_with_headers(mock_api_key): - # Mock the API response - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/searchscraper", - json={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"answer": "Python 3.12 is the latest version."}, - "reference_urls": ["https://www.python.org/downloads/"], - }, - ) - - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - - with Client(api_key=mock_api_key) as client: - response = client.searchscraper( - user_prompt="What is the latest version of Python?", - headers=headers, - ) - assert response["status"] == "completed" - assert "answer" in response["result"] - assert "reference_urls" in response - assert isinstance(response["reference_urls"], list) - - -@responses.activate -def test_get_searchscraper(mock_api_key, mock_uuid): - responses.add( - responses.GET, - f"https://api.scrapegraphai.com/v1/searchscraper/{mock_uuid}", - json={ - "request_id": mock_uuid, - "status": "completed", - "result": {"answer": "Python 3.12 is the latest version."}, - "reference_urls": ["https://www.python.org/downloads/"], - }, - ) - - with Client(api_key=mock_api_key) as client: - response = client.get_searchscraper(mock_uuid) - assert response["status"] == "completed" - assert response["request_id"] == mock_uuid - assert "answer" in response["result"] - assert "reference_urls" in response - assert isinstance(response["reference_urls"], list) - - -@responses.activate -def test_crawl(mock_api_key): - # Mock the API response - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/crawl", - json={ - "id": str(uuid4()), - "status": "processing", - "message": "Crawl job started", - }, - ) - - schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "Test Schema", - "type": "object", - "properties": { - "name": {"type": "string"}, - "age": {"type": "integer"}, - }, - "required": ["name"], - } - - with Client(api_key=mock_api_key) as client: - response = client.crawl( - url="https://example.com", - prompt="Extract company information", - schema=schema, - cache_website=True, - depth=2, - max_pages=5, - same_domain_only=True, - batch_size=1, - ) - assert response["status"] == "processing" - assert "id" in response - - -@responses.activate -def test_crawl_with_minimal_params(mock_api_key): - # Mock the API response - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/crawl", - json={ - "id": str(uuid4()), - "status": "processing", - "message": "Crawl job started", - }, - ) - - schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "Test Schema", - "type": "object", - "properties": { - "name": {"type": "string"}, - }, - "required": ["name"], - } - - with Client(api_key=mock_api_key) as client: - response = client.crawl( - url="https://example.com", - prompt="Extract company information", - schema=schema, - ) - assert response["status"] == "processing" - assert "id" in response - - -@responses.activate -def test_get_crawl(mock_api_key, mock_uuid): - responses.add( - responses.GET, - f"https://api.scrapegraphai.com/v1/crawl/{mock_uuid}", - json={ - "id": mock_uuid, - "status": "completed", - "result": { - "llm_result": { - "company": { - "name": "Example Corp", - "description": "A technology company", - }, - "services": [ - { - 
"service_name": "Web Development", - "description": "Custom web solutions", - } - ], - "legal": { - "privacy_policy": "Privacy policy content", - "terms_of_service": "Terms of service content", - }, - } - }, - }, - ) - - with Client(api_key=mock_api_key) as client: - response = client.get_crawl(mock_uuid) - assert response["status"] == "completed" - assert response["id"] == mock_uuid - assert "result" in response - assert "llm_result" in response["result"] diff --git a/scrapegraph-py/tests/test_comprehensive_apis.py b/scrapegraph-py/tests/test_comprehensive_apis.py deleted file mode 100644 index faa9ebf..0000000 --- a/scrapegraph-py/tests/test_comprehensive_apis.py +++ /dev/null @@ -1,830 +0,0 @@ -from uuid import uuid4 - -import pytest -import responses -from pydantic import BaseModel - -from scrapegraph_py.async_client import AsyncClient -from scrapegraph_py.client import Client -from tests.utils import generate_mock_api_key - - -@pytest.fixture -def mock_api_key(): - return generate_mock_api_key() - - -@pytest.fixture -def mock_uuid(): - return str(uuid4()) - - -class TestSchema(BaseModel): - """Test schema for output validation""" - title: str - description: str - price: float - - -# ============================================================================ -# SMART SCRAPER TESTS -# ============================================================================ - -@responses.activate -def test_smartscraper_basic_success(mock_api_key): - """Test basic smartscraper with URL - should return 200""" - mock_request_id = str(uuid4()) - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "request_id": mock_request_id, - "status": "completed", - "result": {"title": "Test Page", "description": "Test content"} - }, - status=200 - ) - - with Client(api_key=mock_api_key) as client: - response = client.smartscraper( - website_url="https://example.com", - user_prompt="Extract title and description" - ) - assert response["status"] == "completed" - assert response["request_id"] == mock_request_id - - -@responses.activate -def test_smartscraper_with_html_success(mock_api_key): - """Test smartscraper with HTML content - should return 200""" - mock_request_id = str(uuid4()) - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "request_id": mock_request_id, - "status": "completed", - "result": {"extracted_data": "Test data"} - }, - status=200 - ) - - with Client(api_key=mock_api_key) as client: - response = client.smartscraper( - website_html="Test content
", - user_prompt="Extract info", - ) - assert request.website_url is None - assert request.website_html == "Test content
" - assert request.user_prompt == "Extract info" - assert request.headers is None - - # Valid input with headers - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - request = SmartScraperRequest( - website_url="https://example.com", - user_prompt="Describe this page.", - headers=headers, - ) - assert request.headers == headers - - # Test with output_schema - request = SmartScraperRequest( - website_url="https://example.com", - user_prompt="Describe this page.", - output_schema=ExampleSchema, - ) - - # When we dump the model, the output_schema should be converted to a dict - dumped = request.model_dump() - assert isinstance(dumped["output_schema"], dict) - assert "properties" in dumped["output_schema"] - assert "name" in dumped["output_schema"]["properties"] - assert "age" in dumped["output_schema"]["properties"] - - # Invalid URL - with pytest.raises(ValidationError): - SmartScraperRequest( - website_url="invalid-url", user_prompt="Describe this page." - ) - - # Empty prompt - with pytest.raises(ValidationError): - SmartScraperRequest(website_url="https://example.com", user_prompt="") - - # Invalid HTML - with pytest.raises(ValidationError): - SmartScraperRequest( - website_html="not valid html", - user_prompt="Extract info", - ) - - # HTML too large (>2MB) - large_html = "x" * (2 * 1024 * 1024 + 1) - with pytest.raises(ValidationError): - SmartScraperRequest( - website_html=large_html, - user_prompt="Extract info", - ) - - # Neither URL nor HTML provided - with pytest.raises(ValidationError): - SmartScraperRequest(user_prompt="Extract info") - - -def test_get_smartscraper_request_validation(): - # Valid UUID - request = GetSmartScraperRequest(request_id="123e4567-e89b-12d3-a456-426614174000") - assert request.request_id == "123e4567-e89b-12d3-a456-426614174000" - - # Invalid UUID - with pytest.raises(ValidationError): - GetSmartScraperRequest(request_id="invalid-uuid") - - -def test_feedback_request_validation(): - # Valid input - request = FeedbackRequest( - request_id="123e4567-e89b-12d3-a456-426614174000", - rating=5, - feedback_text="Great service!", - ) - assert request.request_id == "123e4567-e89b-12d3-a456-426614174000" - assert request.rating == 5 - assert request.feedback_text == "Great service!" - - # Invalid rating - with pytest.raises(ValidationError): - FeedbackRequest( - request_id="123e4567-e89b-12d3-a456-426614174000", - rating=6, - feedback_text="Great service!", - ) - - # Invalid UUID - with pytest.raises(ValidationError): - FeedbackRequest( - request_id="invalid-uuid", rating=5, feedback_text="Great service!" 
- ) - - -def test_markdownify_request_validation(): - # Valid input without headers - request = MarkdownifyRequest(website_url="https://example.com") - assert request.website_url == "https://example.com" - assert request.headers is None - - # Valid input with headers - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - request = MarkdownifyRequest(website_url="https://example.com", headers=headers) - assert request.website_url == "https://example.com" - assert request.headers == headers - - # Invalid URL - with pytest.raises(ValidationError): - MarkdownifyRequest(website_url="invalid-url") - - # Empty URL - with pytest.raises(ValidationError): - MarkdownifyRequest(website_url="") - - -def test_get_markdownify_request_validation(): - # Valid UUID - request = GetMarkdownifyRequest(request_id="123e4567-e89b-12d3-a456-426614174000") - assert request.request_id == "123e4567-e89b-12d3-a456-426614174000" - - # Invalid UUID - with pytest.raises(ValidationError): - GetMarkdownifyRequest(request_id="invalid-uuid") - - -def test_searchscraper_request_validation(): - class ExampleSchema(BaseModel): - name: str - age: int - - # Valid input without headers - request = SearchScraperRequest(user_prompt="What is the latest version of Python?") - assert request.user_prompt == "What is the latest version of Python?" - assert request.headers is None - assert request.output_schema is None - - # Valid input with headers - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - request = SearchScraperRequest( - user_prompt="What is the latest version of Python?", - headers=headers, - ) - assert request.headers == headers - - # Test with output_schema - request = SearchScraperRequest( - user_prompt="What is the latest version of Python?", - output_schema=ExampleSchema, - ) - - # When we dump the model, the output_schema should be converted to a dict - dumped = request.model_dump() - assert isinstance(dumped["output_schema"], dict) - assert "properties" in dumped["output_schema"] - assert "name" in dumped["output_schema"]["properties"] - assert "age" in dumped["output_schema"]["properties"] - - # Empty prompt - with pytest.raises(ValidationError): - SearchScraperRequest(user_prompt="") - - # Invalid prompt (no alphanumeric characters) - with pytest.raises(ValidationError): - SearchScraperRequest(user_prompt="!@#$%^") - - -def test_get_searchscraper_request_validation(): - # Valid UUID - request = GetSearchScraperRequest(request_id="123e4567-e89b-12d3-a456-426614174000") - assert request.request_id == "123e4567-e89b-12d3-a456-426614174000" - - # Invalid UUID - with pytest.raises(ValidationError): - GetSearchScraperRequest(request_id="invalid-uuid") - - -def test_crawl_request_validation(): - # Example schema - schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "Test Schema", - "type": "object", - "properties": { - "name": {"type": "string"}, - "age": {"type": "integer"}, - }, - "required": ["name"], - } - - # Valid input with all parameters - request = CrawlRequest( - url="https://example.com", - prompt="Extract company information", - data_schema=schema, - cache_website=True, - depth=2, - max_pages=5, - same_domain_only=True, - batch_size=1, - ) - assert request.url == "https://example.com" - assert request.prompt == "Extract company information" - assert request.data_schema == schema - assert request.cache_website is True - assert request.depth == 2 - assert request.max_pages == 5 - assert request.same_domain_only is True - assert 
request.batch_size == 1 - - # Valid input with minimal parameters - request = CrawlRequest( - url="https://example.com", - prompt="Extract company information", - data_schema=schema, - ) - assert request.url == "https://example.com" - assert request.prompt == "Extract company information" - assert request.data_schema == schema - assert request.cache_website is True # default - assert request.depth == 2 # default - assert request.max_pages == 2 # default - assert request.same_domain_only is True # default - assert request.batch_size == 1 # default - - # Invalid URL - with pytest.raises(ValidationError): - CrawlRequest( - url="invalid-url", - prompt="Extract company information", - data_schema=schema, - ) - - # Empty URL - with pytest.raises(ValidationError): - CrawlRequest( - url="", - prompt="Extract company information", - data_schema=schema, - ) - - # Empty prompt - with pytest.raises(ValidationError): - CrawlRequest( - url="https://example.com", - prompt="", - data_schema=schema, - ) - - # Invalid prompt (no alphanumeric characters) - with pytest.raises(ValidationError): - CrawlRequest( - url="https://example.com", - prompt="!@#$%^", - data_schema=schema, - ) - - # Empty schema - with pytest.raises(ValidationError): - CrawlRequest( - url="https://example.com", - prompt="Extract company information", - data_schema={}, - ) - - # Invalid schema (not a dict) - with pytest.raises(ValidationError): - CrawlRequest( - url="https://example.com", - prompt="Extract company information", - data_schema="not a dict", - ) - - # Invalid depth (too low) - with pytest.raises(ValidationError): - CrawlRequest( - url="https://example.com", - prompt="Extract company information", - data_schema=schema, - depth=0, - ) - - # Invalid depth (too high) - with pytest.raises(ValidationError): - CrawlRequest( - url="https://example.com", - prompt="Extract company information", - data_schema=schema, - depth=11, - ) - - # Invalid max_pages (too low) - with pytest.raises(ValidationError): - CrawlRequest( - url="https://example.com", - prompt="Extract company information", - data_schema=schema, - max_pages=0, - ) - - # Invalid max_pages (too high) - with pytest.raises(ValidationError): - CrawlRequest( - url="https://example.com", - prompt="Extract company information", - data_schema=schema, - max_pages=101, - ) - - # Invalid batch_size (too low) - with pytest.raises(ValidationError): - CrawlRequest( - url="https://example.com", - prompt="Extract company information", - data_schema=schema, - batch_size=0, - ) - - # Invalid batch_size (too high) - with pytest.raises(ValidationError): - CrawlRequest( - url="https://example.com", - prompt="Extract company information", - data_schema=schema, - batch_size=11, - ) - - -def test_get_crawl_request_validation(): - # Valid UUID - request = GetCrawlRequest(crawl_id="123e4567-e89b-12d3-a456-426614174000") - assert request.crawl_id == "123e4567-e89b-12d3-a456-426614174000" - - # Invalid UUID - with pytest.raises(ValidationError): - GetCrawlRequest(crawl_id="invalid-uuid") diff --git a/scrapegraph-py/tests/test_models_fix.py b/scrapegraph-py/tests/test_models_fix.py deleted file mode 100644 index 6c1eb52..0000000 --- a/scrapegraph-py/tests/test_models_fix.py +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/env python3 -""" -Test script to verify that the Pydantic warning is resolved and models work correctly. 
-""" - -import warnings -from scrapegraph_py.models.crawl import CrawlRequest -from scrapegraph_py.models.smartscraper import SmartScraperRequest -from scrapegraph_py.models.searchscraper import SearchScraperRequest -from scrapegraph_py.models.markdownify import MarkdownifyRequest -from scrapegraph_py.models.feedback import FeedbackRequest - -# Capture warnings -warnings.simplefilter("always") - -def test_crawl_request(): - """Test CrawlRequest model""" - print("Testing CrawlRequest...") - - schema = { - "type": "object", - "properties": { - "name": {"type": "string"}, - "description": {"type": "string"} - } - } - - request = CrawlRequest( - url="https://example.com", - prompt="Test prompt", - data_schema=schema - ) - - # Test model_dump - data = request.model_dump() - print(f"โ CrawlRequest model_dump works: {len(data)} fields") - assert "data_schema" in data - assert "schema" not in data # Old field should not be present - -def test_smartscraper_request(): - """Test SmartScraperRequest model""" - print("Testing SmartScraperRequest...") - - # Test without number_of_scrolls (should be None) - request = SmartScraperRequest( - user_prompt="Test prompt", - website_url="https://example.com" - ) - - # Test model_dump - number_of_scrolls should be excluded when None - data = request.model_dump() - print(f"โ SmartScraperRequest model_dump works: {len(data)} fields") - assert "number_of_scrolls" not in data # Should be excluded when None - - # Test with number_of_scrolls - request_with_scrolls = SmartScraperRequest( - user_prompt="Test prompt", - website_url="https://example.com", - number_of_scrolls=5 - ) - - data_with_scrolls = request_with_scrolls.model_dump() - assert "number_of_scrolls" in data_with_scrolls # Should be included when not None - assert data_with_scrolls["number_of_scrolls"] == 5 - -def test_searchscraper_request(): - """Test SearchScraperRequest model""" - print("Testing SearchScraperRequest...") - - request = SearchScraperRequest( - user_prompt="Test prompt" - ) - - data = request.model_dump() - print(f"โ SearchScraperRequest model_dump works: {len(data)} fields") - assert "headers" not in data # Should be excluded when None - -def test_markdownify_request(): - """Test MarkdownifyRequest model""" - print("Testing MarkdownifyRequest...") - - request = MarkdownifyRequest( - website_url="https://example.com" - ) - - data = request.model_dump() - print(f"โ MarkdownifyRequest model_dump works: {len(data)} fields") - assert "headers" not in data # Should be excluded when None - -def test_feedback_request(): - """Test FeedbackRequest model""" - print("Testing FeedbackRequest...") - - request = FeedbackRequest( - request_id="123e4567-e89b-12d3-a456-426614174000", - rating=5 - ) - - data = request.model_dump() - print(f"โ FeedbackRequest model_dump works: {len(data)} fields") - assert "feedback_text" not in data # Should be excluded when None - -if __name__ == "__main__": - print("๐งช Testing Pydantic model fixes...") - - test_crawl_request() - test_smartscraper_request() - test_searchscraper_request() - test_markdownify_request() - test_feedback_request() - - print("\nโ All tests passed! 
The Pydantic warning should be resolved.") - print("🎉 Models now properly exclude None values from serialization.") \ No newline at end of file diff --git a/scrapegraph-py/tests/test_real_apis.py b/scrapegraph-py/tests/test_real_apis.py index 945479b..0559ed7 100644 --- a/scrapegraph-py/tests/test_real_apis.py +++ b/scrapegraph-py/tests/test_real_apis.py @@ -4,14 +4,14 @@ These tests use actual API calls with environment variables """ +import asyncio import os + import pytest -import asyncio -from uuid import uuid4 from pydantic import BaseModel -from scrapegraph_py.client import Client from scrapegraph_py.async_client import AsyncClient +from scrapegraph_py.client import Client @@ -251,121 +251,6 @@ def test_get_markdownify_status_real(): assert "request_id" in status_response -def test_crawl_basic_real(): - """Test basic crawl""" - if not os.getenv("SGAI_API_KEY"): - pytest.skip("SGAI_API_KEY not set") - - data_schema = { - "type": "object", - "properties": { - "title": {"type": "string"}, - "content": {"type": "string"} - } - } - - with Client.from_env() as client: - response = client.crawl( - url="https://example.com", - prompt="Extract page information", - data_schema=data_schema - ) - assert response["status"] in ["completed", "processing", "pending"] - assert "crawl_id" in response - - -def test_crawl_with_all_params_real(): - """Test crawl with all parameters""" - if not os.getenv("SGAI_API_KEY"): - pytest.skip("SGAI_API_KEY not set") - - data_schema = { - "type": "object", - "properties": { - "title": {"type": "string"}, - "description": {"type": "string"} - } - } - - with Client.from_env() as client: - response = client.crawl( - url="https://example.com", - prompt="Extract comprehensive page data", - data_schema=data_schema, - cache_website=True, - depth=2, - max_pages=3, - same_domain_only=True, - batch_size=5 - ) - assert response["status"] in ["completed", "processing", "pending"] - assert "crawl_id" in response - - -def test_get_crawl_status_real(): - """Test getting crawl status""" - if not os.getenv("SGAI_API_KEY"): - pytest.skip("SGAI_API_KEY not set") - - with Client.from_env() as client: - # First create a crawl request - data_schema = { - "type": "object", - "properties": { - "title": {"type": "string"} - } - } - - initial_response = client.crawl( - url="https://example.com", - prompt="Extract page titles", - data_schema=data_schema - ) - - crawl_id = initial_response["crawl_id"] - - # Then get the status - status_response = client.get_crawl(crawl_id) - assert "status" in status_response - assert "crawl_id" in status_response - - -def test_get_credits_real(): - """Test getting credits""" - if not os.getenv("SGAI_API_KEY"): - pytest.skip("SGAI_API_KEY not set") - - with Client.from_env() as client: - response = client.get_credits() - assert "credits" in response - assert "used_credits" in response - assert "remaining_credits" in response - - -def test_submit_feedback_real(): - """Test submitting feedback""" - if not os.getenv("SGAI_API_KEY"): - pytest.skip("SGAI_API_KEY not set") - - with Client.from_env() as client: - # First create a request to get a request_id - initial_response = client.smartscraper( - website_url="https://example.com", - user_prompt="Extract basic info" - ) - - request_id = initial_response["request_id"] - - # Submit feedback - feedback_response = client.submit_feedback( - request_id=request_id, - rating=5, - feedback_text="Great service! Very accurate results." 
- ) - assert "status" in feedback_response - - -def test_submit_feedback_without_text_real(): """Test submitting feedback without text""" if not os.getenv("SGAI_API_KEY"): pytest.skip("SGAI_API_KEY not set") @@ -448,88 +333,12 @@ async def test_async_markdownify_basic_real(): assert "request_id" in response -@pytest.mark.asyncio -async def test_async_crawl_basic_real(): - """Test basic async crawl""" - if not os.getenv("SGAI_API_KEY"): - pytest.skip("SGAI_API_KEY not set") - - data_schema = { - "type": "object", - "properties": { - "title": {"type": "string"}, - "content": {"type": "string"} - } - } - - async with AsyncClient.from_env() as client: - response = await client.crawl( - url="https://example.com", - prompt="Extract async page data", - data_schema=data_schema - ) - assert response["status"] in ["completed", "processing", "pending"] - assert "crawl_id" in response - - -@pytest.mark.asyncio -async def test_async_get_credits_real(): - """Test async get credits""" - if not os.getenv("SGAI_API_KEY"): - pytest.skip("SGAI_API_KEY not set") - - async with AsyncClient.from_env() as client: - response = await client.get_credits() - assert "credits" in response - assert "used_credits" in response - assert "remaining_credits" in response - - -@pytest.mark.asyncio -async def test_async_submit_feedback_real(): - """Test async submit feedback""" - if not os.getenv("SGAI_API_KEY"): - pytest.skip("SGAI_API_KEY not set") - - async with AsyncClient.from_env() as client: - # First create a request to get a request_id - initial_response = await client.smartscraper( - website_url="https://example.com", - user_prompt="Extract basic info for feedback" - ) - - request_id = initial_response["request_id"] - - # Submit feedback - feedback_response = await client.submit_feedback( - request_id=request_id, - rating=5, - feedback_text="Excellent async service!" 
- ) - assert "status" in feedback_response # ============================================================================ # CLIENT INITIALIZATION TESTS # ============================================================================ -def test_client_from_env_real(): - """Test client initialization from environment""" - if not os.getenv("SGAI_API_KEY"): - pytest.skip("SGAI_API_KEY not set") - - client = Client.from_env() - assert client.api_key == os.getenv("SGAI_API_KEY") - - -def test_async_client_from_env_real(): - """Test async client initialization from environment""" - if not os.getenv("SGAI_API_KEY"): - pytest.skip("SGAI_API_KEY not set") - - client = AsyncClient.from_env() - assert client.api_key == os.getenv("SGAI_API_KEY") - def test_client_context_manager_real(): """Test client context manager""" @@ -556,27 +365,6 @@ async def test_async_client_context_manager_real(): # ERROR HANDLING TESTS # ============================================================================ -def test_invalid_api_key_handling(): - """Test handling of invalid API key""" - # Temporarily set invalid API key - original_key = os.getenv("SGAI_API_KEY") - os.environ["SGAI_API_KEY"] = "invalid-key" - - try: - with Client.from_env() as client: - response = client.smartscraper( - website_url="https://example.com", - user_prompt="Test" - ) - # Should handle gracefully even with invalid key - assert "status" in response - finally: - # Restore original key - if original_key: - os.environ["SGAI_API_KEY"] = original_key - else: - del os.environ["SGAI_API_KEY"] - def test_missing_api_key_handling(): """Test handling of missing API key""" diff --git a/scrapegraph-py/tests/test_smartscraper.py b/scrapegraph-py/tests/test_smartscraper.py deleted file mode 100644 index 00eca9c..0000000 --- a/scrapegraph-py/tests/test_smartscraper.py +++ /dev/null @@ -1,175 +0,0 @@ -import pytest -from pydantic import BaseModel, ValidationError -from scrapegraph_py.models.smartscraper import SmartScraperRequest, GetSmartScraperRequest - -# Define a dummy schema to test the output_schema conversion in model_dump -class DummySchema(BaseModel): - """A dummy schema to simulate a Pydantic model with JSON schema conversion.""" - a: int = 1 - -def test_model_dump_with_output_schema_conversion(): - """ - Test that model_dump on SmartScraperRequest converts the provided output_schema into a JSON schema dict. - """ - # Create a request with a valid user prompt, website URL, and a dummy output_schema. - request = SmartScraperRequest( - user_prompt="Extract information about the company", - website_url="https://scrapegraphai.com/", - output_schema=DummySchema - ) - # Get the dump dict from the model. - output = request.model_dump() - # The model_dump should include the 'output_schema' converted to its JSON schema representation. - expected_schema = DummySchema.model_json_schema() - assert output.get("output_schema") == expected_schema - -def test_model_dump_without_output_schema(): - """ - Test that model_dump on SmartScraperRequest returns output_schema as None - when no output_schema is provided. This ensures that the conversion logic is only - applied when output_schema is not None. - """ - # Create a valid SmartScraperRequest without providing an output_schema. - request = SmartScraperRequest( - user_prompt="Extract some meaningful data", - website_url="https://scrapegraphai.com/" - ) - # Get the dumped dictionary from the model. - output = request.model_dump() - # Ensure that the output contains the key "output_schema" and its value is None. 
- assert "output_schema" in output, "Output schema key should be present even if None" - assert output["output_schema"] is None, "Output schema should be None when not provided" - -def test_invalid_get_smartscraper_request_id(): - """ - Test that GetSmartScraperRequest raises a ValueError when provided with an invalid UUID. - This test ensures that the request_id field is validated correctly. - """ - with pytest.raises(ValueError, match="request_id must be a valid UUID"): - GetSmartScraperRequest(request_id="invalid-uuid") - - -def test_smartscraper_request_with_pagination(): - """ - Test SmartScraperRequest with pagination parameter. - This test ensures that the total_pages field is properly handled. - """ - # Test with valid pagination - request = SmartScraperRequest( - user_prompt="Extract product information", - website_url="https://example.com/products", - total_pages=5 - ) - - assert request.total_pages == 5 - - # Test model_dump includes pagination - output = request.model_dump() - assert output["total_pages"] == 5 - - # Test without pagination (default behavior) - request_no_pagination = SmartScraperRequest( - user_prompt="Extract product information", - website_url="https://example.com/products" - ) - - assert request_no_pagination.total_pages is None - - # Test model_dump excludes None pagination - output_no_pagination = request_no_pagination.model_dump() - assert "total_pages" not in output_no_pagination - - -def test_smartscraper_request_pagination_validation(): - """ - Test pagination validation constraints. - This test ensures that total_pages is properly validated. - """ - # Test minimum value - request = SmartScraperRequest( - user_prompt="Extract products", - website_url="https://example.com/products", - total_pages=1 - ) - assert request.total_pages == 1 - - # Test maximum value - request = SmartScraperRequest( - user_prompt="Extract products", - website_url="https://example.com/products", - total_pages=10 - ) - assert request.total_pages == 10 - - # Test invalid values - with pytest.raises(ValidationError): - SmartScraperRequest( - user_prompt="Extract products", - website_url="https://example.com/products", - total_pages=0 - ) - - with pytest.raises(ValidationError): - SmartScraperRequest( - user_prompt="Extract products", - website_url="https://example.com/products", - total_pages=11 - ) - - -def test_smartscraper_request_pagination_with_all_features(): - """ - Test pagination combined with other SmartScraper features. - This test ensures pagination works with output_schema, scrolls, and headers. - """ - headers = {"User-Agent": "test-agent"} - - request = SmartScraperRequest( - user_prompt="Extract all product information", - website_url="https://example.com/products", - headers=headers, - output_schema=DummySchema, - number_of_scrolls=5, - total_pages=3 - ) - - assert request.total_pages == 3 - assert request.number_of_scrolls == 5 - assert request.headers == headers - assert request.output_schema == DummySchema - - # Test model_dump with all features - output = request.model_dump() - assert output["total_pages"] == 3 - assert output["number_of_scrolls"] == 5 - assert output["headers"] == headers - assert isinstance(output["output_schema"], dict) - -def test_invalid_url_in_smartscraper_request(): - """ - Test that SmartScraperRequest raises a ValueError when provided with a website_url - that does not start with 'http://' or 'https://'. This ensures the URL validation works. 
- """ - with pytest.raises(ValueError, match="Invalid URL"): - SmartScraperRequest( - user_prompt="Extract data", - website_url="ftp://invalid-url" - ) - -def test_invalid_user_prompt_empty_and_non_alnum(): - """ - Test that SmartScraperRequest raises a ValueError when the user_prompt is either empty (or only whitespace) - or when it contains no alphanumeric characters. This ensures the user prompt validator is working correctly. - """ - # Test with a user_prompt that is empty (only whitespace) - with pytest.raises(ValueError, match="User prompt cannot be empty"): - SmartScraperRequest( - user_prompt=" ", - website_url="https://scrapegraphai.com/" - ) - # Test with a user_prompt that contains no alphanumeric characters - with pytest.raises(ValueError, match="User prompt must contain a valid prompt"): - SmartScraperRequest( - user_prompt="!!!", - website_url="https://scrapegraphai.com/" - ) diff --git a/scrapegraph-py/tests/test_smartscraper_pagination.py b/scrapegraph-py/tests/test_smartscraper_pagination.py deleted file mode 100644 index 53ffd56..0000000 --- a/scrapegraph-py/tests/test_smartscraper_pagination.py +++ /dev/null @@ -1,302 +0,0 @@ -import pytest -from pydantic import BaseModel, ValidationError -from scrapegraph_py.models.smartscraper import SmartScraperRequest, GetSmartScraperRequest - - -class TestProductSchema(BaseModel): - """Test schema for pagination tests""" - name: str - price: str - rating: float = None - - -class TestSmartScraperPagination: - """Test suite for SmartScraper pagination functionality""" - - def test_smartscraper_request_with_pagination(self): - """Test SmartScraperRequest with valid pagination parameters""" - request = SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=5 - ) - - assert request.website_url == "https://example.com/products" - assert request.user_prompt == "Extract product information" - assert request.total_pages == 5 - assert request.number_of_scrolls is None - assert request.output_schema is None - - def test_smartscraper_request_with_pagination_and_schema(self): - """Test SmartScraperRequest with pagination and output schema""" - request = SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=3, - output_schema=TestProductSchema - ) - - assert request.total_pages == 3 - assert request.output_schema == TestProductSchema - - # Test model_dump with pagination and schema - dumped = request.model_dump() - assert dumped["total_pages"] == 3 - assert isinstance(dumped["output_schema"], dict) - assert "properties" in dumped["output_schema"] - - def test_smartscraper_request_with_pagination_and_scrolls(self): - """Test SmartScraperRequest with both pagination and scrolling""" - request = SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=2, - number_of_scrolls=10 - ) - - assert request.total_pages == 2 - assert request.number_of_scrolls == 10 - - # Test model_dump excludes None values - dumped = request.model_dump() - assert dumped["total_pages"] == 2 - assert dumped["number_of_scrolls"] == 10 - assert "website_html" not in dumped # Should be excluded since it's None - - def test_smartscraper_request_pagination_validation_minimum(self): - """Test pagination validation - minimum value""" - # Valid minimum value - request = SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract product 
information", - total_pages=1 - ) - assert request.total_pages == 1 - - # Invalid minimum value (less than 1) - with pytest.raises(ValidationError) as exc_info: - SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=0 - ) - assert "greater than or equal to 1" in str(exc_info.value) - - def test_smartscraper_request_pagination_validation_maximum(self): - """Test pagination validation - maximum value""" - # Valid maximum value - request = SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=10 - ) - assert request.total_pages == 10 - - # Invalid maximum value (greater than 10) - with pytest.raises(ValidationError) as exc_info: - SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=11 - ) - assert "less than or equal to 10" in str(exc_info.value) - - def test_smartscraper_request_pagination_none_value(self): - """Test SmartScraperRequest with None pagination (default behavior)""" - request = SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=None - ) - - assert request.total_pages is None - - # Test model_dump excludes None values - dumped = request.model_dump() - assert "total_pages" not in dumped - - def test_smartscraper_request_pagination_with_html(self): - """Test pagination with HTML content instead of URL""" - html_content = """ - - -