diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 8431e3e..46232ae 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -8,8 +8,53 @@ on: types: [published] jobs: - deploy: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Cache pip dependencies + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('**/pyproject.toml') }} + restore-keys: | + ${{ runner.os }}-pip-${{ matrix.python-version }}- + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest pytest-asyncio pytest-cov responses + cd scrapegraph-py + pip install -e . + + - name: Run tests with coverage + run: | + cd scrapegraph-py + python -m pytest tests/ -v --cov=scrapegraph_py --cov-report=xml --cov-report=term-missing + + - name: Run real API tests (if API key available) + run: | + cd scrapegraph-py + if [ -n "$SGAI_API_KEY" ]; then + python -m pytest tests/test_real_apis.py -v --tb=short + else + echo "SGAI_API_KEY not set, skipping real API tests" + fi + env: + SGAI_API_KEY: ${{ secrets.SGAI_API_KEY }} + deploy: + needs: test runs-on: ubuntu-latest steps: @@ -21,12 +66,13 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install setuptools wheel twine + pip install setuptools wheel twine build - name: Build and publish env: TWINE_USERNAME: mvincig11 TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | git fetch --all --tags - python setup.py sdist bdist_wheel + cd scrapegraph-py + python -m build twine upload dist/* \ No newline at end of file diff --git a/scrapegraph-py/tests/test_async_client.py b/scrapegraph-py/tests/test_async_client.py deleted file mode 100644 index f5e1dcc..0000000 --- a/scrapegraph-py/tests/test_async_client.py +++ /dev/null @@ -1,495 +0,0 @@ -from uuid import uuid4 - -import pytest -from aioresponses import aioresponses -from pydantic import BaseModel - -from scrapegraph_py.async_client import AsyncClient -from scrapegraph_py.exceptions import APIError -from tests.utils import generate_mock_api_key - - -@pytest.fixture -def mock_api_key(): - return generate_mock_api_key() - - -@pytest.fixture -def mock_uuid(): - return str(uuid4()) - - -@pytest.mark.asyncio -async def test_smartscraper_with_url(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"description": "Example domain."}, - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_url="https://example.com", user_prompt="Describe this page." - ) - assert response["status"] == "completed" - assert "description" in response["result"] - - -@pytest.mark.asyncio -async def test_smartscraper_with_html(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"description": "Test content."}, - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_html="
<html><body><h1>Test content</h1></body></html>
", - user_prompt="Extract info", - ) - assert response["status"] == "completed" - assert "description" in response["result"] - - -@pytest.mark.asyncio -async def test_smartscraper_with_headers(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"description": "Example domain."}, - }, - ) - - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_url="https://example.com", - user_prompt="Describe this page.", - headers=headers, - ) - assert response["status"] == "completed" - assert "description" in response["result"] - - -@pytest.mark.asyncio -async def test_get_credits(mock_api_key): - with aioresponses() as mocked: - mocked.get( - "https://api.scrapegraphai.com/v1/credits", - payload={"remaining_credits": 100, "total_credits_used": 50}, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_credits() - assert response["remaining_credits"] == 100 - assert response["total_credits_used"] == 50 - - -@pytest.mark.asyncio -async def test_submit_feedback(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/feedback", payload={"status": "success"} - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.submit_feedback( - request_id=str(uuid4()), rating=5, feedback_text="Great service!" - ) - assert response["status"] == "success" - - -@pytest.mark.asyncio -async def test_get_smartscraper(mock_api_key, mock_uuid): - with aioresponses() as mocked: - mocked.get( - f"https://api.scrapegraphai.com/v1/smartscraper/{mock_uuid}", - payload={ - "request_id": mock_uuid, - "status": "completed", - "result": {"data": "test"}, - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_smartscraper(mock_uuid) - assert response["status"] == "completed" - assert response["request_id"] == mock_uuid - - -@pytest.mark.asyncio -async def test_smartscraper_with_pagination(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": { - "products": [ - {"name": "Product 1", "price": "$10"}, - {"name": "Product 2", "price": "$20"}, - {"name": "Product 3", "price": "$30"}, - ] - }, - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=3 - ) - assert response["status"] == "completed" - assert "products" in response["result"] - assert len(response["result"]["products"]) == 3 - - -@pytest.mark.asyncio -async def test_smartscraper_with_pagination_and_scrolls(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": { - "products": [ - {"name": "Product 1", "price": "$10"}, - {"name": "Product 2", "price": "$20"}, - {"name": "Product 3", "price": "$30"}, - {"name": "Product 4", "price": "$40"}, - {"name": "Product 5", "price": "$50"}, - ] - }, - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - 
website_url="https://example.com/products", - user_prompt="Extract product information from paginated results", - total_pages=5, - number_of_scrolls=10 - ) - assert response["status"] == "completed" - assert "products" in response["result"] - assert len(response["result"]["products"]) == 5 - - -@pytest.mark.asyncio -async def test_smartscraper_with_pagination_and_all_features(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": { - "products": [ - {"name": "Product 1", "price": "$10", "rating": 4.5}, - {"name": "Product 2", "price": "$20", "rating": 4.0}, - ] - }, - }, - ) - - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - - class ProductSchema(BaseModel): - name: str - price: str - rating: float - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_url="https://example.com/products", - user_prompt="Extract product information with ratings", - headers=headers, - output_schema=ProductSchema, - number_of_scrolls=5, - total_pages=2 - ) - assert response["status"] == "completed" - assert "products" in response["result"] - - -@pytest.mark.asyncio -async def test_api_error(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - status=400, - payload={"error": "Bad request"}, - exception=APIError("Bad request", status_code=400), - ) - - async with AsyncClient(api_key=mock_api_key) as client: - with pytest.raises(APIError) as exc_info: - await client.smartscraper( - website_url="https://example.com", user_prompt="Describe this page." - ) - assert exc_info.value.status_code == 400 - assert "Bad request" in str(exc_info.value) - - -@pytest.mark.asyncio -async def test_markdownify(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/markdownify", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": "# Example Page\n\nThis is markdown content.", - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.markdownify(website_url="https://example.com") - assert response["status"] == "completed" - assert "# Example Page" in response["result"] - - -@pytest.mark.asyncio -async def test_markdownify_with_headers(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/markdownify", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": "# Example Page\n\nThis is markdown content.", - }, - ) - - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.markdownify( - website_url="https://example.com", headers=headers - ) - assert response["status"] == "completed" - assert "# Example Page" in response["result"] - - -@pytest.mark.asyncio -async def test_get_markdownify(mock_api_key, mock_uuid): - with aioresponses() as mocked: - mocked.get( - f"https://api.scrapegraphai.com/v1/markdownify/{mock_uuid}", - payload={ - "request_id": mock_uuid, - "status": "completed", - "result": "# Example Page\n\nThis is markdown content.", - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_markdownify(mock_uuid) - assert response["status"] == "completed" - assert response["request_id"] == mock_uuid - - -@pytest.mark.asyncio -async 
def test_searchscraper(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/searchscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"answer": "Python 3.12 is the latest version."}, - "reference_urls": ["https://www.python.org/downloads/"], - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.searchscraper( - user_prompt="What is the latest version of Python?" - ) - assert response["status"] == "completed" - assert "answer" in response["result"] - assert "reference_urls" in response - assert isinstance(response["reference_urls"], list) - - -@pytest.mark.asyncio -async def test_searchscraper_with_headers(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/searchscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"answer": "Python 3.12 is the latest version."}, - "reference_urls": ["https://www.python.org/downloads/"], - }, - ) - - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.searchscraper( - user_prompt="What is the latest version of Python?", - headers=headers, - ) - assert response["status"] == "completed" - assert "answer" in response["result"] - assert "reference_urls" in response - assert isinstance(response["reference_urls"], list) - - -@pytest.mark.asyncio -async def test_get_searchscraper(mock_api_key, mock_uuid): - with aioresponses() as mocked: - mocked.get( - f"https://api.scrapegraphai.com/v1/searchscraper/{mock_uuid}", - payload={ - "request_id": mock_uuid, - "status": "completed", - "result": {"answer": "Python 3.12 is the latest version."}, - "reference_urls": ["https://www.python.org/downloads/"], - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_searchscraper(mock_uuid) - assert response["status"] == "completed" - assert response["request_id"] == mock_uuid - assert "answer" in response["result"] - assert "reference_urls" in response - assert isinstance(response["reference_urls"], list) - - -@pytest.mark.asyncio -async def test_crawl(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/crawl", - payload={ - "id": str(uuid4()), - "status": "processing", - "message": "Crawl job started", - }, - ) - - schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "Test Schema", - "type": "object", - "properties": { - "name": {"type": "string"}, - "age": {"type": "integer"}, - }, - "required": ["name"], - } - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.crawl( - url="https://example.com", - prompt="Extract company information", - schema=schema, - cache_website=True, - depth=2, - max_pages=5, - same_domain_only=True, - batch_size=1, - ) - assert response["status"] == "processing" - assert "id" in response - - -@pytest.mark.asyncio -async def test_crawl_with_minimal_params(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/crawl", - payload={ - "id": str(uuid4()), - "status": "processing", - "message": "Crawl job started", - }, - ) - - schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "Test Schema", - "type": "object", - "properties": { - "name": {"type": "string"}, - }, - "required": ["name"], - } - - async with 
AsyncClient(api_key=mock_api_key) as client: - response = await client.crawl( - url="https://example.com", - prompt="Extract company information", - schema=schema, - ) - assert response["status"] == "processing" - assert "id" in response - - -@pytest.mark.asyncio -async def test_get_crawl(mock_api_key, mock_uuid): - with aioresponses() as mocked: - mocked.get( - f"https://api.scrapegraphai.com/v1/crawl/{mock_uuid}", - payload={ - "id": mock_uuid, - "status": "completed", - "result": { - "llm_result": { - "company": { - "name": "Example Corp", - "description": "A technology company", - }, - "services": [ - { - "service_name": "Web Development", - "description": "Custom web solutions", - } - ], - "legal": { - "privacy_policy": "Privacy policy content", - "terms_of_service": "Terms of service content", - }, - } - }, - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_crawl(mock_uuid) - assert response["status"] == "completed" - assert response["id"] == mock_uuid - assert "result" in response - assert "llm_result" in response["result"] diff --git a/scrapegraph-py/tests/test_client.py b/scrapegraph-py/tests/test_client.py deleted file mode 100644 index c7ad078..0000000 --- a/scrapegraph-py/tests/test_client.py +++ /dev/null @@ -1,498 +0,0 @@ -from uuid import uuid4 - -import pytest -import responses -from pydantic import BaseModel - -from scrapegraph_py.client import Client -from tests.utils import generate_mock_api_key - - -@pytest.fixture -def mock_api_key(): - return generate_mock_api_key() - - -@pytest.fixture -def mock_uuid(): - return str(uuid4()) - - -@responses.activate -def test_smartscraper_with_url(mock_api_key): - # Mock the API response - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"description": "Example domain."}, - }, - ) - - with Client(api_key=mock_api_key) as client: - response = client.smartscraper( - website_url="https://example.com", user_prompt="Describe this page." - ) - assert response["status"] == "completed" - - -@responses.activate -def test_smartscraper_with_html(mock_api_key): - # Mock the API response - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"description": "Test content."}, - }, - ) - - with Client(api_key=mock_api_key) as client: - response = client.smartscraper( - website_html="
<html><body><h1>Test content</h1></body></html>
", - user_prompt="Extract info", - ) - assert response["status"] == "completed" - - -@responses.activate -def test_smartscraper_with_headers(mock_api_key): - # Mock the API response - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"description": "Example domain."}, - }, - ) - - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - - with Client(api_key=mock_api_key) as client: - response = client.smartscraper( - website_url="https://example.com", - user_prompt="Describe this page.", - headers=headers, - ) - assert response["status"] == "completed" - - -@responses.activate -def test_get_smartscraper(mock_api_key, mock_uuid): - responses.add( - responses.GET, - f"https://api.scrapegraphai.com/v1/smartscraper/{mock_uuid}", - json={ - "request_id": mock_uuid, - "status": "completed", - "result": {"data": "test"}, - }, - ) - - with Client(api_key=mock_api_key) as client: - response = client.get_smartscraper(mock_uuid) - assert response["status"] == "completed" - assert response["request_id"] == mock_uuid - - -@responses.activate -def test_smartscraper_with_pagination(mock_api_key): - # Mock the API response for pagination request - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "request_id": str(uuid4()), - "status": "completed", - "result": { - "products": [ - {"name": "Product 1", "price": "$10"}, - {"name": "Product 2", "price": "$20"}, - {"name": "Product 3", "price": "$30"}, - ] - }, - }, - ) - - with Client(api_key=mock_api_key) as client: - response = client.smartscraper( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=3 - ) - assert response["status"] == "completed" - assert "products" in response["result"] - assert len(response["result"]["products"]) == 3 - - -@responses.activate -def test_smartscraper_with_pagination_and_scrolls(mock_api_key): - # Mock the API response for pagination with scrolls - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "request_id": str(uuid4()), - "status": "completed", - "result": { - "products": [ - {"name": "Product 1", "price": "$10"}, - {"name": "Product 2", "price": "$20"}, - {"name": "Product 3", "price": "$30"}, - {"name": "Product 4", "price": "$40"}, - {"name": "Product 5", "price": "$50"}, - ] - }, - }, - ) - - with Client(api_key=mock_api_key) as client: - response = client.smartscraper( - website_url="https://example.com/products", - user_prompt="Extract product information from paginated results", - total_pages=5, - number_of_scrolls=10 - ) - assert response["status"] == "completed" - assert "products" in response["result"] - assert len(response["result"]["products"]) == 5 - - -@responses.activate -def test_smartscraper_with_pagination_and_all_features(mock_api_key): - # Mock the API response for pagination with all features - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "request_id": str(uuid4()), - "status": "completed", - "result": { - "products": [ - {"name": "Product 1", "price": "$10", "rating": 4.5}, - {"name": "Product 2", "price": "$20", "rating": 4.0}, - ] - }, - }, - ) - - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - - class ProductSchema(BaseModel): - name: str - price: str - rating: float - - with Client(api_key=mock_api_key) as client: - response = 
client.smartscraper( - website_url="https://example.com/products", - user_prompt="Extract product information with ratings", - headers=headers, - output_schema=ProductSchema, - number_of_scrolls=5, - total_pages=2 - ) - assert response["status"] == "completed" - assert "products" in response["result"] - - -@responses.activate -def test_get_credits(mock_api_key): - responses.add( - responses.GET, - "https://api.scrapegraphai.com/v1/credits", - json={"remaining_credits": 100, "total_credits_used": 50}, - ) - - with Client(api_key=mock_api_key) as client: - response = client.get_credits() - assert response["remaining_credits"] == 100 - assert response["total_credits_used"] == 50 - - -@responses.activate -def test_submit_feedback(mock_api_key): - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/feedback", - json={"status": "success"}, - ) - - with Client(api_key=mock_api_key) as client: - response = client.submit_feedback( - request_id=str(uuid4()), rating=5, feedback_text="Great service!" - ) - assert response["status"] == "success" - - -@responses.activate -def test_network_error(mock_api_key): - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - body=ConnectionError("Network error"), - ) - - with Client(api_key=mock_api_key) as client: - with pytest.raises(ConnectionError): - client.smartscraper( - website_url="https://example.com", user_prompt="Describe this page." - ) - - -@responses.activate -def test_markdownify(mock_api_key): - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/markdownify", - json={ - "request_id": str(uuid4()), - "status": "completed", - "result": "# Example Page\n\nThis is markdown content.", - }, - ) - - with Client(api_key=mock_api_key) as client: - response = client.markdownify(website_url="https://example.com") - assert response["status"] == "completed" - assert "# Example Page" in response["result"] - - -@responses.activate -def test_markdownify_with_headers(mock_api_key): - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/markdownify", - json={ - "request_id": str(uuid4()), - "status": "completed", - "result": "# Example Page\n\nThis is markdown content.", - }, - ) - - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - - with Client(api_key=mock_api_key) as client: - response = client.markdownify( - website_url="https://example.com", headers=headers - ) - assert response["status"] == "completed" - assert "# Example Page" in response["result"] - - -@responses.activate -def test_get_markdownify(mock_api_key, mock_uuid): - responses.add( - responses.GET, - f"https://api.scrapegraphai.com/v1/markdownify/{mock_uuid}", - json={ - "request_id": mock_uuid, - "status": "completed", - "result": "# Example Page\n\nThis is markdown content.", - }, - ) - - with Client(api_key=mock_api_key) as client: - response = client.get_markdownify(mock_uuid) - assert response["status"] == "completed" - assert response["request_id"] == mock_uuid - - -@responses.activate -def test_searchscraper(mock_api_key): - # Mock the API response - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/searchscraper", - json={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"answer": "Python 3.12 is the latest version."}, - "reference_urls": ["https://www.python.org/downloads/"], - }, - ) - - with Client(api_key=mock_api_key) as client: - response = client.searchscraper( - user_prompt="What is the latest version of Python?" 
- ) - assert response["status"] == "completed" - assert "answer" in response["result"] - assert "reference_urls" in response - assert isinstance(response["reference_urls"], list) - - -@responses.activate -def test_searchscraper_with_headers(mock_api_key): - # Mock the API response - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/searchscraper", - json={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"answer": "Python 3.12 is the latest version."}, - "reference_urls": ["https://www.python.org/downloads/"], - }, - ) - - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - - with Client(api_key=mock_api_key) as client: - response = client.searchscraper( - user_prompt="What is the latest version of Python?", - headers=headers, - ) - assert response["status"] == "completed" - assert "answer" in response["result"] - assert "reference_urls" in response - assert isinstance(response["reference_urls"], list) - - -@responses.activate -def test_get_searchscraper(mock_api_key, mock_uuid): - responses.add( - responses.GET, - f"https://api.scrapegraphai.com/v1/searchscraper/{mock_uuid}", - json={ - "request_id": mock_uuid, - "status": "completed", - "result": {"answer": "Python 3.12 is the latest version."}, - "reference_urls": ["https://www.python.org/downloads/"], - }, - ) - - with Client(api_key=mock_api_key) as client: - response = client.get_searchscraper(mock_uuid) - assert response["status"] == "completed" - assert response["request_id"] == mock_uuid - assert "answer" in response["result"] - assert "reference_urls" in response - assert isinstance(response["reference_urls"], list) - - -@responses.activate -def test_crawl(mock_api_key): - # Mock the API response - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/crawl", - json={ - "id": str(uuid4()), - "status": "processing", - "message": "Crawl job started", - }, - ) - - schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "Test Schema", - "type": "object", - "properties": { - "name": {"type": "string"}, - "age": {"type": "integer"}, - }, - "required": ["name"], - } - - with Client(api_key=mock_api_key) as client: - response = client.crawl( - url="https://example.com", - prompt="Extract company information", - schema=schema, - cache_website=True, - depth=2, - max_pages=5, - same_domain_only=True, - batch_size=1, - ) - assert response["status"] == "processing" - assert "id" in response - - -@responses.activate -def test_crawl_with_minimal_params(mock_api_key): - # Mock the API response - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/crawl", - json={ - "id": str(uuid4()), - "status": "processing", - "message": "Crawl job started", - }, - ) - - schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "Test Schema", - "type": "object", - "properties": { - "name": {"type": "string"}, - }, - "required": ["name"], - } - - with Client(api_key=mock_api_key) as client: - response = client.crawl( - url="https://example.com", - prompt="Extract company information", - schema=schema, - ) - assert response["status"] == "processing" - assert "id" in response - - -@responses.activate -def test_get_crawl(mock_api_key, mock_uuid): - responses.add( - responses.GET, - f"https://api.scrapegraphai.com/v1/crawl/{mock_uuid}", - json={ - "id": mock_uuid, - "status": "completed", - "result": { - "llm_result": { - "company": { - "name": "Example Corp", - "description": "A technology company", - }, - "services": [ - { - 
"service_name": "Web Development", - "description": "Custom web solutions", - } - ], - "legal": { - "privacy_policy": "Privacy policy content", - "terms_of_service": "Terms of service content", - }, - } - }, - }, - ) - - with Client(api_key=mock_api_key) as client: - response = client.get_crawl(mock_uuid) - assert response["status"] == "completed" - assert response["id"] == mock_uuid - assert "result" in response - assert "llm_result" in response["result"] diff --git a/scrapegraph-py/tests/test_comprehensive_apis.py b/scrapegraph-py/tests/test_comprehensive_apis.py deleted file mode 100644 index faa9ebf..0000000 --- a/scrapegraph-py/tests/test_comprehensive_apis.py +++ /dev/null @@ -1,830 +0,0 @@ -from uuid import uuid4 - -import pytest -import responses -from pydantic import BaseModel - -from scrapegraph_py.async_client import AsyncClient -from scrapegraph_py.client import Client -from tests.utils import generate_mock_api_key - - -@pytest.fixture -def mock_api_key(): - return generate_mock_api_key() - - -@pytest.fixture -def mock_uuid(): - return str(uuid4()) - - -class TestSchema(BaseModel): - """Test schema for output validation""" - title: str - description: str - price: float - - -# ============================================================================ -# SMART SCRAPER TESTS -# ============================================================================ - -@responses.activate -def test_smartscraper_basic_success(mock_api_key): - """Test basic smartscraper with URL - should return 200""" - mock_request_id = str(uuid4()) - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "request_id": mock_request_id, - "status": "completed", - "result": {"title": "Test Page", "description": "Test content"} - }, - status=200 - ) - - with Client(api_key=mock_api_key) as client: - response = client.smartscraper( - website_url="https://example.com", - user_prompt="Extract title and description" - ) - assert response["status"] == "completed" - assert response["request_id"] == mock_request_id - - -@responses.activate -def test_smartscraper_with_html_success(mock_api_key): - """Test smartscraper with HTML content - should return 200""" - mock_request_id = str(uuid4()) - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "request_id": mock_request_id, - "status": "completed", - "result": {"extracted_data": "Test data"} - }, - status=200 - ) - - with Client(api_key=mock_api_key) as client: - response = client.smartscraper( - website_html="
<html><body><h1>Test</h1></body></html>
", - user_prompt="Extract data" - ) - assert response["status"] == "completed" - - -@responses.activate -def test_smartscraper_with_headers_success(mock_api_key): - """Test smartscraper with custom headers - should return 200""" - mock_request_id = str(uuid4()) - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "request_id": mock_request_id, - "status": "completed", - "result": {"data": "Header test"} - }, - status=200 - ) - - headers = {"User-Agent": "Test Agent", "Cookie": "session=123"} - - with Client(api_key=mock_api_key) as client: - response = client.smartscraper( - website_url="https://example.com", - user_prompt="Extract data", - headers=headers - ) - assert response["status"] == "completed" - - -@responses.activate -def test_smartscraper_with_cookies_success(mock_api_key): - """Test smartscraper with cookies - should return 200""" - mock_request_id = str(uuid4()) - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "request_id": mock_request_id, - "status": "completed", - "result": {"data": "Cookie test"} - }, - status=200 - ) - - cookies = {"session": "abc123", "user": "test"} - - with Client(api_key=mock_api_key) as client: - response = client.smartscraper( - website_url="https://example.com", - user_prompt="Extract data", - cookies=cookies - ) - assert response["status"] == "completed" - - -@responses.activate -def test_smartscraper_with_schema_success(mock_api_key): - """Test smartscraper with output schema - should return 200""" - mock_request_id = str(uuid4()) - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "request_id": mock_request_id, - "status": "completed", - "result": {"title": "Test", "description": "Desc", "price": 99.99} - }, - status=200 - ) - - with Client(api_key=mock_api_key) as client: - response = client.smartscraper( - website_url="https://example.com", - user_prompt="Extract product info", - output_schema=TestSchema - ) - assert response["status"] == "completed" - - -@responses.activate -def test_smartscraper_with_scrolls_success(mock_api_key): - """Test smartscraper with scrolls - should return 200""" - mock_request_id = str(uuid4()) - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "request_id": mock_request_id, - "status": "completed", - "result": {"scrolled_data": "Scroll test"} - }, - status=200 - ) - - with Client(api_key=mock_api_key) as client: - response = client.smartscraper( - website_url="https://example.com", - user_prompt="Extract data", - number_of_scrolls=5 - ) - assert response["status"] == "completed" - - -@responses.activate -def test_smartscraper_with_pagination_success(mock_api_key): - """Test smartscraper with pagination - should return 200""" - mock_request_id = str(uuid4()) - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "request_id": mock_request_id, - "status": "completed", - "result": {"paginated_data": "Pagination test"} - }, - status=200 - ) - - with Client(api_key=mock_api_key) as client: - response = client.smartscraper( - website_url="https://example.com", - user_prompt="Extract data", - total_pages=3 - ) - assert response["status"] == "completed" - - -@responses.activate -def test_get_smartscraper_success(mock_api_key, mock_uuid): - """Test get smartscraper status - should return 200""" - responses.add( - responses.GET, - f"https://api.scrapegraphai.com/v1/smartscraper/{mock_uuid}", - json={ - 
"request_id": mock_uuid, - "status": "completed", - "result": {"data": "Retrieved data"} - }, - status=200 - ) - - with Client(api_key=mock_api_key) as client: - response = client.get_smartscraper(mock_uuid) - assert response["status"] == "completed" - assert response["request_id"] == mock_uuid - - -# ============================================================================ -# SEARCH SCRAPER TESTS -# ============================================================================ - -@responses.activate -def test_searchscraper_basic_success(mock_api_key): - """Test basic searchscraper - should return 200""" - mock_request_id = str(uuid4()) - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/searchscraper", - json={ - "request_id": mock_request_id, - "status": "completed", - "result": {"search_results": "Test results"} - }, - status=200 - ) - - with Client(api_key=mock_api_key) as client: - response = client.searchscraper( - user_prompt="Search for products" - ) - assert response["status"] == "completed" - - -@responses.activate -def test_searchscraper_with_num_results_success(mock_api_key): - """Test searchscraper with num_results - should return 200""" - mock_request_id = str(uuid4()) - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/searchscraper", - json={ - "request_id": mock_request_id, - "status": "completed", - "result": {"results": ["result1", "result2", "result3"]} - }, - status=200 - ) - - with Client(api_key=mock_api_key) as client: - response = client.searchscraper( - user_prompt="Search for products", - num_results=3 - ) - assert response["status"] == "completed" - - -@responses.activate -def test_searchscraper_with_headers_success(mock_api_key): - """Test searchscraper with headers - should return 200""" - mock_request_id = str(uuid4()) - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/searchscraper", - json={ - "request_id": mock_request_id, - "status": "completed", - "result": {"results": "Header test results"} - }, - status=200 - ) - - headers = {"User-Agent": "Search Agent"} - - with Client(api_key=mock_api_key) as client: - response = client.searchscraper( - user_prompt="Search for products", - headers=headers - ) - assert response["status"] == "completed" - - -@responses.activate -def test_searchscraper_with_schema_success(mock_api_key): - """Test searchscraper with output schema - should return 200""" - mock_request_id = str(uuid4()) - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/searchscraper", - json={ - "request_id": mock_request_id, - "status": "completed", - "result": {"title": "Search Result", "description": "Desc", "price": 50.0} - }, - status=200 - ) - - with Client(api_key=mock_api_key) as client: - response = client.searchscraper( - user_prompt="Search for products", - output_schema=TestSchema - ) - assert response["status"] == "completed" - - -@responses.activate -def test_get_searchscraper_success(mock_api_key, mock_uuid): - """Test get searchscraper status - should return 200""" - responses.add( - responses.GET, - f"https://api.scrapegraphai.com/v1/searchscraper/{mock_uuid}", - json={ - "request_id": mock_uuid, - "status": "completed", - "result": {"search_data": "Retrieved search data"} - }, - status=200 - ) - - with Client(api_key=mock_api_key) as client: - response = client.get_searchscraper(mock_uuid) - assert response["status"] == "completed" - assert response["request_id"] == mock_uuid - - -# ============================================================================ 
-# MARKDOWNIFY TESTS -# ============================================================================ - -@responses.activate -def test_markdownify_basic_success(mock_api_key): - """Test basic markdownify - should return 200""" - mock_request_id = str(uuid4()) - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/markdownify", - json={ - "request_id": mock_request_id, - "status": "completed", - "result": {"markdown": "# Test Page\n\nThis is test content."} - }, - status=200 - ) - - with Client(api_key=mock_api_key) as client: - response = client.markdownify("https://example.com") - assert response["status"] == "completed" - - -@responses.activate -def test_markdownify_with_headers_success(mock_api_key): - """Test markdownify with headers - should return 200""" - mock_request_id = str(uuid4()) - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/markdownify", - json={ - "request_id": mock_request_id, - "status": "completed", - "result": {"markdown": "# Header Test\n\nContent with headers."} - }, - status=200 - ) - - headers = {"User-Agent": "Markdown Agent"} - - with Client(api_key=mock_api_key) as client: - response = client.markdownify( - "https://example.com", - headers=headers - ) - assert response["status"] == "completed" - - -@responses.activate -def test_get_markdownify_success(mock_api_key, mock_uuid): - """Test get markdownify status - should return 200""" - responses.add( - responses.GET, - f"https://api.scrapegraphai.com/v1/markdownify/{mock_uuid}", - json={ - "request_id": mock_uuid, - "status": "completed", - "result": {"markdown": "# Retrieved Content\n\nMarkdown content."} - }, - status=200 - ) - - with Client(api_key=mock_api_key) as client: - response = client.get_markdownify(mock_uuid) - assert response["status"] == "completed" - assert response["request_id"] == mock_uuid - - -# ============================================================================ -# CRAWL TESTS -# ============================================================================ - -@responses.activate -def test_crawl_basic_success(mock_api_key): - """Test basic crawl - should return 200""" - mock_crawl_id = str(uuid4()) - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/crawl", - json={ - "crawl_id": mock_crawl_id, - "status": "completed", - "result": {"pages": ["page1", "page2"], "data": "Crawl data"} - }, - status=200 - ) - - data_schema = { - "type": "object", - "properties": { - "title": {"type": "string"}, - "content": {"type": "string"} - } - } - - with Client(api_key=mock_api_key) as client: - response = client.crawl( - url="https://example.com", - prompt="Extract page data", - data_schema=data_schema - ) - assert response["status"] == "completed" - - -@responses.activate -def test_crawl_with_all_params_success(mock_api_key): - """Test crawl with all parameters - should return 200""" - mock_crawl_id = str(uuid4()) - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/crawl", - json={ - "crawl_id": mock_crawl_id, - "status": "completed", - "result": {"pages": ["page1", "page2", "page3"], "data": "Full crawl data"} - }, - status=200 - ) - - data_schema = { - "type": "object", - "properties": { - "title": {"type": "string"}, - "description": {"type": "string"} - } - } - - with Client(api_key=mock_api_key) as client: - response = client.crawl( - url="https://example.com", - prompt="Extract all page data", - data_schema=data_schema, - cache_website=True, - depth=3, - max_pages=5, - same_domain_only=True, - batch_size=10 - ) - assert 
response["status"] == "completed" - - -@responses.activate -def test_get_crawl_success(mock_api_key, mock_uuid): - """Test get crawl status - should return 200""" - responses.add( - responses.GET, - f"https://api.scrapegraphai.com/v1/crawl/{mock_uuid}", - json={ - "crawl_id": mock_uuid, - "status": "completed", - "result": {"pages": ["page1", "page2"], "data": "Retrieved crawl data"} - }, - status=200 - ) - - with Client(api_key=mock_api_key) as client: - response = client.get_crawl(mock_uuid) - assert response["status"] == "completed" - assert response["crawl_id"] == mock_uuid - - -# ============================================================================ -# CREDITS TESTS -# ============================================================================ - -@responses.activate -def test_get_credits_success(mock_api_key): - """Test get credits - should return 200""" - responses.add( - responses.GET, - "https://api.scrapegraphai.com/v1/credits", - json={ - "credits": 1000, - "used_credits": 150, - "remaining_credits": 850 - }, - status=200 - ) - - with Client(api_key=mock_api_key) as client: - response = client.get_credits() - assert response["credits"] == 1000 - assert response["used_credits"] == 150 - assert response["remaining_credits"] == 850 - - -# ============================================================================ -# FEEDBACK TESTS -# ============================================================================ - -@responses.activate -def test_submit_feedback_success(mock_api_key): - """Test submit feedback - should return 200""" - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/feedback", - json={ - "status": "success", - "message": "Feedback submitted successfully" - }, - status=200 - ) - - with Client(api_key=mock_api_key) as client: - response = client.submit_feedback( - request_id=str(uuid4()), - rating=5, - feedback_text="Great service!" 
- ) - assert response["status"] == "success" - - -@responses.activate -def test_submit_feedback_without_text_success(mock_api_key): - """Test submit feedback without text - should return 200""" - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/feedback", - json={ - "status": "success", - "message": "Feedback submitted successfully" - }, - status=200 - ) - - with Client(api_key=mock_api_key) as client: - response = client.submit_feedback( - request_id=str(uuid4()), - rating=4 - ) - assert response["status"] == "success" - - -# ============================================================================ -# ASYNC CLIENT TESTS -# ============================================================================ - -@pytest.mark.asyncio -@responses.activate -async def test_async_smartscraper_success(mock_api_key): - """Test async smartscraper - should return 200""" - mock_request_id = str(uuid4()) - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "request_id": mock_request_id, - "status": "completed", - "result": {"async_data": "Async test"} - }, - status=200 - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_url="https://example.com", - user_prompt="Extract async data" - ) - assert response["status"] == "completed" - - -@pytest.mark.asyncio -@responses.activate -async def test_async_searchscraper_success(mock_api_key): - """Test async searchscraper - should return 200""" - mock_request_id = str(uuid4()) - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/searchscraper", - json={ - "request_id": mock_request_id, - "status": "completed", - "result": {"async_search": "Async search test"} - }, - status=200 - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.searchscraper( - user_prompt="Async search" - ) - assert response["status"] == "completed" - - -@pytest.mark.asyncio -@responses.activate -async def test_async_markdownify_success(mock_api_key): - """Test async markdownify - should return 200""" - mock_request_id = str(uuid4()) - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/markdownify", - json={ - "request_id": mock_request_id, - "status": "completed", - "result": {"markdown": "# Async Markdown\n\nAsync content."} - }, - status=200 - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.markdownify("https://example.com") - assert response["status"] == "completed" - - -@pytest.mark.asyncio -@responses.activate -async def test_async_crawl_success(mock_api_key): - """Test async crawl - should return 200""" - mock_crawl_id = str(uuid4()) - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/crawl", - json={ - "crawl_id": mock_crawl_id, - "status": "completed", - "result": {"async_pages": ["page1", "page2"], "data": "Async crawl data"} - }, - status=200 - ) - - data_schema = { - "type": "object", - "properties": { - "title": {"type": "string"}, - "content": {"type": "string"} - } - } - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.crawl( - url="https://example.com", - prompt="Extract async data", - data_schema=data_schema - ) - assert response["status"] == "completed" - - -@pytest.mark.asyncio -@responses.activate -async def test_async_get_credits_success(mock_api_key): - """Test async get credits - should return 200""" - responses.add( - responses.GET, - "https://api.scrapegraphai.com/v1/credits", - json={ - 
"credits": 2000, - "used_credits": 300, - "remaining_credits": 1700 - }, - status=200 - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_credits() - assert response["credits"] == 2000 - assert response["used_credits"] == 300 - assert response["remaining_credits"] == 1700 - - -@pytest.mark.asyncio -@responses.activate -async def test_async_submit_feedback_success(mock_api_key): - """Test async submit feedback - should return 200""" - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/feedback", - json={ - "status": "success", - "message": "Async feedback submitted successfully" - }, - status=200 - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.submit_feedback( - request_id=str(uuid4()), - rating=5, - feedback_text="Async great service!" - ) - assert response["status"] == "success" - - -# ============================================================================ -# CLIENT INITIALIZATION TESTS -# ============================================================================ - -def test_client_from_env_success(mock_api_key, monkeypatch): - """Test client initialization from environment - should work""" - monkeypatch.setenv("SGAI_API_KEY", mock_api_key) - - client = Client.from_env() - assert client.api_key == mock_api_key - - -def test_async_client_from_env_success(mock_api_key, monkeypatch): - """Test async client initialization from environment - should work""" - monkeypatch.setenv("SGAI_API_KEY", mock_api_key) - - client = AsyncClient.from_env() - assert client.api_key == mock_api_key - - -def test_client_context_manager_success(mock_api_key): - """Test client context manager - should work properly""" - with Client(api_key=mock_api_key) as client: - assert client.api_key == mock_api_key - # Session should be created - assert hasattr(client, 'session') - - -@pytest.mark.asyncio -async def test_async_client_context_manager_success(mock_api_key): - """Test async client context manager - should work properly""" - async with AsyncClient(api_key=mock_api_key) as client: - assert client.api_key == mock_api_key - # Session should be created - assert hasattr(client, 'session') - - -# ============================================================================ -# ERROR HANDLING TESTS (Still return 200 but test error scenarios) -# ============================================================================ - -@responses.activate -def test_invalid_api_key_handling(mock_api_key): - """Test handling of invalid API key - should handle gracefully""" - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "error": "Invalid API key", - "status": "error" - }, - status=200 # API returns 200 even for auth errors - ) - - with Client(api_key="invalid-key") as client: - response = client.smartscraper( - website_url="https://example.com", - user_prompt="Test" - ) - # Should still return 200 status from API - assert "status" in response - - -@responses.activate -def test_rate_limit_handling(mock_api_key): - """Test handling of rate limiting - should handle gracefully""" - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "error": "Rate limit exceeded", - "status": "error", - "retry_after": 60 - }, - status=200 # API returns 200 even for rate limit errors - ) - - with Client(api_key=mock_api_key) as client: - response = client.smartscraper( - website_url="https://example.com", - user_prompt="Test" - ) - # Should still return 200 
status from API - assert "status" in response - - -@responses.activate -def test_invalid_request_handling(mock_api_key): - """Test handling of invalid request - should handle gracefully""" - responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/smartscraper", - json={ - "error": "Invalid request parameters", - "status": "error" - }, - status=200 # API returns 200 even for validation errors - ) - - with Client(api_key=mock_api_key) as client: - response = client.smartscraper( - website_url="", # Invalid URL - user_prompt="" # Invalid prompt - ) - # Should still return 200 status from API - assert "status" in response \ No newline at end of file diff --git a/scrapegraph-py/tests/test_cookies_integration.py b/scrapegraph-py/tests/test_cookies_integration.py deleted file mode 100644 index 9cac46f..0000000 --- a/scrapegraph-py/tests/test_cookies_integration.py +++ /dev/null @@ -1,97 +0,0 @@ -""" -Test file to verify cookies integration functionality. -""" - -import json -from pydantic import BaseModel, Field - -from scrapegraph_py.models.smartscraper import SmartScraperRequest - - -class TestCookieInfo(BaseModel): - """Test model for cookie information.""" - - cookies: dict = Field(description="Dictionary of cookie key-value pairs") - - -def test_cookies_integration(): - """Test that cookies are properly integrated into SmartScraperRequest.""" - - print("🧪 Testing Cookies Integration") - print("=" * 50) - - # Test 1: Basic cookies - print("\n1. Testing basic cookies...") - cookies = {"session_id": "abc123", "auth_token": "xyz789"} - - request = SmartScraperRequest( - user_prompt="Extract cookie information", - website_url="https://httpbin.org/cookies", - cookies=cookies - ) - - data = request.model_dump() - print(f"✅ Cookies included in request: {data.get('cookies')}") - - # Test 2: Cookies with output schema - print("\n2. Testing cookies with output schema...") - - request_with_schema = SmartScraperRequest( - user_prompt="Extract cookie information", - website_url="https://httpbin.org/cookies", - cookies=cookies, - output_schema=TestCookieInfo - ) - - data_with_schema = request_with_schema.model_dump() - print(f"✅ Cookies with schema: {data_with_schema.get('cookies')}") - print(f"✅ Output schema included: {data_with_schema.get('output_schema') is not None}") - - # Test 3: Cookies with scrolling and pagination - print("\n3. Testing cookies with advanced features...") - - request_advanced = SmartScraperRequest( - user_prompt="Extract cookie information from multiple pages", - website_url="https://httpbin.org/cookies", - cookies=cookies, - number_of_scrolls=5, - total_pages=3, - output_schema=TestCookieInfo - ) - - data_advanced = request_advanced.model_dump() - print(f"✅ Advanced request cookies: {data_advanced.get('cookies')}") - print(f"✅ Number of scrolls: {data_advanced.get('number_of_scrolls')}") - print(f"✅ Total pages: {data_advanced.get('total_pages')}") - - # Test 4: Complex cookies scenario - print("\n4. Testing complex cookies scenario...") - - complex_cookies = { - "session_id": "abc123def456", - "user_id": "user789", - "auth_token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...", - "preferences": "dark_mode,usd", - "cart_id": "cart101112", - "csrf_token": "csrf_xyz789" - } - - request_complex = SmartScraperRequest( - user_prompt="Extract user profile and preferences", - website_url="https://example.com/dashboard", - cookies=complex_cookies, - headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}, - output_schema=TestCookieInfo - ) - - data_complex = request_complex.model_dump() - print(f"✅ Complex cookies count: {len(data_complex.get('cookies', {}))}") - print(f"✅ Headers included: {data_complex.get('headers') is not None}") - - print("\n" + "=" * 50) - print("✅ All cookies integration tests passed!") - print("=" * 50) - - -if __name__ == "__main__": - test_cookies_integration() \ No newline at end of file diff --git a/scrapegraph-py/tests/test_exceptions.py b/scrapegraph-py/tests/test_exceptions.py deleted file mode 100644 index 8d19815..0000000 --- a/scrapegraph-py/tests/test_exceptions.py +++ /dev/null @@ -1,15 +0,0 @@ -from scrapegraph_py.exceptions import APIError - - -def test_api_error(): - error = APIError("Test error", status_code=400) - assert str(error) == "[400] Test error" - assert error.status_code == 400 - assert error.message == "Test error" - - -def test_api_error_without_status(): - error = APIError("Test error") - assert str(error) == "[None] Test error" - assert error.status_code is None - assert error.message == "Test error" diff --git a/scrapegraph-py/tests/test_localscraper.py b/scrapegraph-py/tests/test_localscraper.py deleted file mode 100644 index ce54e5a..0000000 --- a/scrapegraph-py/tests/test_localscraper.py +++ /dev/null @@ -1,90 +0,0 @@ -import pytest -from pydantic import BaseModel -from scrapegraph_py.models.localscraper import LocalScraperRequest, GetLocalScraperRequest - -# Create a dummy output schema to test the conversion in model_dump. -class DummySchema(BaseModel): - test_field: str - -def test_output_schema_conversion(): - """ - Test that when an output_schema is provided in a LocalScraperRequest, - model_dump returns a dictionary where the output_schema key holds the JSON schema - of the provided Pydantic model. - """ - user_prompt = "Extract company details" - website_html = "
<html><body><div>Content</div></body></html>
" - # Create a LocalScraperRequest with a dummy output_schema. - request = LocalScraperRequest(user_prompt=user_prompt, website_html=website_html, output_schema=DummySchema) - dumped = request.model_dump() - # Verify that output_schema is converted properly in the dumped dictionary. - assert "output_schema" in dumped - assert dumped["output_schema"] == DummySchema.model_json_schema() - -def test_invalid_website_html_structure(): - """ - Test that LocalScraperRequest raises a ValueError when the website_html provided - has no parseable HTML tags. This ensures the HTML content validation catches - non-HTML input. - """ - # This string has no HTML tags so BeautifulSoup.find() should return None. - invalid_html = "Just some random text" - with pytest.raises(ValueError, match="Invalid HTML - no parseable content found"): - LocalScraperRequest(user_prompt="Extract info about the company", website_html=invalid_html) - -def test_invalid_user_prompt_non_alnum(): - """ - Test that LocalScraperRequest raises a ValueError when the user_prompt - does not contain any alphanumeric characters. - """ - with pytest.raises(ValueError, match="User prompt must contain a valid prompt"): - LocalScraperRequest( - user_prompt="!!!", - website_html="
<html><body><div>Valid Content</div></body></html>
" - ) - -def test_get_localscraper_request_invalid_uuid(): - """ - Test that GetLocalScraperRequest raises a ValueError when an invalid UUID is provided. - This ensures that the model correctly validates the request_id as a proper UUID. - """ - invalid_uuid = "not-a-valid-uuid" - with pytest.raises(ValueError, match="request_id must be a valid UUID"): - GetLocalScraperRequest(request_id=invalid_uuid) - -def test_website_html_exceeds_maximum_size(): - """ - Test that LocalScraperRequest raises a ValueError when the website_html content - exceeds the maximum allowed size of 2MB. The generated HTML is valid but too large. - """ - # Calculate the number of characters needed to exceed 2MB when encoded in UTF-8. - max_size_bytes = 2 * 1024 * 1024 - # Create a valid HTML string that exceeds 2MB. - base_html_prefix = "" - base_html_suffix = "" - repeated_char_length = max_size_bytes - len(base_html_prefix.encode("utf-8")) - len(base_html_suffix.encode("utf-8")) + 1 - oversized_content = "a" * repeated_char_length - oversized_html = f"{base_html_prefix}{oversized_content}{base_html_suffix}" - - with pytest.raises(ValueError, match="Website HTML content exceeds maximum size of 2MB"): - LocalScraperRequest(user_prompt="Extract info", website_html=oversized_html) - -def test_website_html_exactly_maximum_size(): - """ - Test that LocalScraperRequest accepts website_html content exactly 2MB in size. - This ensures that the size validation correctly allows content on the boundary. - """ - user_prompt = "Extract info with exact size HTML" - prefix = "" - suffix = "" - # Calculate the length of the content needed to exactly reach 2MB when combined with prefix and suffix. - max_size_bytes = 2 * 1024 * 1024 - content_length = max_size_bytes - len(prefix.encode("utf-8")) - len(suffix.encode("utf-8")) - valid_content = "a" * content_length - html = prefix + valid_content + suffix - - # Attempt to create a valid LocalScraperRequest. - request = LocalScraperRequest(user_prompt=user_prompt, website_html=html) - - # Verify that the HTML content is exactly 2MB in size when encoded in UTF-8. - assert len(request.website_html.encode("utf-8")) == max_size_bytes diff --git a/scrapegraph-py/tests/test_markdownify.py b/scrapegraph-py/tests/test_markdownify.py deleted file mode 100644 index 80c6e32..0000000 --- a/scrapegraph-py/tests/test_markdownify.py +++ /dev/null @@ -1,59 +0,0 @@ -import pytest -from scrapegraph_py.models.markdownify import MarkdownifyRequest, GetMarkdownifyRequest - -def test_markdownify_request_invalid_url_scheme(): - """ - Test that MarkdownifyRequest raises a ValueError when the website_url does not - start with either 'http://' or 'https://'. - """ - with pytest.raises(ValueError, match="Invalid URL"): - MarkdownifyRequest(website_url="ftp://example.com") - -def test_markdownify_request_empty_url(): - """ - Test that MarkdownifyRequest raises a ValueError when the website_url is empty or contains only whitespace. - """ - with pytest.raises(ValueError, match="Website URL cannot be empty"): - MarkdownifyRequest(website_url=" ") - -def test_markdownify_request_valid_url(): - """ - Test that MarkdownifyRequest properly creates an instance when provided with a valid URL. - This covers the scenario where the input URL meets all validation requirements. 
- """ - valid_url = "https://example.com" - req = MarkdownifyRequest(website_url=valid_url) - assert req.website_url == valid_url - -def test_markdownify_request_untrimmed_url(): - """ - Test that MarkdownifyRequest raises a ValueError when the website_url contains leading or trailing whitespace. - Although the stripped URL would be valid, the actual value is not processed further, causing the check - for the proper URL scheme to fail. - """ - # The URL has leading whitespace, so it does not start directly with "https://" - with pytest.raises(ValueError, match="Invalid URL"): - MarkdownifyRequest(website_url=" https://example.com") - -def test_get_markdownify_request_invalid_uuid(): - """ - Test that GetMarkdownifyRequest raises a ValueError when the request_id is not a valid UUID. - """ - with pytest.raises(ValueError, match="request_id must be a valid UUID"): - GetMarkdownifyRequest(request_id="invalid_uuid") - -def test_get_markdownify_request_valid_uuid(): - """ - Test that GetMarkdownifyRequest properly creates an instance when provided with a valid UUID. - """ - valid_uuid = "123e4567-e89b-12d3-a456-426614174000" - req = GetMarkdownifyRequest(request_id=valid_uuid) - assert req.request_id == valid_uuid - -def test_get_markdownify_request_untrimmed_uuid(): - """ - Test that GetMarkdownifyRequest raises a ValueError when the request_id - contains leading or trailing whitespace, despite the trimmed UUID being valid. - """ - with pytest.raises(ValueError, match="request_id must be a valid UUID"): - GetMarkdownifyRequest(request_id=" 123e4567-e89b-12d3-a456-426614174000 ") diff --git a/scrapegraph-py/tests/test_models.py b/scrapegraph-py/tests/test_models.py deleted file mode 100644 index a310c51..0000000 --- a/scrapegraph-py/tests/test_models.py +++ /dev/null @@ -1,379 +0,0 @@ -import pytest -from pydantic import BaseModel, ValidationError - -from scrapegraph_py.models.crawl import CrawlRequest, GetCrawlRequest -from scrapegraph_py.models.feedback import FeedbackRequest -from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest -from scrapegraph_py.models.searchscraper import ( - GetSearchScraperRequest, - SearchScraperRequest, -) -from scrapegraph_py.models.smartscraper import ( - GetSmartScraperRequest, - SmartScraperRequest, -) - - -def test_smartscraper_request_validation(): - class ExampleSchema(BaseModel): - name: str - age: int - - # Valid input with website_url - request = SmartScraperRequest( - website_url="https://example.com", user_prompt="Describe this page." - ) - assert request.website_url == "https://example.com" - assert request.user_prompt == "Describe this page." - assert request.website_html is None - assert request.headers is None - - # Valid input with website_html - request = SmartScraperRequest( - website_html="
<html><body><h1>Test content</h1></body></html>", - user_prompt="Extract info", - ) - assert request.website_url is None - assert request.website_html == "<html><body><h1>Test content</h1></body></html>
" - assert request.user_prompt == "Extract info" - assert request.headers is None - - # Valid input with headers - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - request = SmartScraperRequest( - website_url="https://example.com", - user_prompt="Describe this page.", - headers=headers, - ) - assert request.headers == headers - - # Test with output_schema - request = SmartScraperRequest( - website_url="https://example.com", - user_prompt="Describe this page.", - output_schema=ExampleSchema, - ) - - # When we dump the model, the output_schema should be converted to a dict - dumped = request.model_dump() - assert isinstance(dumped["output_schema"], dict) - assert "properties" in dumped["output_schema"] - assert "name" in dumped["output_schema"]["properties"] - assert "age" in dumped["output_schema"]["properties"] - - # Invalid URL - with pytest.raises(ValidationError): - SmartScraperRequest( - website_url="invalid-url", user_prompt="Describe this page." - ) - - # Empty prompt - with pytest.raises(ValidationError): - SmartScraperRequest(website_url="https://example.com", user_prompt="") - - # Invalid HTML - with pytest.raises(ValidationError): - SmartScraperRequest( - website_html="not valid html", - user_prompt="Extract info", - ) - - # HTML too large (>2MB) - large_html = "x" * (2 * 1024 * 1024 + 1) - with pytest.raises(ValidationError): - SmartScraperRequest( - website_html=large_html, - user_prompt="Extract info", - ) - - # Neither URL nor HTML provided - with pytest.raises(ValidationError): - SmartScraperRequest(user_prompt="Extract info") - - -def test_get_smartscraper_request_validation(): - # Valid UUID - request = GetSmartScraperRequest(request_id="123e4567-e89b-12d3-a456-426614174000") - assert request.request_id == "123e4567-e89b-12d3-a456-426614174000" - - # Invalid UUID - with pytest.raises(ValidationError): - GetSmartScraperRequest(request_id="invalid-uuid") - - -def test_feedback_request_validation(): - # Valid input - request = FeedbackRequest( - request_id="123e4567-e89b-12d3-a456-426614174000", - rating=5, - feedback_text="Great service!", - ) - assert request.request_id == "123e4567-e89b-12d3-a456-426614174000" - assert request.rating == 5 - assert request.feedback_text == "Great service!" - - # Invalid rating - with pytest.raises(ValidationError): - FeedbackRequest( - request_id="123e4567-e89b-12d3-a456-426614174000", - rating=6, - feedback_text="Great service!", - ) - - # Invalid UUID - with pytest.raises(ValidationError): - FeedbackRequest( - request_id="invalid-uuid", rating=5, feedback_text="Great service!" 
- ) - - -def test_markdownify_request_validation(): - # Valid input without headers - request = MarkdownifyRequest(website_url="https://example.com") - assert request.website_url == "https://example.com" - assert request.headers is None - - # Valid input with headers - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - request = MarkdownifyRequest(website_url="https://example.com", headers=headers) - assert request.website_url == "https://example.com" - assert request.headers == headers - - # Invalid URL - with pytest.raises(ValidationError): - MarkdownifyRequest(website_url="invalid-url") - - # Empty URL - with pytest.raises(ValidationError): - MarkdownifyRequest(website_url="") - - -def test_get_markdownify_request_validation(): - # Valid UUID - request = GetMarkdownifyRequest(request_id="123e4567-e89b-12d3-a456-426614174000") - assert request.request_id == "123e4567-e89b-12d3-a456-426614174000" - - # Invalid UUID - with pytest.raises(ValidationError): - GetMarkdownifyRequest(request_id="invalid-uuid") - - -def test_searchscraper_request_validation(): - class ExampleSchema(BaseModel): - name: str - age: int - - # Valid input without headers - request = SearchScraperRequest(user_prompt="What is the latest version of Python?") - assert request.user_prompt == "What is the latest version of Python?" - assert request.headers is None - assert request.output_schema is None - - # Valid input with headers - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - request = SearchScraperRequest( - user_prompt="What is the latest version of Python?", - headers=headers, - ) - assert request.headers == headers - - # Test with output_schema - request = SearchScraperRequest( - user_prompt="What is the latest version of Python?", - output_schema=ExampleSchema, - ) - - # When we dump the model, the output_schema should be converted to a dict - dumped = request.model_dump() - assert isinstance(dumped["output_schema"], dict) - assert "properties" in dumped["output_schema"] - assert "name" in dumped["output_schema"]["properties"] - assert "age" in dumped["output_schema"]["properties"] - - # Empty prompt - with pytest.raises(ValidationError): - SearchScraperRequest(user_prompt="") - - # Invalid prompt (no alphanumeric characters) - with pytest.raises(ValidationError): - SearchScraperRequest(user_prompt="!@#$%^") - - -def test_get_searchscraper_request_validation(): - # Valid UUID - request = GetSearchScraperRequest(request_id="123e4567-e89b-12d3-a456-426614174000") - assert request.request_id == "123e4567-e89b-12d3-a456-426614174000" - - # Invalid UUID - with pytest.raises(ValidationError): - GetSearchScraperRequest(request_id="invalid-uuid") - - -def test_crawl_request_validation(): - # Example schema - schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "Test Schema", - "type": "object", - "properties": { - "name": {"type": "string"}, - "age": {"type": "integer"}, - }, - "required": ["name"], - } - - # Valid input with all parameters - request = CrawlRequest( - url="https://example.com", - prompt="Extract company information", - data_schema=schema, - cache_website=True, - depth=2, - max_pages=5, - same_domain_only=True, - batch_size=1, - ) - assert request.url == "https://example.com" - assert request.prompt == "Extract company information" - assert request.data_schema == schema - assert request.cache_website is True - assert request.depth == 2 - assert request.max_pages == 5 - assert request.same_domain_only is True - assert 
request.batch_size == 1 - - # Valid input with minimal parameters - request = CrawlRequest( - url="https://example.com", - prompt="Extract company information", - data_schema=schema, - ) - assert request.url == "https://example.com" - assert request.prompt == "Extract company information" - assert request.data_schema == schema - assert request.cache_website is True # default - assert request.depth == 2 # default - assert request.max_pages == 2 # default - assert request.same_domain_only is True # default - assert request.batch_size == 1 # default - - # Invalid URL - with pytest.raises(ValidationError): - CrawlRequest( - url="invalid-url", - prompt="Extract company information", - data_schema=schema, - ) - - # Empty URL - with pytest.raises(ValidationError): - CrawlRequest( - url="", - prompt="Extract company information", - data_schema=schema, - ) - - # Empty prompt - with pytest.raises(ValidationError): - CrawlRequest( - url="https://example.com", - prompt="", - data_schema=schema, - ) - - # Invalid prompt (no alphanumeric characters) - with pytest.raises(ValidationError): - CrawlRequest( - url="https://example.com", - prompt="!@#$%^", - data_schema=schema, - ) - - # Empty schema - with pytest.raises(ValidationError): - CrawlRequest( - url="https://example.com", - prompt="Extract company information", - data_schema={}, - ) - - # Invalid schema (not a dict) - with pytest.raises(ValidationError): - CrawlRequest( - url="https://example.com", - prompt="Extract company information", - data_schema="not a dict", - ) - - # Invalid depth (too low) - with pytest.raises(ValidationError): - CrawlRequest( - url="https://example.com", - prompt="Extract company information", - data_schema=schema, - depth=0, - ) - - # Invalid depth (too high) - with pytest.raises(ValidationError): - CrawlRequest( - url="https://example.com", - prompt="Extract company information", - data_schema=schema, - depth=11, - ) - - # Invalid max_pages (too low) - with pytest.raises(ValidationError): - CrawlRequest( - url="https://example.com", - prompt="Extract company information", - data_schema=schema, - max_pages=0, - ) - - # Invalid max_pages (too high) - with pytest.raises(ValidationError): - CrawlRequest( - url="https://example.com", - prompt="Extract company information", - data_schema=schema, - max_pages=101, - ) - - # Invalid batch_size (too low) - with pytest.raises(ValidationError): - CrawlRequest( - url="https://example.com", - prompt="Extract company information", - data_schema=schema, - batch_size=0, - ) - - # Invalid batch_size (too high) - with pytest.raises(ValidationError): - CrawlRequest( - url="https://example.com", - prompt="Extract company information", - data_schema=schema, - batch_size=11, - ) - - -def test_get_crawl_request_validation(): - # Valid UUID - request = GetCrawlRequest(crawl_id="123e4567-e89b-12d3-a456-426614174000") - assert request.crawl_id == "123e4567-e89b-12d3-a456-426614174000" - - # Invalid UUID - with pytest.raises(ValidationError): - GetCrawlRequest(crawl_id="invalid-uuid") diff --git a/scrapegraph-py/tests/test_models_fix.py b/scrapegraph-py/tests/test_models_fix.py deleted file mode 100644 index 6c1eb52..0000000 --- a/scrapegraph-py/tests/test_models_fix.py +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/env python3 -""" -Test script to verify that the Pydantic warning is resolved and models work correctly. 
-""" - -import warnings -from scrapegraph_py.models.crawl import CrawlRequest -from scrapegraph_py.models.smartscraper import SmartScraperRequest -from scrapegraph_py.models.searchscraper import SearchScraperRequest -from scrapegraph_py.models.markdownify import MarkdownifyRequest -from scrapegraph_py.models.feedback import FeedbackRequest - -# Capture warnings -warnings.simplefilter("always") - -def test_crawl_request(): - """Test CrawlRequest model""" - print("Testing CrawlRequest...") - - schema = { - "type": "object", - "properties": { - "name": {"type": "string"}, - "description": {"type": "string"} - } - } - - request = CrawlRequest( - url="https://example.com", - prompt="Test prompt", - data_schema=schema - ) - - # Test model_dump - data = request.model_dump() - print(f"โœ… CrawlRequest model_dump works: {len(data)} fields") - assert "data_schema" in data - assert "schema" not in data # Old field should not be present - -def test_smartscraper_request(): - """Test SmartScraperRequest model""" - print("Testing SmartScraperRequest...") - - # Test without number_of_scrolls (should be None) - request = SmartScraperRequest( - user_prompt="Test prompt", - website_url="https://example.com" - ) - - # Test model_dump - number_of_scrolls should be excluded when None - data = request.model_dump() - print(f"โœ… SmartScraperRequest model_dump works: {len(data)} fields") - assert "number_of_scrolls" not in data # Should be excluded when None - - # Test with number_of_scrolls - request_with_scrolls = SmartScraperRequest( - user_prompt="Test prompt", - website_url="https://example.com", - number_of_scrolls=5 - ) - - data_with_scrolls = request_with_scrolls.model_dump() - assert "number_of_scrolls" in data_with_scrolls # Should be included when not None - assert data_with_scrolls["number_of_scrolls"] == 5 - -def test_searchscraper_request(): - """Test SearchScraperRequest model""" - print("Testing SearchScraperRequest...") - - request = SearchScraperRequest( - user_prompt="Test prompt" - ) - - data = request.model_dump() - print(f"โœ… SearchScraperRequest model_dump works: {len(data)} fields") - assert "headers" not in data # Should be excluded when None - -def test_markdownify_request(): - """Test MarkdownifyRequest model""" - print("Testing MarkdownifyRequest...") - - request = MarkdownifyRequest( - website_url="https://example.com" - ) - - data = request.model_dump() - print(f"โœ… MarkdownifyRequest model_dump works: {len(data)} fields") - assert "headers" not in data # Should be excluded when None - -def test_feedback_request(): - """Test FeedbackRequest model""" - print("Testing FeedbackRequest...") - - request = FeedbackRequest( - request_id="123e4567-e89b-12d3-a456-426614174000", - rating=5 - ) - - data = request.model_dump() - print(f"โœ… FeedbackRequest model_dump works: {len(data)} fields") - assert "feedback_text" not in data # Should be excluded when None - -if __name__ == "__main__": - print("๐Ÿงช Testing Pydantic model fixes...") - - test_crawl_request() - test_smartscraper_request() - test_searchscraper_request() - test_markdownify_request() - test_feedback_request() - - print("\nโœ… All tests passed! 
The Pydantic warning should be resolved.") - print("๐ŸŽ‰ Models now properly exclude None values from serialization.") \ No newline at end of file diff --git a/scrapegraph-py/tests/test_real_apis.py b/scrapegraph-py/tests/test_real_apis.py index 945479b..0559ed7 100644 --- a/scrapegraph-py/tests/test_real_apis.py +++ b/scrapegraph-py/tests/test_real_apis.py @@ -4,14 +4,14 @@ These tests use actual API calls with environment variables """ +import asyncio import os + import pytest -import asyncio -from uuid import uuid4 from pydantic import BaseModel -from scrapegraph_py.client import Client from scrapegraph_py.async_client import AsyncClient +from scrapegraph_py.client import Client class ProductSchema(BaseModel): @@ -251,121 +251,6 @@ def test_get_markdownify_status_real(): assert "request_id" in status_response -def test_crawl_basic_real(): - """Test basic crawl""" - if not os.getenv("SGAI_API_KEY"): - pytest.skip("SGAI_API_KEY not set") - - data_schema = { - "type": "object", - "properties": { - "title": {"type": "string"}, - "content": {"type": "string"} - } - } - - with Client.from_env() as client: - response = client.crawl( - url="https://example.com", - prompt="Extract page information", - data_schema=data_schema - ) - assert response["status"] in ["completed", "processing", "pending"] - assert "crawl_id" in response - - -def test_crawl_with_all_params_real(): - """Test crawl with all parameters""" - if not os.getenv("SGAI_API_KEY"): - pytest.skip("SGAI_API_KEY not set") - - data_schema = { - "type": "object", - "properties": { - "title": {"type": "string"}, - "description": {"type": "string"} - } - } - - with Client.from_env() as client: - response = client.crawl( - url="https://example.com", - prompt="Extract comprehensive page data", - data_schema=data_schema, - cache_website=True, - depth=2, - max_pages=3, - same_domain_only=True, - batch_size=5 - ) - assert response["status"] in ["completed", "processing", "pending"] - assert "crawl_id" in response - - -def test_get_crawl_status_real(): - """Test getting crawl status""" - if not os.getenv("SGAI_API_KEY"): - pytest.skip("SGAI_API_KEY not set") - - with Client.from_env() as client: - # First create a crawl request - data_schema = { - "type": "object", - "properties": { - "title": {"type": "string"} - } - } - - initial_response = client.crawl( - url="https://example.com", - prompt="Extract page titles", - data_schema=data_schema - ) - - crawl_id = initial_response["crawl_id"] - - # Then get the status - status_response = client.get_crawl(crawl_id) - assert "status" in status_response - assert "crawl_id" in status_response - - -def test_get_credits_real(): - """Test getting credits""" - if not os.getenv("SGAI_API_KEY"): - pytest.skip("SGAI_API_KEY not set") - - with Client.from_env() as client: - response = client.get_credits() - assert "credits" in response - assert "used_credits" in response - assert "remaining_credits" in response - - -def test_submit_feedback_real(): - """Test submitting feedback""" - if not os.getenv("SGAI_API_KEY"): - pytest.skip("SGAI_API_KEY not set") - - with Client.from_env() as client: - # First create a request to get a request_id - initial_response = client.smartscraper( - website_url="https://example.com", - user_prompt="Extract basic info" - ) - - request_id = initial_response["request_id"] - - # Submit feedback - feedback_response = client.submit_feedback( - request_id=request_id, - rating=5, - feedback_text="Great service! Very accurate results." 
- ) - assert "status" in feedback_response - - -def test_submit_feedback_without_text_real(): """Test submitting feedback without text""" if not os.getenv("SGAI_API_KEY"): pytest.skip("SGAI_API_KEY not set") @@ -448,88 +333,12 @@ async def test_async_markdownify_basic_real(): assert "request_id" in response -@pytest.mark.asyncio -async def test_async_crawl_basic_real(): - """Test basic async crawl""" - if not os.getenv("SGAI_API_KEY"): - pytest.skip("SGAI_API_KEY not set") - - data_schema = { - "type": "object", - "properties": { - "title": {"type": "string"}, - "content": {"type": "string"} - } - } - - async with AsyncClient.from_env() as client: - response = await client.crawl( - url="https://example.com", - prompt="Extract async page data", - data_schema=data_schema - ) - assert response["status"] in ["completed", "processing", "pending"] - assert "crawl_id" in response - - -@pytest.mark.asyncio -async def test_async_get_credits_real(): - """Test async get credits""" - if not os.getenv("SGAI_API_KEY"): - pytest.skip("SGAI_API_KEY not set") - - async with AsyncClient.from_env() as client: - response = await client.get_credits() - assert "credits" in response - assert "used_credits" in response - assert "remaining_credits" in response - - -@pytest.mark.asyncio -async def test_async_submit_feedback_real(): - """Test async submit feedback""" - if not os.getenv("SGAI_API_KEY"): - pytest.skip("SGAI_API_KEY not set") - - async with AsyncClient.from_env() as client: - # First create a request to get a request_id - initial_response = await client.smartscraper( - website_url="https://example.com", - user_prompt="Extract basic info for feedback" - ) - - request_id = initial_response["request_id"] - - # Submit feedback - feedback_response = await client.submit_feedback( - request_id=request_id, - rating=5, - feedback_text="Excellent async service!" 
- ) - assert "status" in feedback_response # ============================================================================ # CLIENT INITIALIZATION TESTS # ============================================================================ -def test_client_from_env_real(): - """Test client initialization from environment""" - if not os.getenv("SGAI_API_KEY"): - pytest.skip("SGAI_API_KEY not set") - - client = Client.from_env() - assert client.api_key == os.getenv("SGAI_API_KEY") - - -def test_async_client_from_env_real(): - """Test async client initialization from environment""" - if not os.getenv("SGAI_API_KEY"): - pytest.skip("SGAI_API_KEY not set") - - client = AsyncClient.from_env() - assert client.api_key == os.getenv("SGAI_API_KEY") - def test_client_context_manager_real(): """Test client context manager""" @@ -556,27 +365,6 @@ async def test_async_client_context_manager_real(): # ERROR HANDLING TESTS # ============================================================================ -def test_invalid_api_key_handling(): - """Test handling of invalid API key""" - # Temporarily set invalid API key - original_key = os.getenv("SGAI_API_KEY") - os.environ["SGAI_API_KEY"] = "invalid-key" - - try: - with Client.from_env() as client: - response = client.smartscraper( - website_url="https://example.com", - user_prompt="Test" - ) - # Should handle gracefully even with invalid key - assert "status" in response - finally: - # Restore original key - if original_key: - os.environ["SGAI_API_KEY"] = original_key - else: - del os.environ["SGAI_API_KEY"] - def test_missing_api_key_handling(): """Test handling of missing API key""" diff --git a/scrapegraph-py/tests/test_smartscraper.py b/scrapegraph-py/tests/test_smartscraper.py deleted file mode 100644 index 00eca9c..0000000 --- a/scrapegraph-py/tests/test_smartscraper.py +++ /dev/null @@ -1,175 +0,0 @@ -import pytest -from pydantic import BaseModel, ValidationError -from scrapegraph_py.models.smartscraper import SmartScraperRequest, GetSmartScraperRequest - -# Define a dummy schema to test the output_schema conversion in model_dump -class DummySchema(BaseModel): - """A dummy schema to simulate a Pydantic model with JSON schema conversion.""" - a: int = 1 - -def test_model_dump_with_output_schema_conversion(): - """ - Test that model_dump on SmartScraperRequest converts the provided output_schema into a JSON schema dict. - """ - # Create a request with a valid user prompt, website URL, and a dummy output_schema. - request = SmartScraperRequest( - user_prompt="Extract information about the company", - website_url="https://scrapegraphai.com/", - output_schema=DummySchema - ) - # Get the dump dict from the model. - output = request.model_dump() - # The model_dump should include the 'output_schema' converted to its JSON schema representation. - expected_schema = DummySchema.model_json_schema() - assert output.get("output_schema") == expected_schema - -def test_model_dump_without_output_schema(): - """ - Test that model_dump on SmartScraperRequest returns output_schema as None - when no output_schema is provided. This ensures that the conversion logic is only - applied when output_schema is not None. - """ - # Create a valid SmartScraperRequest without providing an output_schema. - request = SmartScraperRequest( - user_prompt="Extract some meaningful data", - website_url="https://scrapegraphai.com/" - ) - # Get the dumped dictionary from the model. - output = request.model_dump() - # Ensure that the output contains the key "output_schema" and its value is None. 
- assert "output_schema" in output, "Output schema key should be present even if None" - assert output["output_schema"] is None, "Output schema should be None when not provided" - -def test_invalid_get_smartscraper_request_id(): - """ - Test that GetSmartScraperRequest raises a ValueError when provided with an invalid UUID. - This test ensures that the request_id field is validated correctly. - """ - with pytest.raises(ValueError, match="request_id must be a valid UUID"): - GetSmartScraperRequest(request_id="invalid-uuid") - - -def test_smartscraper_request_with_pagination(): - """ - Test SmartScraperRequest with pagination parameter. - This test ensures that the total_pages field is properly handled. - """ - # Test with valid pagination - request = SmartScraperRequest( - user_prompt="Extract product information", - website_url="https://example.com/products", - total_pages=5 - ) - - assert request.total_pages == 5 - - # Test model_dump includes pagination - output = request.model_dump() - assert output["total_pages"] == 5 - - # Test without pagination (default behavior) - request_no_pagination = SmartScraperRequest( - user_prompt="Extract product information", - website_url="https://example.com/products" - ) - - assert request_no_pagination.total_pages is None - - # Test model_dump excludes None pagination - output_no_pagination = request_no_pagination.model_dump() - assert "total_pages" not in output_no_pagination - - -def test_smartscraper_request_pagination_validation(): - """ - Test pagination validation constraints. - This test ensures that total_pages is properly validated. - """ - # Test minimum value - request = SmartScraperRequest( - user_prompt="Extract products", - website_url="https://example.com/products", - total_pages=1 - ) - assert request.total_pages == 1 - - # Test maximum value - request = SmartScraperRequest( - user_prompt="Extract products", - website_url="https://example.com/products", - total_pages=10 - ) - assert request.total_pages == 10 - - # Test invalid values - with pytest.raises(ValidationError): - SmartScraperRequest( - user_prompt="Extract products", - website_url="https://example.com/products", - total_pages=0 - ) - - with pytest.raises(ValidationError): - SmartScraperRequest( - user_prompt="Extract products", - website_url="https://example.com/products", - total_pages=11 - ) - - -def test_smartscraper_request_pagination_with_all_features(): - """ - Test pagination combined with other SmartScraper features. - This test ensures pagination works with output_schema, scrolls, and headers. - """ - headers = {"User-Agent": "test-agent"} - - request = SmartScraperRequest( - user_prompt="Extract all product information", - website_url="https://example.com/products", - headers=headers, - output_schema=DummySchema, - number_of_scrolls=5, - total_pages=3 - ) - - assert request.total_pages == 3 - assert request.number_of_scrolls == 5 - assert request.headers == headers - assert request.output_schema == DummySchema - - # Test model_dump with all features - output = request.model_dump() - assert output["total_pages"] == 3 - assert output["number_of_scrolls"] == 5 - assert output["headers"] == headers - assert isinstance(output["output_schema"], dict) - -def test_invalid_url_in_smartscraper_request(): - """ - Test that SmartScraperRequest raises a ValueError when provided with a website_url - that does not start with 'http://' or 'https://'. This ensures the URL validation works. 
- """ - with pytest.raises(ValueError, match="Invalid URL"): - SmartScraperRequest( - user_prompt="Extract data", - website_url="ftp://invalid-url" - ) - -def test_invalid_user_prompt_empty_and_non_alnum(): - """ - Test that SmartScraperRequest raises a ValueError when the user_prompt is either empty (or only whitespace) - or when it contains no alphanumeric characters. This ensures the user prompt validator is working correctly. - """ - # Test with a user_prompt that is empty (only whitespace) - with pytest.raises(ValueError, match="User prompt cannot be empty"): - SmartScraperRequest( - user_prompt=" ", - website_url="https://scrapegraphai.com/" - ) - # Test with a user_prompt that contains no alphanumeric characters - with pytest.raises(ValueError, match="User prompt must contain a valid prompt"): - SmartScraperRequest( - user_prompt="!!!", - website_url="https://scrapegraphai.com/" - ) diff --git a/scrapegraph-py/tests/test_smartscraper_pagination.py b/scrapegraph-py/tests/test_smartscraper_pagination.py deleted file mode 100644 index 53ffd56..0000000 --- a/scrapegraph-py/tests/test_smartscraper_pagination.py +++ /dev/null @@ -1,302 +0,0 @@ -import pytest -from pydantic import BaseModel, ValidationError -from scrapegraph_py.models.smartscraper import SmartScraperRequest, GetSmartScraperRequest - - -class TestProductSchema(BaseModel): - """Test schema for pagination tests""" - name: str - price: str - rating: float = None - - -class TestSmartScraperPagination: - """Test suite for SmartScraper pagination functionality""" - - def test_smartscraper_request_with_pagination(self): - """Test SmartScraperRequest with valid pagination parameters""" - request = SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=5 - ) - - assert request.website_url == "https://example.com/products" - assert request.user_prompt == "Extract product information" - assert request.total_pages == 5 - assert request.number_of_scrolls is None - assert request.output_schema is None - - def test_smartscraper_request_with_pagination_and_schema(self): - """Test SmartScraperRequest with pagination and output schema""" - request = SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=3, - output_schema=TestProductSchema - ) - - assert request.total_pages == 3 - assert request.output_schema == TestProductSchema - - # Test model_dump with pagination and schema - dumped = request.model_dump() - assert dumped["total_pages"] == 3 - assert isinstance(dumped["output_schema"], dict) - assert "properties" in dumped["output_schema"] - - def test_smartscraper_request_with_pagination_and_scrolls(self): - """Test SmartScraperRequest with both pagination and scrolling""" - request = SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=2, - number_of_scrolls=10 - ) - - assert request.total_pages == 2 - assert request.number_of_scrolls == 10 - - # Test model_dump excludes None values - dumped = request.model_dump() - assert dumped["total_pages"] == 2 - assert dumped["number_of_scrolls"] == 10 - assert "website_html" not in dumped # Should be excluded since it's None - - def test_smartscraper_request_pagination_validation_minimum(self): - """Test pagination validation - minimum value""" - # Valid minimum value - request = SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract product 
information", - total_pages=1 - ) - assert request.total_pages == 1 - - # Invalid minimum value (less than 1) - with pytest.raises(ValidationError) as exc_info: - SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=0 - ) - assert "greater than or equal to 1" in str(exc_info.value) - - def test_smartscraper_request_pagination_validation_maximum(self): - """Test pagination validation - maximum value""" - # Valid maximum value - request = SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=10 - ) - assert request.total_pages == 10 - - # Invalid maximum value (greater than 10) - with pytest.raises(ValidationError) as exc_info: - SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=11 - ) - assert "less than or equal to 10" in str(exc_info.value) - - def test_smartscraper_request_pagination_none_value(self): - """Test SmartScraperRequest with None pagination (default behavior)""" - request = SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=None - ) - - assert request.total_pages is None - - # Test model_dump excludes None values - dumped = request.model_dump() - assert "total_pages" not in dumped - - def test_smartscraper_request_pagination_with_html(self): - """Test pagination with HTML content instead of URL""" - html_content = """ - - -
<html>
  <body>
    <div class="product">Product 1</div>
    <div class="product">Product 2</div>
  </body>
</html>
- - - """ - - request = SmartScraperRequest( - website_html=html_content, - user_prompt="Extract product information", - total_pages=2 - ) - - assert request.website_html == html_content - assert request.total_pages == 2 - assert request.website_url is None - - def test_smartscraper_request_pagination_with_headers(self): - """Test pagination with custom headers""" - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", - "Cookie": "session=abc123", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" - } - - request = SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract product information", - headers=headers, - total_pages=3 - ) - - assert request.headers == headers - assert request.total_pages == 3 - - # Test model_dump includes headers and pagination - dumped = request.model_dump() - assert dumped["headers"] == headers - assert dumped["total_pages"] == 3 - - def test_smartscraper_request_pagination_edge_cases(self): - """Test edge cases for pagination""" - # Test with negative value - with pytest.raises(ValidationError): - SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=-1 - ) - - # Test with float value (should be converted to int or rejected) - with pytest.raises(ValidationError): - SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=2.5 - ) - - # Test with string value - with pytest.raises(ValidationError): - SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages="5" - ) - - def test_smartscraper_request_pagination_model_dump_exclude_none(self): - """Test that model_dump properly excludes None values for pagination""" - # Request with pagination - request_with_pagination = SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=3 - ) - - dumped_with_pagination = request_with_pagination.model_dump() - assert "total_pages" in dumped_with_pagination - assert dumped_with_pagination["total_pages"] == 3 - - # Request without pagination - request_without_pagination = SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract product information" - ) - - dumped_without_pagination = request_without_pagination.model_dump() - assert "total_pages" not in dumped_without_pagination - - def test_smartscraper_request_pagination_with_all_parameters(self): - """Test SmartScraperRequest with all parameters including pagination""" - headers = {"User-Agent": "test-agent"} - - request = SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract all product information", - headers=headers, - output_schema=TestProductSchema, - number_of_scrolls=5, - total_pages=7 - ) - - assert request.website_url == "https://example.com/products" - assert request.user_prompt == "Extract all product information" - assert request.headers == headers - assert request.output_schema == TestProductSchema - assert request.number_of_scrolls == 5 - assert request.total_pages == 7 - - # Test model_dump with all parameters - dumped = request.model_dump() - assert dumped["website_url"] == "https://example.com/products" - assert dumped["user_prompt"] == "Extract all product information" - assert dumped["headers"] == headers - assert isinstance(dumped["output_schema"], dict) - assert 
dumped["number_of_scrolls"] == 5 - assert dumped["total_pages"] == 7 - - def test_smartscraper_request_pagination_validation_with_existing_validators(self): - """Test that pagination validation works alongside existing validators""" - # Test empty prompt with pagination - should fail on prompt validation - with pytest.raises(ValidationError) as exc_info: - SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="", - total_pages=5 - ) - assert "User prompt cannot be empty" in str(exc_info.value) - - # Test invalid URL with pagination - should fail on URL validation - with pytest.raises(ValidationError) as exc_info: - SmartScraperRequest( - website_url="invalid-url", - user_prompt="Extract products", - total_pages=3 - ) - assert "Invalid URL" in str(exc_info.value) - - # Test pagination with neither URL nor HTML - should fail on URL/HTML validation - with pytest.raises(ValidationError) as exc_info: - SmartScraperRequest( - user_prompt="Extract products", - total_pages=2 - ) - assert "Either website_url or website_html must be provided" in str(exc_info.value) - - def test_smartscraper_request_pagination_boundary_values(self): - """Test pagination boundary values""" - # Test boundary values - valid_values = [1, 2, 5, 9, 10] - - for value in valid_values: - request = SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract products", - total_pages=value - ) - assert request.total_pages == value - - # Test invalid boundary values - invalid_values = [0, -1, 11, 100] - - for value in invalid_values: - with pytest.raises(ValidationError): - SmartScraperRequest( - website_url="https://example.com/products", - user_prompt="Extract products", - total_pages=value - ) - - def test_get_smartscraper_request_unchanged(self): - """Test that GetSmartScraperRequest is not affected by pagination changes""" - # This should still work as before - request = GetSmartScraperRequest(request_id="123e4567-e89b-12d3-a456-426614174000") - assert request.request_id == "123e4567-e89b-12d3-a456-426614174000" - - # Invalid UUID should still raise error - with pytest.raises(ValidationError) as exc_info: - GetSmartScraperRequest(request_id="invalid-uuid") - assert "request_id must be a valid UUID" in str(exc_info.value) \ No newline at end of file diff --git a/scrapegraph-py/tests/utils.py b/scrapegraph-py/tests/utils.py deleted file mode 100644 index 194ceef..0000000 --- a/scrapegraph-py/tests/utils.py +++ /dev/null @@ -1,6 +0,0 @@ -from uuid import uuid4 - - -def generate_mock_api_key(): - """Generate a valid mock API key in the format 'sgai-{uuid}'""" - return f"sgai-{uuid4()}"
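
For reference, the validation behavior exercised by the deleted test files above (URL scheme checks, UUID checks on request_id, the 2MB cap on website_html, pagination bounds of 1 to 10, and None-excluding serialization) can be approximated with the minimal Pydantic v2 sketch below. The class names SketchScraperRequest and SketchGetRequest and the exact field set are illustrative assumptions, not the scrapegraph_py implementation; only the error messages are taken from the tests' pytest.raises matches.

# A minimal sketch, assuming pydantic v2. Class and field names are
# illustrative, not the scrapegraph_py implementation; the error messages
# mirror the pytest.raises matches in the deleted tests.
from typing import Optional
from uuid import UUID

from pydantic import BaseModel, Field, field_validator, model_validator

MAX_HTML_BYTES = 2 * 1024 * 1024  # 2MB cap from the size-boundary tests


class SketchScraperRequest(BaseModel):
    user_prompt: str
    website_url: Optional[str] = None
    website_html: Optional[str] = None
    # Pagination bounded to 1..10, as the boundary-value tests assert.
    total_pages: Optional[int] = Field(default=None, ge=1, le=10)

    @field_validator("user_prompt")
    @classmethod
    def check_prompt(cls, v: str) -> str:
        if not v.strip():
            raise ValueError("User prompt cannot be empty")
        if not any(ch.isalnum() for ch in v):
            raise ValueError("User prompt must contain a valid prompt")
        return v

    @field_validator("website_url")
    @classmethod
    def check_url(cls, v: Optional[str]) -> Optional[str]:
        # No trimming: " https://example.com" fails, matching the
        # untrimmed-URL test in test_markdownify.py.
        if v is not None and not v.startswith(("http://", "https://")):
            raise ValueError("Invalid URL")
        return v

    @field_validator("website_html")
    @classmethod
    def check_html(cls, v: Optional[str]) -> Optional[str]:
        if v is not None and len(v.encode("utf-8")) > MAX_HTML_BYTES:
            raise ValueError("Website HTML content exceeds maximum size of 2MB")
        return v

    @model_validator(mode="after")
    def check_source(self) -> "SketchScraperRequest":
        if self.website_url is None and self.website_html is None:
            raise ValueError("Either website_url or website_html must be provided")
        return self


class SketchGetRequest(BaseModel):
    request_id: str

    @field_validator("request_id")
    @classmethod
    def check_uuid(cls, v: str) -> str:
        # UUID() rejects untrimmed input such as a UUID wrapped in spaces,
        # which is what the untrimmed-UUID test relies on.
        try:
            UUID(v)
        except ValueError:
            raise ValueError("request_id must be a valid UUID")
        return v


# None-valued optionals can be dropped at serialization time, the behavior
# test_models_fix.py asserts via model_dump.
req = SketchScraperRequest(
    user_prompt="Extract products",
    website_url="https://example.com",
    total_pages=3,
)
assert "website_html" not in req.model_dump(exclude_none=True)

Keeping the size check in bytes rather than characters matters here: the deleted boundary tests measure the HTML after encoding it as UTF-8, so a characters-based check would pass content that the byte-based cap rejects.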