diff --git a/scrapegraph-js/README.md b/scrapegraph-js/README.md
index 2200fee..0b6fb8e 100644
--- a/scrapegraph-js/README.md
+++ b/scrapegraph-js/README.md
@@ -151,6 +151,105 @@ const prompt = 'What is the latest version of Python and what are its main featu
 })();
 ```
 
+### Crawl API
+
+Start a crawl job to extract structured data from a website and its linked pages using a custom schema.
+
+```javascript
+import { crawl, getCrawlRequest } from 'scrapegraph-js';
+import 'dotenv/config';
+
+const apiKey = process.env.SGAI_APIKEY;
+const url = 'https://scrapegraphai.com/';
+const prompt = 'What does the company do? and I need text content from their privacy and terms';
+
+const schema = {
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "ScrapeGraphAI Website Content",
+  "type": "object",
+  "properties": {
+    "company": {
+      "type": "object",
+      "properties": {
+        "name": { "type": "string" },
+        "description": { "type": "string" },
+        "features": { "type": "array", "items": { "type": "string" } },
+        "contact_email": { "type": "string", "format": "email" },
+        "social_links": {
+          "type": "object",
+          "properties": {
+            "github": { "type": "string", "format": "uri" },
+            "linkedin": { "type": "string", "format": "uri" },
+            "twitter": { "type": "string", "format": "uri" }
+          },
+          "additionalProperties": false
+        }
+      },
+      "required": ["name", "description"]
+    },
+    "services": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "service_name": { "type": "string" },
+          "description": { "type": "string" },
+          "features": { "type": "array", "items": { "type": "string" } }
+        },
+        "required": ["service_name", "description"]
+      }
+    },
+    "legal": {
+      "type": "object",
+      "properties": {
+        "privacy_policy": { "type": "string" },
+        "terms_of_service": { "type": "string" }
+      },
+      "required": ["privacy_policy", "terms_of_service"]
+    }
+  },
+  "required": ["company", "services", "legal"]
+};
+
+(async () => {
+  try {
+    // Start the crawl job
+    const crawlResponse = await crawl(apiKey, url, prompt, schema, {
+      cacheWebsite: true,
+      depth: 2,
+      maxPages: 2,
+      sameDomainOnly: true,
+      batchSize: 1,
+    });
+    console.log('Crawl job started. Response:', crawlResponse);
+
+    // If the crawl is asynchronous and returns an ID, poll for the result
+    const crawlId = crawlResponse.id || crawlResponse.task_id;
+    if (crawlId) {
+      for (let i = 0; i < 10; i++) {
+        await new Promise((resolve) => setTimeout(resolve, 5000));
+        const result = await getCrawlRequest(apiKey, crawlId);
+        if (result.status === 'success' && result.result) {
+          console.log('Crawl completed. Result:', result.result.llm_result);
+          break;
+        } else if (result.status === 'failed') {
+          console.log('Crawl failed. Result:', result);
+          break;
+        } else {
+          console.log(`Status: ${result.status}, waiting...`);
+        }
+      }
+    } else {
+      console.log('No crawl ID found in response. Synchronous result:', crawlResponse);
+    }
+  } catch (error) {
+    console.error('Error occurred:', error);
+  }
+})();
+```
+
+You can pass either a plain JSON schema or a [Zod](https://www.npmjs.com/package/zod) schema as the `schema` parameter. The Crawl API also supports options for crawl depth, maximum pages, same-domain restriction, and batch size.
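+
+For a more compact schema definition, the `schema` argument can also be a Zod schema; `crawl` detects it and converts it to JSON Schema internally before sending the request. Below is a minimal sketch covering only the `company` object from the schema above (it assumes `zod` is installed in your project):
+
+```javascript
+import { z } from 'zod';
+import { crawl } from 'scrapegraph-js';
+
+const apiKey = process.env.SGAI_APIKEY;
+
+// Simplified Zod schema: only the company details from the example above
+const companySchema = z.object({
+  company: z.object({
+    name: z.string(),
+    description: z.string(),
+    features: z.array(z.string()).optional(),
+    contact_email: z.string().email().optional(),
+  }),
+});
+
+(async () => {
+  // The Zod schema is converted to JSON Schema before being sent to the API
+  const crawlResponse = await crawl(
+    apiKey,
+    'https://scrapegraphai.com/',
+    'What does the company do?',
+    companySchema,
+    { depth: 2, maxPages: 2 }
+  );
+  console.log('Crawl job started. Response:', crawlResponse);
+})();
+```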
+ ### Scraping local HTML Extract structured data from local HTML content diff --git a/scrapegraph-js/examples/crawl_example.js b/scrapegraph-js/examples/crawl_example.js new file mode 100644 index 0000000..f3df8f7 --- /dev/null +++ b/scrapegraph-js/examples/crawl_example.js @@ -0,0 +1,105 @@ +import { crawl, getCrawlRequest } from '../index.js'; +import 'dotenv/config'; + +// Example .env file: +// SGAI_APIKEY=your_sgai_api_key + +const apiKey = process.env.SGAI_APIKEY; + +const schema = { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "ScrapeGraphAI Website Content", + "type": "object", + "properties": { + "company": { + "type": "object", + "properties": { + "name": { "type": "string" }, + "description": { "type": "string" }, + "features": { "type": "array", "items": { "type": "string" } }, + "contact_email": { "type": "string", "format": "email" }, + "social_links": { + "type": "object", + "properties": { + "github": { "type": "string", "format": "uri" }, + "linkedin": { "type": "string", "format": "uri" }, + "twitter": { "type": "string", "format": "uri" } + }, + "additionalProperties": false + } + }, + "required": ["name", "description"] + }, + "services": { + "type": "array", + "items": { + "type": "object", + "properties": { + "service_name": { "type": "string" }, + "description": { "type": "string" }, + "features": { "type": "array", "items": { "type": "string" } } + }, + "required": ["service_name", "description"] + } + }, + "legal": { + "type": "object", + "properties": { + "privacy_policy": { "type": "string" }, + "terms_of_service": { "type": "string" } + }, + "required": ["privacy_policy", "terms_of_service"] + } + }, + "required": ["company", "services", "legal"] +}; + +const url = 'https://scrapegraphai.com/'; +const prompt = 'What does the company do? and I need text content from there privacy and terms'; + +(async () => { + if (!apiKey) { + console.error('SGAI_APIKEY not found in environment. Please set it in your .env file.'); + process.exit(1); + } + + try { + // Start the crawl job + console.log(`\nStarting crawl for: ${url}`); + const crawlResponse = await crawl(apiKey, url, prompt, schema, { + cacheWebsite: true, + depth: 2, + maxPages: 2, + sameDomainOnly: true, + batchSize: 1, + }); + console.log('\nCrawl job started. Response:'); + console.log(JSON.stringify(crawlResponse, null, 2)); + + // If the crawl is asynchronous and returns an ID, fetch the result + const crawlId = crawlResponse.id || crawlResponse.task_id; + if (crawlId) { + console.log('\nPolling for crawl result...'); + for (let i = 0; i < 10; i++) { + await new Promise((resolve) => setTimeout(resolve, 5000)); + const result = await getCrawlRequest(apiKey, crawlId); + if (result.status === 'success' && result.result) { + console.log(`\nCrawl completed. Result:`); + console.log(JSON.stringify(result.result.llm_result, null, 2)); + break; + } else if (result.status === 'failed') { + console.log('\nCrawl failed. Result:'); + console.log(JSON.stringify(result, null, 2)); + break; + } else { + console.log(`Status: ${result.status}, waiting...`); + } + } + } else { + console.log('No crawl ID found in response. 
Synchronous result:'); + console.log(JSON.stringify(crawlResponse, null, 2)); + } + } catch (error) { + console.error('Error occurred:', error); + } +})(); \ No newline at end of file diff --git a/scrapegraph-js/index.js b/scrapegraph-js/index.js index 4fcbd4c..84c4d47 100644 --- a/scrapegraph-js/index.js +++ b/scrapegraph-js/index.js @@ -3,3 +3,4 @@ export { markdownify, getMarkdownifyRequest } from './src/markdownify.js'; export { searchScraper, getSearchScraperRequest } from './src/searchScraper.js'; export { getCredits } from './src/credits.js'; export { sendFeedback } from './src/feedback.js'; +export { crawl, getCrawlRequest } from './src/crawl.js'; diff --git a/scrapegraph-js/src/crawl.js b/scrapegraph-js/src/crawl.js new file mode 100644 index 0000000..aa0b920 --- /dev/null +++ b/scrapegraph-js/src/crawl.js @@ -0,0 +1,93 @@ +import axios from 'axios'; +import handleError from './utils/handleError.js'; +import { ZodType } from 'zod'; +import { zodToJsonSchema } from 'zod-to-json-schema'; + +/** + * Start a crawl job using the ScrapeGraphAI API. + * + * @param {string} apiKey - Your ScrapeGraph AI API key + * @param {string} url - The starting URL for the crawl + * @param {string} prompt - The prompt to guide the crawl and extraction + * @param {Object|ZodType} schema - JSON schema or Zod schema defining the structure of the extracted data + * @param {Object} [options] - Optional crawl parameters + * @param {boolean} [options.cacheWebsite=true] - Whether to cache the website content + * @param {number} [options.depth=2] - Maximum depth of the crawl (1-10) + * @param {number} [options.maxPages=2] - Maximum number of pages to crawl (1-100) + * @param {boolean} [options.sameDomainOnly=true] - Whether to only crawl pages from the same domain + * @param {number} [options.batchSize=1] - Batch size for processing pages (1-10) + * @returns {Promise} The crawl job response + * @throws {Error} Throws an error if the HTTP request fails + */ +export async function crawl( + apiKey, + url, + prompt, + schema, + options = {} +) { + const endpoint = 'https://api.scrapegraphai.com/v1/crawl'; + const headers = { + 'accept': 'application/json', + 'SGAI-APIKEY': apiKey, + 'Content-Type': 'application/json', + }; + + let schemaPayload; + if (schema instanceof ZodType) { + schemaPayload = zodToJsonSchema(schema); + } else if (typeof schema === 'object' && schema !== null) { + schemaPayload = schema; + } else { + throw new Error('The schema must be a Zod schema or a plain object'); + } + + const { + cacheWebsite = true, + depth = 2, + maxPages = 2, + sameDomainOnly = true, + batchSize = 1, + } = options; + + const payload = { + url, + prompt, + schema: schemaPayload, + cache_website: cacheWebsite, + depth, + max_pages: maxPages, + same_domain_only: sameDomainOnly, + batch_size: batchSize, + }; + + try { + const response = await axios.post(endpoint, payload, { headers }); + return response.data; + } catch (error) { + handleError(error); + } +} + +/** + * Get the result of a crawl job by ID. 
+ * + * @param {string} apiKey - Your ScrapeGraph AI API key + * @param {string} crawlId - The crawl job ID + * @returns {Promise} The crawl result + * @throws {Error} Throws an error if the HTTP request fails + */ +export async function getCrawlRequest(apiKey, crawlId) { + const endpoint = `https://api.scrapegraphai.com/v1/crawl/${crawlId}`; + const headers = { + 'accept': 'application/json', + 'SGAI-APIKEY': apiKey, + }; + + try { + const response = await axios.get(endpoint, { headers }); + return response.data; + } catch (error) { + handleError(error); + } +} \ No newline at end of file diff --git a/scrapegraph-py/examples/async/.env.example b/scrapegraph-py/examples/async/.env.example new file mode 100644 index 0000000..ec3b205 --- /dev/null +++ b/scrapegraph-py/examples/async/.env.example @@ -0,0 +1 @@ +SGAI_API_KEY="your_sgai_api_key" \ No newline at end of file diff --git a/scrapegraph-py/examples/async/async_crawl_example.py b/scrapegraph-py/examples/async/async_crawl_example.py new file mode 100644 index 0000000..e295873 --- /dev/null +++ b/scrapegraph-py/examples/async/async_crawl_example.py @@ -0,0 +1,142 @@ +""" +Example demonstrating how to use the ScrapeGraphAI /v1/crawl/ API endpoint with a custom schema using the async client. + +Requirements: +- Python 3.7+ +- scrapegraph-py +- A .env file with your SGAI_API_KEY + +Example .env file: +SGAI_API_KEY=your_api_key_here +""" + +import asyncio +import json +import os +import time +from typing import Dict, Any + +from dotenv import load_dotenv + +from scrapegraph_py import AsyncClient + +# Load environment variables from .env file +load_dotenv() + + +async def main(): + if not os.getenv("SGAI_API_KEY"): + print("Error: SGAI_API_KEY not found in .env file") + print("Please create a .env file with your API key:") + print("SGAI_API_KEY=your_api_key_here") + return + + # Example schema (from your curl command) + schema: Dict[str, Any] = { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "ScrapeGraphAI Website Content", + "type": "object", + "properties": { + "company": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "description": {"type": "string"}, + "features": {"type": "array", "items": {"type": "string"}}, + "contact_email": {"type": "string", "format": "email"}, + "social_links": { + "type": "object", + "properties": { + "github": {"type": "string", "format": "uri"}, + "linkedin": {"type": "string", "format": "uri"}, + "twitter": {"type": "string", "format": "uri"}, + }, + "additionalProperties": False, + }, + }, + "required": ["name", "description"], + }, + "services": { + "type": "array", + "items": { + "type": "object", + "properties": { + "service_name": {"type": "string"}, + "description": {"type": "string"}, + "features": {"type": "array", "items": {"type": "string"}}, + }, + "required": ["service_name", "description"], + }, + }, + "legal": { + "type": "object", + "properties": { + "privacy_policy": {"type": "string"}, + "terms_of_service": {"type": "string"}, + }, + "required": ["privacy_policy", "terms_of_service"], + }, + }, + "required": ["company", "services", "legal"], + } + + url = "https://scrapegraphai.com/" + prompt = ( + "What does the company do? 
and I need text content from there privacy and terms" + ) + + try: + # Initialize the async client + async with AsyncClient.from_env() as client: + # Start the crawl job + print(f"\nStarting crawl for: {url}") + start_time = time.time() + crawl_response = await client.crawl( + url=url, + prompt=prompt, + schema=schema, + cache_website=True, + depth=2, + max_pages=2, + same_domain_only=True, + batch_size=1, + ) + execution_time = time.time() - start_time + print(f"POST /v1/crawl/ execution time: {execution_time:.2f} seconds") + print("\nCrawl job started. Response:") + print(json.dumps(crawl_response, indent=2)) + + # If the crawl is asynchronous and returns an ID, fetch the result + crawl_id = crawl_response.get("id") or crawl_response.get("task_id") + start_time = time.time() + if crawl_id: + print("\nPolling for crawl result...") + for _ in range(10): + await asyncio.sleep(5) + result = await client.get_crawl(crawl_id) + if result.get("status") == "success" and result.get("result"): + execution_time = time.time() - start_time + print( + f"GET /v1/crawl/{crawl_id} execution time: {execution_time:.2f} seconds" + ) + print("\nCrawl completed. Result:") + print(json.dumps(result["result"]["llm_result"], indent=2)) + break + elif result.get("status") == "failed": + print("\nCrawl failed. Result:") + print(json.dumps(result, indent=2)) + break + else: + print(f"Status: {result.get('status')}, waiting...") + else: + print("Crawl did not complete in time.") + else: + print("No crawl ID found in response. Synchronous result:") + print(json.dumps(crawl_response, indent=2)) + + except Exception as e: + print(f"Error occurred: {str(e)}") + + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/scrapegraph-py/examples/miscellaneous/.env.example b/scrapegraph-py/examples/miscellaneous/.env.example new file mode 100644 index 0000000..ec3b205 --- /dev/null +++ b/scrapegraph-py/examples/miscellaneous/.env.example @@ -0,0 +1 @@ +SGAI_API_KEY="your_sgai_api_key" \ No newline at end of file diff --git a/scrapegraph-py/examples/miscellaneous/crawl_example.py b/scrapegraph-py/examples/miscellaneous/crawl_example.py new file mode 100644 index 0000000..6986047 --- /dev/null +++ b/scrapegraph-py/examples/miscellaneous/crawl_example.py @@ -0,0 +1,136 @@ +""" +Example demonstrating how to use the ScrapeGraphAI /v1/crawl/ API endpoint with a custom schema. 
+ +Requirements: +- Python 3.7+ +- scrapegraph-py +- A .env file with your SGAI_API_KEY + +Example .env file: +SGAI_API_KEY="your_sgai_api_key" +""" + +import json +import os +import time +from typing import Dict, Any + +from dotenv import load_dotenv +from scrapegraph_py import Client + +# Load environment variables from .env file +load_dotenv() + + +def main(): + if not os.getenv("SGAI_API_KEY"): + print("Error: SGAI_API_KEY not found in .env file") + print("Please create a .env file with your API key:") + print('SGAI_API_KEY="your_sgai_api_key"') + return + + schema: Dict[str, Any] = { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "ScrapeGraphAI Website Content", + "type": "object", + "properties": { + "company": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "description": {"type": "string"}, + "features": {"type": "array", "items": {"type": "string"}}, + "contact_email": {"type": "string", "format": "email"}, + "social_links": { + "type": "object", + "properties": { + "github": {"type": "string", "format": "uri"}, + "linkedin": {"type": "string", "format": "uri"}, + "twitter": {"type": "string", "format": "uri"}, + }, + "additionalProperties": False, + }, + }, + "required": ["name", "description"], + }, + "services": { + "type": "array", + "items": { + "type": "object", + "properties": { + "service_name": {"type": "string"}, + "description": {"type": "string"}, + "features": {"type": "array", "items": {"type": "string"}}, + }, + "required": ["service_name", "description"], + }, + }, + "legal": { + "type": "object", + "properties": { + "privacy_policy": {"type": "string"}, + "terms_of_service": {"type": "string"}, + }, + "required": ["privacy_policy", "terms_of_service"], + }, + }, + "required": ["company", "services", "legal"], + } + + url = "https://scrapegraphai.com/" + prompt = ( + "What does the company do? and I need text content from there privacy and terms" + ) + + try: + client = Client.from_env() + print(f"\nStarting crawl for: {url}") + start_time = time.time() + crawl_response = client.crawl( + url=url, + prompt=prompt, + schema=schema, + cache_website=True, + depth=2, + max_pages=2, + same_domain_only=True, + batch_size=1, + ) + execution_time = time.time() - start_time + print(f"POST /v1/crawl/ execution time: {execution_time:.2f} seconds") + print("\nCrawl job started. Response:") + print(json.dumps(crawl_response, indent=2)) + + crawl_id = crawl_response.get("id") or crawl_response.get("task_id") + start_time = time.time() + if crawl_id: + print("\nPolling for crawl result...") + for _ in range(10): + time.sleep(5) + result = client.get_crawl(crawl_id) + if result.get("status") == "success" and result.get("result"): + execution_time = time.time() - start_time + print( + f"GET /v1/crawl/{crawl_id} execution time: {execution_time:.2f} seconds" + ) + print("\nCrawl completed. Result:") + print(json.dumps(result["result"]["llm_result"], indent=2)) + break + elif result.get("status") == "failed": + print("\nCrawl failed. Result:") + print(json.dumps(result, indent=2)) + break + else: + print(f"Status: {result.get('status')}, waiting...") + else: + print("Crawl did not complete in time.") + else: + print("No crawl ID found in response. 
Synchronous result:") + print(json.dumps(crawl_response, indent=2)) + + except Exception as e: + print(f"Error occurred: {str(e)}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scrapegraph-py/examples/sync/.env.example b/scrapegraph-py/examples/sync/.env.example new file mode 100644 index 0000000..ec3b205 --- /dev/null +++ b/scrapegraph-py/examples/sync/.env.example @@ -0,0 +1 @@ +SGAI_API_KEY="your_sgai_api_key" \ No newline at end of file diff --git a/scrapegraph-py/examples/sync/crawl_example.py b/scrapegraph-py/examples/sync/crawl_example.py new file mode 100644 index 0000000..f1f3e82 --- /dev/null +++ b/scrapegraph-py/examples/sync/crawl_example.py @@ -0,0 +1,123 @@ +""" +Example demonstrating how to use the ScrapeGraphAI /v1/crawl/ API endpoint with a custom schema. + +Requirements: +- Python 3.7+ +- scrapegraph-py +- A .env file with your SGAI_API_KEY + +Example .env file: +SGAI_API_KEY=your_api_key_here +""" + +import json +import os +import time +from typing import Dict, Any, List, Optional + +from dotenv import load_dotenv + +from pydantic import BaseModel, EmailStr, HttpUrl +from scrapegraph_py import Client + +# Load environment variables from .env file +load_dotenv() + +# Pydantic models for schema +class SocialLinks(BaseModel): + github: Optional[HttpUrl] + linkedin: Optional[HttpUrl] + twitter: Optional[HttpUrl] + +class Company(BaseModel): + name: str + description: str + features: Optional[List[str]] = None + contact_email: Optional[EmailStr] = None + social_links: Optional[SocialLinks] = None + +class Service(BaseModel): + service_name: str + description: str + features: Optional[List[str]] = None + +class Legal(BaseModel): + privacy_policy: str + terms_of_service: str + +class WebsiteContent(BaseModel): + company: Company + services: List[Service] + legal: Legal + +def main(): + if not os.getenv("SGAI_API_KEY"): + print("Error: SGAI_API_KEY not found in .env file") + print("Please create a .env file with your API key:") + print("SGAI_API_KEY=your_api_key_here") + return + + # Example schema (from your curl command) + schema = WebsiteContent.schema() + + url = "https://scrapegraphai.com/" + prompt = ( + "What does the company do? and I need text content from there privacy and terms" + ) + + try: + # Initialize the client + client = Client.from_env() + + # Start the crawl job + print(f"\nStarting crawl for: {url}") + start_time = time.time() + crawl_response = client.crawl( + url=url, + prompt=prompt, + schema=schema, + cache_website=True, + depth=2, + max_pages=2, + same_domain_only=True, + batch_size=1, + ) + execution_time = time.time() - start_time + print(f"POST /v1/crawl/ execution time: {execution_time:.2f} seconds") + print("\nCrawl job started. Response:") + print(json.dumps(crawl_response, indent=2)) + + # If the crawl is asynchronous and returns an ID, fetch the result + crawl_id = crawl_response.get("id") or crawl_response.get("task_id") + start_time = time.time() + if crawl_id: + print("\nPolling for crawl result...") + for _ in range(10): + time.sleep(5) + result = client.get_crawl(crawl_id) + if result.get("status") == "success" and result.get("result"): + execution_time = time.time() - start_time + print( + f"GET /v1/crawl/{crawl_id} execution time: {execution_time:.2f} seconds" + ) + print("\nCrawl completed. Result:") + print(json.dumps(result["result"]["llm_result"], indent=2)) + break + elif result.get("status") == "failed": + print("\nCrawl failed. 
Result:") + print(json.dumps(result, indent=2)) + break + else: + print(f"Status: {result.get('status')}, waiting...") + else: + print("Crawl did not complete in time.") + else: + print("No crawl ID found in response. Synchronous result:") + print(json.dumps(crawl_response, indent=2)) + + except Exception as e: + print(f"Error occurred: {str(e)}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scrapegraph-py/scrapegraph_py/async_client.py b/scrapegraph-py/scrapegraph_py/async_client.py index aa54e6c..a852c18 100644 --- a/scrapegraph-py/scrapegraph_py/async_client.py +++ b/scrapegraph-py/scrapegraph_py/async_client.py @@ -1,5 +1,5 @@ import asyncio -from typing import Any, Optional +from typing import Any, Optional, Dict from aiohttp import ClientSession, ClientTimeout, TCPConnector from aiohttp.client_exceptions import ClientError @@ -8,6 +8,7 @@ from scrapegraph_py.config import API_BASE_URL, DEFAULT_HEADERS from scrapegraph_py.exceptions import APIError from scrapegraph_py.logger import sgai_logger as logger +from scrapegraph_py.models.crawl import CrawlRequest, GetCrawlRequest from scrapegraph_py.models.feedback import FeedbackRequest from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest from scrapegraph_py.models.searchscraper import ( @@ -289,6 +290,58 @@ async def get_searchscraper(self, request_id: str): logger.info(f"✨ Successfully retrieved result for request {request_id}") return result + async def crawl( + self, + url: str, + prompt: str, + schema: Dict[str, Any], + cache_website: bool = True, + depth: int = 2, + max_pages: int = 2, + same_domain_only: bool = True, + batch_size: int = 1, + ): + """Send a crawl request""" + logger.info("🔍 Starting crawl request") + logger.debug(f"🌐 URL: {url}") + logger.debug(f"📝 Prompt: {prompt}") + logger.debug(f"📊 Schema provided: {bool(schema)}") + logger.debug(f"💾 Cache website: {cache_website}") + logger.debug(f"🔍 Depth: {depth}") + logger.debug(f"📄 Max pages: {max_pages}") + logger.debug(f"🏠 Same domain only: {same_domain_only}") + logger.debug(f"📦 Batch size: {batch_size}") + + request = CrawlRequest( + url=url, + prompt=prompt, + schema=schema, + cache_website=cache_website, + depth=depth, + max_pages=max_pages, + same_domain_only=same_domain_only, + batch_size=batch_size, + ) + logger.debug("✅ Request validation passed") + + result = await self._make_request( + "POST", f"{API_BASE_URL}/crawl", json=request.model_dump() + ) + logger.info("✨ Crawl request completed successfully") + return result + + async def get_crawl(self, crawl_id: str): + """Get the result of a previous crawl request""" + logger.info(f"🔍 Fetching crawl result for request {crawl_id}") + + # Validate input using Pydantic model + GetCrawlRequest(crawl_id=crawl_id) + logger.debug("✅ Request ID validation passed") + + result = await self._make_request("GET", f"{API_BASE_URL}/crawl/{crawl_id}") + logger.info(f"✨ Successfully retrieved result for request {crawl_id}") + return result + async def close(self): """Close the session to free up resources""" logger.info("🔒 Closing AsyncClient session") diff --git a/scrapegraph-py/scrapegraph_py/client.py b/scrapegraph-py/scrapegraph_py/client.py index 7cb6c3b..d4a7108 100644 --- a/scrapegraph-py/scrapegraph_py/client.py +++ b/scrapegraph-py/scrapegraph_py/client.py @@ -1,5 +1,5 @@ # Client implementation goes here -from typing import Any, Optional +from typing import Any, Optional, Dict import requests import urllib3 @@ -9,6 +9,7 @@ from scrapegraph_py.config 
import API_BASE_URL, DEFAULT_HEADERS from scrapegraph_py.exceptions import APIError from scrapegraph_py.logger import sgai_logger as logger +from scrapegraph_py.models.crawl import CrawlRequest, GetCrawlRequest from scrapegraph_py.models.feedback import FeedbackRequest from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest from scrapegraph_py.models.searchscraper import ( @@ -292,6 +293,58 @@ def get_searchscraper(self, request_id: str): logger.info(f"✨ Successfully retrieved result for request {request_id}") return result + def crawl( + self, + url: str, + prompt: str, + schema: Dict[str, Any], + cache_website: bool = True, + depth: int = 2, + max_pages: int = 2, + same_domain_only: bool = True, + batch_size: int = 1, + ): + """Send a crawl request""" + logger.info("🔍 Starting crawl request") + logger.debug(f"🌐 URL: {url}") + logger.debug(f"📝 Prompt: {prompt}") + logger.debug(f"📊 Schema provided: {bool(schema)}") + logger.debug(f"💾 Cache website: {cache_website}") + logger.debug(f"🔍 Depth: {depth}") + logger.debug(f"📄 Max pages: {max_pages}") + logger.debug(f"🏠 Same domain only: {same_domain_only}") + logger.debug(f"📦 Batch size: {batch_size}") + + request = CrawlRequest( + url=url, + prompt=prompt, + schema=schema, + cache_website=cache_website, + depth=depth, + max_pages=max_pages, + same_domain_only=same_domain_only, + batch_size=batch_size, + ) + logger.debug("✅ Request validation passed") + + result = self._make_request( + "POST", f"{API_BASE_URL}/crawl", json=request.model_dump() + ) + logger.info("✨ Crawl request completed successfully") + return result + + def get_crawl(self, crawl_id: str): + """Get the result of a previous crawl request""" + logger.info(f"🔍 Fetching crawl result for request {crawl_id}") + + # Validate input using Pydantic model + GetCrawlRequest(crawl_id=crawl_id) + logger.debug("✅ Request ID validation passed") + + result = self._make_request("GET", f"{API_BASE_URL}/crawl/{crawl_id}") + logger.info(f"✨ Successfully retrieved result for request {crawl_id}") + return result + def close(self): """Close the session to free up resources""" logger.info("🔒 Closing Client session") diff --git a/scrapegraph-py/scrapegraph_py/models/__init__.py b/scrapegraph-py/scrapegraph_py/models/__init__.py index e69de29..e9655b1 100644 --- a/scrapegraph-py/scrapegraph_py/models/__init__.py +++ b/scrapegraph-py/scrapegraph_py/models/__init__.py @@ -0,0 +1,17 @@ +from .crawl import CrawlRequest, GetCrawlRequest +from .feedback import FeedbackRequest +from .markdownify import GetMarkdownifyRequest, MarkdownifyRequest +from .searchscraper import GetSearchScraperRequest, SearchScraperRequest +from .smartscraper import GetSmartScraperRequest, SmartScraperRequest + +__all__ = [ + "CrawlRequest", + "GetCrawlRequest", + "FeedbackRequest", + "GetMarkdownifyRequest", + "MarkdownifyRequest", + "GetSearchScraperRequest", + "SearchScraperRequest", + "GetSmartScraperRequest", + "SmartScraperRequest", +] diff --git a/scrapegraph-py/scrapegraph_py/models/crawl.py b/scrapegraph-py/scrapegraph_py/models/crawl.py new file mode 100644 index 0000000..640ab85 --- /dev/null +++ b/scrapegraph-py/scrapegraph_py/models/crawl.py @@ -0,0 +1,85 @@ +# Models for crawl endpoint + +from typing import Optional, Dict, Any +from uuid import UUID + +from pydantic import BaseModel, Field, model_validator, conint + + +class CrawlRequest(BaseModel): + url: str = Field( + ..., + example="https://scrapegraphai.com/", + description="The starting URL for the crawl" + ) + prompt: str = Field( + 
..., + example="What does the company do? and I need text content from there privacy and terms", + description="The prompt to guide the crawl and extraction" + ) + schema: Dict[str, Any] = Field( + ..., + description="JSON schema defining the structure of the extracted data" + ) + cache_website: bool = Field( + default=True, + description="Whether to cache the website content" + ) + depth: conint(ge=1, le=10) = Field( + default=2, + description="Maximum depth of the crawl (1-10)" + ) + max_pages: conint(ge=1, le=100) = Field( + default=2, + description="Maximum number of pages to crawl (1-100)" + ) + same_domain_only: bool = Field( + default=True, + description="Whether to only crawl pages from the same domain" + ) + batch_size: conint(ge=1, le=10) = Field( + default=1, + description="Batch size for processing pages (1-10)" + ) + + @model_validator(mode="after") + def validate_url(self) -> "CrawlRequest": + if not self.url.strip(): + raise ValueError("URL cannot be empty") + if not ( + self.url.startswith("http://") + or self.url.startswith("https://") + ): + raise ValueError("Invalid URL - must start with http:// or https://") + return self + + @model_validator(mode="after") + def validate_prompt(self) -> "CrawlRequest": + if not self.prompt.strip(): + raise ValueError("Prompt cannot be empty") + if not any(c.isalnum() for c in self.prompt): + raise ValueError("Prompt must contain valid content") + return self + + @model_validator(mode="after") + def validate_schema(self) -> "CrawlRequest": + if not isinstance(self.schema, dict): + raise ValueError("Schema must be a dictionary") + if not self.schema: + raise ValueError("Schema cannot be empty") + return self + + +class GetCrawlRequest(BaseModel): + """Request model for get_crawl endpoint""" + + crawl_id: str = Field(..., example="123e4567-e89b-12d3-a456-426614174000") + + @model_validator(mode="after") + def validate_crawl_id(self) -> "GetCrawlRequest": + try: + # Validate the crawl_id is a valid UUID + UUID(self.crawl_id) + except ValueError: + raise ValueError("crawl_id must be a valid UUID") + return self \ No newline at end of file diff --git a/scrapegraph-py/tests/test_async_client.py b/scrapegraph-py/tests/test_async_client.py index 69c067e..bdd040c 100644 --- a/scrapegraph-py/tests/test_async_client.py +++ b/scrapegraph-py/tests/test_async_client.py @@ -284,3 +284,110 @@ async def test_get_searchscraper(mock_api_key, mock_uuid): assert "answer" in response["result"] assert "reference_urls" in response assert isinstance(response["reference_urls"], list) + + +@pytest.mark.asyncio +async def test_crawl(mock_api_key): + with aioresponses() as mocked: + mocked.post( + "https://api.scrapegraphai.com/v1/crawl", + payload={ + "id": str(uuid4()), + "status": "processing", + "message": "Crawl job started", + }, + ) + + schema = { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Test Schema", + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "integer"}, + }, + "required": ["name"], + } + + async with AsyncClient(api_key=mock_api_key) as client: + response = await client.crawl( + url="https://example.com", + prompt="Extract company information", + schema=schema, + cache_website=True, + depth=2, + max_pages=5, + same_domain_only=True, + batch_size=1, + ) + assert response["status"] == "processing" + assert "id" in response + + +@pytest.mark.asyncio +async def test_crawl_with_minimal_params(mock_api_key): + with aioresponses() as mocked: + mocked.post( + 
"https://api.scrapegraphai.com/v1/crawl", + payload={ + "id": str(uuid4()), + "status": "processing", + "message": "Crawl job started", + }, + ) + + schema = { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Test Schema", + "type": "object", + "properties": { + "name": {"type": "string"}, + }, + "required": ["name"], + } + + async with AsyncClient(api_key=mock_api_key) as client: + response = await client.crawl( + url="https://example.com", + prompt="Extract company information", + schema=schema, + ) + assert response["status"] == "processing" + assert "id" in response + + +@pytest.mark.asyncio +async def test_get_crawl(mock_api_key, mock_uuid): + with aioresponses() as mocked: + mocked.get( + f"https://api.scrapegraphai.com/v1/crawl/{mock_uuid}", + payload={ + "id": mock_uuid, + "status": "completed", + "result": { + "llm_result": { + "company": { + "name": "Example Corp", + "description": "A technology company", + }, + "services": [ + { + "service_name": "Web Development", + "description": "Custom web solutions", + } + ], + "legal": { + "privacy_policy": "Privacy policy content", + "terms_of_service": "Terms of service content", + }, + } + }, + }, + ) + + async with AsyncClient(api_key=mock_api_key) as client: + response = await client.get_crawl(mock_uuid) + assert response["status"] == "completed" + assert response["id"] == mock_uuid + assert "result" in response + assert "llm_result" in response["result"] diff --git a/scrapegraph-py/tests/test_client.py b/scrapegraph-py/tests/test_client.py index 11ef12f..3009077 100644 --- a/scrapegraph-py/tests/test_client.py +++ b/scrapegraph-py/tests/test_client.py @@ -282,3 +282,112 @@ def test_get_searchscraper(mock_api_key, mock_uuid): assert "answer" in response["result"] assert "reference_urls" in response assert isinstance(response["reference_urls"], list) + + +@responses.activate +def test_crawl(mock_api_key): + # Mock the API response + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/crawl", + json={ + "id": str(uuid4()), + "status": "processing", + "message": "Crawl job started", + }, + ) + + schema = { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Test Schema", + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "integer"}, + }, + "required": ["name"], + } + + with Client(api_key=mock_api_key) as client: + response = client.crawl( + url="https://example.com", + prompt="Extract company information", + schema=schema, + cache_website=True, + depth=2, + max_pages=5, + same_domain_only=True, + batch_size=1, + ) + assert response["status"] == "processing" + assert "id" in response + + +@responses.activate +def test_crawl_with_minimal_params(mock_api_key): + # Mock the API response + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/crawl", + json={ + "id": str(uuid4()), + "status": "processing", + "message": "Crawl job started", + }, + ) + + schema = { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Test Schema", + "type": "object", + "properties": { + "name": {"type": "string"}, + }, + "required": ["name"], + } + + with Client(api_key=mock_api_key) as client: + response = client.crawl( + url="https://example.com", + prompt="Extract company information", + schema=schema, + ) + assert response["status"] == "processing" + assert "id" in response + + +@responses.activate +def test_get_crawl(mock_api_key, mock_uuid): + responses.add( + responses.GET, + f"https://api.scrapegraphai.com/v1/crawl/{mock_uuid}", + json={ 
+ "id": mock_uuid, + "status": "completed", + "result": { + "llm_result": { + "company": { + "name": "Example Corp", + "description": "A technology company", + }, + "services": [ + { + "service_name": "Web Development", + "description": "Custom web solutions", + } + ], + "legal": { + "privacy_policy": "Privacy policy content", + "terms_of_service": "Terms of service content", + }, + } + }, + }, + ) + + with Client(api_key=mock_api_key) as client: + response = client.get_crawl(mock_uuid) + assert response["status"] == "completed" + assert response["id"] == mock_uuid + assert "result" in response + assert "llm_result" in response["result"] diff --git a/scrapegraph-py/tests/test_models.py b/scrapegraph-py/tests/test_models.py index 50c788f..493b2c6 100644 --- a/scrapegraph-py/tests/test_models.py +++ b/scrapegraph-py/tests/test_models.py @@ -1,6 +1,7 @@ import pytest from pydantic import BaseModel, ValidationError +from scrapegraph_py.models.crawl import CrawlRequest, GetCrawlRequest from scrapegraph_py.models.feedback import FeedbackRequest from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest from scrapegraph_py.models.searchscraper import ( @@ -215,3 +216,164 @@ def test_get_searchscraper_request_validation(): # Invalid UUID with pytest.raises(ValidationError): GetSearchScraperRequest(request_id="invalid-uuid") + + +def test_crawl_request_validation(): + # Example schema + schema = { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Test Schema", + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "integer"}, + }, + "required": ["name"], + } + + # Valid input with all parameters + request = CrawlRequest( + url="https://example.com", + prompt="Extract company information", + schema=schema, + cache_website=True, + depth=2, + max_pages=5, + same_domain_only=True, + batch_size=1, + ) + assert request.url == "https://example.com" + assert request.prompt == "Extract company information" + assert request.schema == schema + assert request.cache_website is True + assert request.depth == 2 + assert request.max_pages == 5 + assert request.same_domain_only is True + assert request.batch_size == 1 + + # Valid input with minimal parameters + request = CrawlRequest( + url="https://example.com", + prompt="Extract company information", + schema=schema, + ) + assert request.url == "https://example.com" + assert request.prompt == "Extract company information" + assert request.schema == schema + assert request.cache_website is True # default + assert request.depth == 2 # default + assert request.max_pages == 2 # default + assert request.same_domain_only is True # default + assert request.batch_size == 1 # default + + # Invalid URL + with pytest.raises(ValidationError): + CrawlRequest( + url="invalid-url", + prompt="Extract company information", + schema=schema, + ) + + # Empty URL + with pytest.raises(ValidationError): + CrawlRequest( + url="", + prompt="Extract company information", + schema=schema, + ) + + # Empty prompt + with pytest.raises(ValidationError): + CrawlRequest( + url="https://example.com", + prompt="", + schema=schema, + ) + + # Invalid prompt (no alphanumeric characters) + with pytest.raises(ValidationError): + CrawlRequest( + url="https://example.com", + prompt="!@#$%^", + schema=schema, + ) + + # Empty schema + with pytest.raises(ValidationError): + CrawlRequest( + url="https://example.com", + prompt="Extract company information", + schema={}, + ) + + # Invalid schema (not a dict) + with 
pytest.raises(ValidationError): + CrawlRequest( + url="https://example.com", + prompt="Extract company information", + schema="not a dict", + ) + + # Invalid depth (too low) + with pytest.raises(ValidationError): + CrawlRequest( + url="https://example.com", + prompt="Extract company information", + schema=schema, + depth=0, + ) + + # Invalid depth (too high) + with pytest.raises(ValidationError): + CrawlRequest( + url="https://example.com", + prompt="Extract company information", + schema=schema, + depth=11, + ) + + # Invalid max_pages (too low) + with pytest.raises(ValidationError): + CrawlRequest( + url="https://example.com", + prompt="Extract company information", + schema=schema, + max_pages=0, + ) + + # Invalid max_pages (too high) + with pytest.raises(ValidationError): + CrawlRequest( + url="https://example.com", + prompt="Extract company information", + schema=schema, + max_pages=101, + ) + + # Invalid batch_size (too low) + with pytest.raises(ValidationError): + CrawlRequest( + url="https://example.com", + prompt="Extract company information", + schema=schema, + batch_size=0, + ) + + # Invalid batch_size (too high) + with pytest.raises(ValidationError): + CrawlRequest( + url="https://example.com", + prompt="Extract company information", + schema=schema, + batch_size=11, + ) + + +def test_get_crawl_request_validation(): + # Valid UUID + request = GetCrawlRequest(crawl_id="123e4567-e89b-12d3-a456-426614174000") + assert request.crawl_id == "123e4567-e89b-12d3-a456-426614174000" + + # Invalid UUID + with pytest.raises(ValidationError): + GetCrawlRequest(crawl_id="invalid-uuid")