
Commit 75774a9

Merge pull request #70 from ScrapeGraphAI/include-and-exclude-in-crawl
feat: add include and exclude
2 parents f5fec15 + 8841333 commit 75774a9

File tree

9 files changed: +1423 -11 lines changed

Lines changed: 205 additions & 0 deletions
@@ -0,0 +1,205 @@
/**
 * Example of using the crawl endpoint with path filtering.
 *
 * This example demonstrates how to use includePaths and excludePaths
 * to control which pages are crawled on a website.
 */

import { crawl, getCrawlRequest } from 'scrapegraph-js';
import dotenv from 'dotenv';

dotenv.config();

const SGAI_API_KEY = process.env.SGAI_APIKEY || process.env.SGAI_API_KEY;

// Define your output schema
const productSchema = {
  type: 'object',
  properties: {
    products: {
      type: 'array',
      items: {
        type: 'object',
        properties: {
          name: { type: 'string', description: 'Product name' },
          price: { type: 'string', description: 'Product price' },
          description: { type: 'string', description: 'Product description' },
          category: { type: 'string', description: 'Product category' }
        },
        required: ['name', 'price']
      }
    },
    total_products: {
      type: 'number',
      description: 'Total number of products found'
    }
  },
  required: ['products', 'total_products']
};

// Helper function to wait for crawl completion
async function waitForCrawl(taskId, maxAttempts = 60, delay = 5000) {
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    await new Promise(resolve => setTimeout(resolve, delay));

    const status = await getCrawlRequest(SGAI_API_KEY, taskId);
    const state = status.state || 'UNKNOWN';

    console.log(`Attempt ${attempt + 1}: Status = ${state}`);

    if (state === 'SUCCESS') {
      console.log('\n✨ Crawl completed successfully!');
      return status.result;
    } else if (state === 'FAILURE' || state === 'REVOKED') {
      console.log(`\n❌ Crawl failed with status: ${state}`);
      throw new Error(`Crawl failed: ${state}`);
    }
  }

  throw new Error('Timeout: Crawl took too long');
}

async function example1() {
  console.log('\n📝 Example 1: Crawl only /products/* pages');
  console.log('-'.repeat(50));

  const result = await crawl(
    SGAI_API_KEY,
    'https://example.com',
    'Extract product information including name, price, and description',
    productSchema,
    {
      depth: 2,
      maxPages: 10,
      includePaths: ['/products/*', '/items/*'], // Only crawl product pages
      excludePaths: ['/products/archived/*'] // But skip archived products
    }
  );

  console.log(`Task ID: ${result.task_id}`);
  console.log('\n✅ Crawl job started successfully!');

  return result.task_id;
}

async function example2() {
  console.log('\n📝 Example 2: Crawl all pages except admin and API');
  console.log('-'.repeat(50));

  const result = await crawl(
    SGAI_API_KEY,
    'https://example.com',
    'Extract all relevant information from the website',
    productSchema,
    {
      depth: 2,
      maxPages: 20,
      excludePaths: [
        '/admin/*', // Skip all admin pages
        '/api/*', // Skip all API endpoints
        '/private/*', // Skip private pages
        '/*.json' // Skip JSON files
      ]
    }
  );

  console.log(`Task ID: ${result.task_id}`);
  console.log('\n✅ Crawl job started successfully!');

  return result.task_id;
}

async function example3() {
  console.log('\n📝 Example 3: Complex path filtering with wildcards');
  console.log('-'.repeat(50));

  const blogSchema = {
    type: 'object',
    properties: {
      posts: {
        type: 'array',
        items: {
          type: 'object',
          properties: {
            title: { type: 'string' },
            author: { type: 'string' },
            date: { type: 'string' },
            content: { type: 'string' }
          }
        }
      }
    }
  };

  const result = await crawl(
    SGAI_API_KEY,
    'https://example.com',
    'Extract blog posts with title, author, date, and content',
    blogSchema,
    {
      depth: 3,
      maxPages: 15,
      sitemap: true, // Use sitemap for better coverage
      includePaths: [
        '/blog/**', // Include all blog pages (any depth)
        '/articles/*', // Include top-level articles
        '/news/2024/*' // Include 2024 news only
      ],
      excludePaths: [
        '/blog/draft/*', // Skip draft blog posts
        '/blog/*/comments' // Skip comment pages
      ]
    }
  );

  console.log(`Task ID: ${result.task_id}`);
  console.log('\n✅ Crawl job started successfully!');

  return result.task_id;
}

async function main() {
  try {
    console.log('🔍 Starting crawl with path filtering...');
    console.log('='.repeat(50));

    // Run example 1
    const taskId1 = await example1();

    // Run example 2
    const taskId2 = await example2();

    // Run example 3 and wait for completion
    const taskId3 = await example3();

    // Optionally wait for one of the crawls to complete
    console.log(`\n⏳ Waiting for example 3 to complete (task: ${taskId3})...`);
    const result = await waitForCrawl(taskId3);

    console.log('\n📊 Crawl Results:');
    console.log(JSON.stringify(result, null, 2));

    // Print guide
    console.log('\n' + '='.repeat(50));
    console.log('📚 Path Filtering Guide:');
    console.log('='.repeat(50));
    console.log('• Use \'/*\' to match a single path segment');
    console.log('  Example: \'/products/*\' matches \'/products/item1\' but not \'/products/cat/item1\'');
    console.log('\n• Use \'/**\' to match any number of path segments');
    console.log('  Example: \'/blog/**\' matches \'/blog/2024/post\' and \'/blog/category/2024/post\'');
    console.log('\n• excludePaths takes precedence over includePaths');
    console.log('  You can include a broad pattern and exclude specific subsets');
    console.log('\n• Paths must start with \'/\'');
    console.log('  Example: \'/products/*\' is valid, \'products/*\' is not');
    console.log('\n💡 Tips:');
    console.log('• Combine with sitemap: true for better page discovery');
    console.log('• Use includePaths to focus on content-rich sections');
    console.log('• Use excludePaths to skip duplicate or irrelevant content');

  } catch (error) {
    console.error('❌ Error:', error.message);
    process.exit(1);
  }
}

main();
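
The guide printed at the end of this example states the pattern rules: '/*' matches a single path segment, '/**' matches any number of segments, and excludePaths takes precedence over includePaths. As a rough illustration of those rules only, here is a minimal local sketch of such a matcher. The actual filtering happens server-side in the API; matchesPattern and isPathAllowed are hypothetical helper names, not part of scrapegraph-js, and the sketch assumes an empty includePaths list means "no restriction".

// Illustrative sketch only -- not part of the scrapegraph-js API.
function matchesPattern(path, pattern) {
  // Escape regex metacharacters except '*', then map '**' -> '.*' and '*' -> '[^/]*'
  const regex = new RegExp(
    '^' +
      pattern
        .split('**')
        .map(part => part.replace(/[.+?^${}()|[\]\\]/g, '\\$&').replace(/\*/g, '[^/]*'))
        .join('.*') +
      '$'
  );
  return regex.test(path);
}

function isPathAllowed(path, includePaths = [], excludePaths = []) {
  // excludePaths takes precedence over includePaths
  if (excludePaths.some(p => matchesPattern(path, p))) return false;
  // Assumption: no includePaths means everything is allowed
  if (includePaths.length === 0) return true;
  return includePaths.some(p => matchesPattern(path, p));
}

console.log(isPathAllowed('/products/item1', ['/products/*'], ['/products/archived/*'])); // true
console.log(isPathAllowed('/products/cat/item1', ['/products/*'], []));                   // false: '*' is one segment
console.log(isPathAllowed('/blog/2024/post', ['/blog/**'], []));                          // true: '**' spans segments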

scrapegraph-js/src/crawl.js

Lines changed: 11 additions & 1 deletion
@@ -23,6 +23,8 @@ import { getMockResponse } from './utils/mockResponse.js';
  * @param {boolean} [options.mock] - Override mock mode for this request
  * @param {boolean} [options.renderHeavyJs=false] - Whether to render heavy JavaScript on the page
  * @param {boolean} [options.stealth=false] - Enable stealth mode to avoid bot detection
+ * @param {Array<string>} [options.includePaths] - List of path patterns to include (e.g., ['/products/*', '/blog/**']). Supports wildcards: * matches any characters, ** matches any path segments
+ * @param {Array<string>} [options.excludePaths] - List of path patterns to exclude (e.g., ['/admin/*', '/api/*']). Supports wildcards and takes precedence over includePaths
  * @returns {Promise<Object>} The crawl job response
  * @throws {Error} Throws an error if the HTTP request fails
  */
@@ -33,7 +35,7 @@ export async function crawl(
   schema,
   options = {}
 ) {
-  const { mock = null, renderHeavyJs = false, stealth = false } = options;
+  const { mock = null, renderHeavyJs = false, stealth = false, includePaths = null, excludePaths = null } = options;
 
   // Check if mock mode is enabled
   const useMock = mock !== null ? mock : isMockEnabled();
@@ -88,6 +90,14 @@ export async function crawl(
     payload.stealth = stealth;
   }
 
+  if (includePaths) {
+    payload.include_paths = includePaths;
+  }
+
+  if (excludePaths) {
+    payload.exclude_paths = excludePaths;
+  }
+
   try {
     const response = await axios.post(endpoint, payload, { headers });
     return response.data;
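
A minimal usage sketch of the updated options, assuming the call signature shown in the JavaScript example above (apiKey, url, prompt, schema, options); the URL, prompt, and schema below are placeholders. As the diff shows, the camelCase options are forwarded to the API as the snake_case payload fields include_paths and exclude_paths.

// Sketch only; assumes an ES module context (top-level await) and a real API key in SGAI_API_KEY.
import { crawl } from 'scrapegraph-js';

const apiKey = process.env.SGAI_API_KEY;

const response = await crawl(
  apiKey,
  'https://example.com',
  'Extract the page titles',
  { type: 'object', properties: { titles: { type: 'array', items: { type: 'string' } } } },
  {
    depth: 2,
    maxPages: 10,
    includePaths: ['/docs/**'],       // sent to the API as include_paths
    excludePaths: ['/docs/archive/*'] // sent to the API as exclude_paths
  }
);

console.log(response.task_id);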
Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
"""
Example of using the async crawl endpoint with path filtering.

This example demonstrates how to use include_paths and exclude_paths
to control which pages are crawled on a website (async version).
"""
import asyncio
import os
from scrapegraph_py import AsyncClient
from pydantic import BaseModel, Field


# Define your output schema
class ProductInfo(BaseModel):
    name: str = Field(description="Product name")
    price: str = Field(description="Product price")
    category: str = Field(description="Product category")


class CrawlResult(BaseModel):
    products: list[ProductInfo] = Field(description="List of products found")
    categories: list[str] = Field(description="List of product categories")


async def main():
    # Initialize the async client
    sgai_api_key = os.getenv("SGAI_API_KEY")

    async with AsyncClient(api_key=sgai_api_key) as client:
        print("🔍 Starting async crawl with path filtering...")
        print("=" * 50)

        # Example: Crawl only product pages, excluding certain sections
        print("\n📝 Crawling e-commerce site with smart path filtering")
        print("-" * 50)

        result = await client.crawl(
            url="https://example-shop.com",
            prompt="Extract all products with their names, prices, and categories",
            data_schema=CrawlResult.model_json_schema(),
            extraction_mode=True,
            depth=3,
            max_pages=50,
            sitemap=True,  # Use sitemap for better coverage
            include_paths=[
                "/products/**",   # Include all product pages
                "/categories/*",  # Include category listings
                "/collections/*"  # Include collection pages
            ],
            exclude_paths=[
                "/products/out-of-stock/*",  # Skip out-of-stock items
                "/products/*/reviews",       # Skip review pages
                "/admin/**",                 # Skip admin pages
                "/api/**",                   # Skip API endpoints
                "/*.pdf"                     # Skip PDF files
            ]
        )

        print(f"Task ID: {result.get('task_id')}")
        print("\n✅ Async crawl job started successfully!")

        # You can then poll for results using get_crawl
        task_id = result.get('task_id')
        if task_id:
            print(f"\n⏳ Polling for results (task: {task_id})...")

            # Poll every 5 seconds until complete
            max_attempts = 60  # 5 minutes max
            for attempt in range(max_attempts):
                await asyncio.sleep(5)
                status = await client.get_crawl(task_id)

                state = status.get('state', 'UNKNOWN')
                print(f"Attempt {attempt + 1}: Status = {state}")

                if state == 'SUCCESS':
                    print("\n✨ Crawl completed successfully!")
                    result_data = status.get('result', {})
                    print(f"Found {len(result_data.get('products', []))} products")
                    break
                elif state in ['FAILURE', 'REVOKED']:
                    print(f"\n❌ Crawl failed with status: {state}")
                    break
            else:
                print("\n⏰ Timeout: Crawl took too long")

        print("\n" + "=" * 50)
        print("💡 Tips for effective path filtering:")
        print("=" * 50)
        print("• Combine with sitemap=True for better page discovery")
        print("• Use include_paths to focus on content-rich sections")
        print("• Use exclude_paths to skip pages with duplicate content")
        print("• Test your patterns on a small max_pages first")
        print("• Remember: exclude_paths overrides include_paths")


if __name__ == "__main__":
    asyncio.run(main())

0 commit comments
