
Commit 75774a9

Merge pull request #70 from ScrapeGraphAI/include-and-exclude-in-crawl
feat: add include and exclude
2 parents f5fec15 + 8841333 commit 75774a9

File tree

9 files changed: +1423 -11 lines changed

Lines changed: 205 additions & 0 deletions
@@ -0,0 +1,205 @@
/**
 * Example of using the crawl endpoint with path filtering.
 *
 * This example demonstrates how to use includePaths and excludePaths
 * to control which pages are crawled on a website.
 */

import { crawl, getCrawlRequest } from 'scrapegraph-js';
import dotenv from 'dotenv';

dotenv.config();

const SGAI_API_KEY = process.env.SGAI_APIKEY || process.env.SGAI_API_KEY;

// Define your output schema
const productSchema = {
  type: 'object',
  properties: {
    products: {
      type: 'array',
      items: {
        type: 'object',
        properties: {
          name: { type: 'string', description: 'Product name' },
          price: { type: 'string', description: 'Product price' },
          description: { type: 'string', description: 'Product description' },
          category: { type: 'string', description: 'Product category' }
        },
        required: ['name', 'price']
      }
    },
    total_products: {
      type: 'number',
      description: 'Total number of products found'
    }
  },
  required: ['products', 'total_products']
};

// Helper function to wait for crawl completion
async function waitForCrawl(taskId, maxAttempts = 60, delay = 5000) {
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    await new Promise(resolve => setTimeout(resolve, delay));

    const status = await getCrawlRequest(SGAI_API_KEY, taskId);
    const state = status.state || 'UNKNOWN';

    console.log(`Attempt ${attempt + 1}: Status = ${state}`);

    if (state === 'SUCCESS') {
      console.log('\n✨ Crawl completed successfully!');
      return status.result;
    } else if (state === 'FAILURE' || state === 'REVOKED') {
      console.log(`\n❌ Crawl failed with status: ${state}`);
      throw new Error(`Crawl failed: ${state}`);
    }
  }

  throw new Error('Timeout: Crawl took too long');
}

async function example1() {
  console.log('\n📝 Example 1: Crawl only /products/* pages');
  console.log('-'.repeat(50));

  const result = await crawl(
    SGAI_API_KEY,
    'https://example.com',
    'Extract product information including name, price, and description',
    productSchema,
    {
      depth: 2,
      maxPages: 10,
      includePaths: ['/products/*', '/items/*'], // Only crawl product pages
      excludePaths: ['/products/archived/*'] // But skip archived products
    }
  );

  console.log(`Task ID: ${result.task_id}`);
  console.log('\n✅ Crawl job started successfully!');

  return result.task_id;
}

async function example2() {
  console.log('\n📝 Example 2: Crawl all pages except admin and API');
  console.log('-'.repeat(50));

  const result = await crawl(
    SGAI_API_KEY,
    'https://example.com',
    'Extract all relevant information from the website',
    productSchema,
    {
      depth: 2,
      maxPages: 20,
      excludePaths: [
        '/admin/*', // Skip all admin pages
        '/api/*', // Skip all API endpoints
        '/private/*', // Skip private pages
        '/*.json' // Skip JSON files
      ]
    }
  );

  console.log(`Task ID: ${result.task_id}`);
  console.log('\n✅ Crawl job started successfully!');

  return result.task_id;
}

async function example3() {
  console.log('\n📝 Example 3: Complex path filtering with wildcards');
  console.log('-'.repeat(50));

  const blogSchema = {
    type: 'object',
    properties: {
      posts: {
        type: 'array',
        items: {
          type: 'object',
          properties: {
            title: { type: 'string' },
            author: { type: 'string' },
            date: { type: 'string' },
            content: { type: 'string' }
          }
        }
      }
    }
  };

  const result = await crawl(
    SGAI_API_KEY,
    'https://example.com',
    'Extract blog posts with title, author, date, and content',
    blogSchema,
    {
      depth: 3,
      maxPages: 15,
      sitemap: true, // Use sitemap for better coverage
      includePaths: [
        '/blog/**', // Include all blog pages (any depth)
        '/articles/*', // Include top-level articles
        '/news/2024/*' // Include 2024 news only
      ],
      excludePaths: [
        '/blog/draft/*', // Skip draft blog posts
        '/blog/*/comments' // Skip comment pages
      ]
    }
  );

  console.log(`Task ID: ${result.task_id}`);
  console.log('\n✅ Crawl job started successfully!');

  return result.task_id;
}

async function main() {
  try {
    console.log('🔍 Starting crawl with path filtering...');
    console.log('='.repeat(50));

    // Run example 1
    const taskId1 = await example1();

    // Run example 2
    const taskId2 = await example2();

    // Run example 3 and wait for completion
    const taskId3 = await example3();

    // Optionally wait for one of the crawls to complete
    console.log(`\n⏳ Waiting for example 3 to complete (task: ${taskId3})...`);
    const result = await waitForCrawl(taskId3);

    console.log('\n📊 Crawl Results:');
    console.log(JSON.stringify(result, null, 2));

    // Print guide
    console.log('\n' + '='.repeat(50));
    console.log('📚 Path Filtering Guide:');
    console.log('='.repeat(50));
    console.log('• Use \'/*\' to match a single path segment');
    console.log('  Example: \'/products/*\' matches \'/products/item1\' but not \'/products/cat/item1\'');
    console.log('\n• Use \'/**\' to match any number of path segments');
    console.log('  Example: \'/blog/**\' matches \'/blog/2024/post\' and \'/blog/category/2024/post\'');
    console.log('\n• excludePaths takes precedence over includePaths');
    console.log('  You can include a broad pattern and exclude specific subsets');
    console.log('\n• Paths must start with \'/\'');
    console.log('  Example: \'/products/*\' is valid, \'products/*\' is not');
    console.log('\n💡 Tips:');
    console.log('• Combine with sitemap: true for better page discovery');
    console.log('• Use includePaths to focus on content-rich sections');
    console.log('• Use excludePaths to skip duplicate or irrelevant content');

  } catch (error) {
    console.error('❌ Error:', error.message);
    process.exit(1);
  }
}

main();
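
The guide printed at the end of this example states the pattern rules: '/*' matches a single path segment, '/**' matches any number of segments, and excludePaths takes precedence over includePaths. As a rough illustration of those rules only, here is a minimal local sketch of such a matcher. The actual filtering happens server-side in the API; matchesPattern and isPathAllowed are hypothetical helper names, not part of scrapegraph-js, and the sketch assumes an empty includePaths list means "no restriction".

// Illustrative sketch only -- not part of the scrapegraph-js API.
function matchesPattern(path, pattern) {
  // Escape regex metacharacters except '*', then map '**' -> '.*' and '*' -> '[^/]*'
  const regex = new RegExp(
    '^' +
      pattern
        .split('**')
        .map(part => part.replace(/[.+?^${}()|[\]\\]/g, '\\$&').replace(/\*/g, '[^/]*'))
        .join('.*') +
      '$'
  );
  return regex.test(path);
}

function isPathAllowed(path, includePaths = [], excludePaths = []) {
  // excludePaths takes precedence over includePaths
  if (excludePaths.some(p => matchesPattern(path, p))) return false;
  // Assumption: no includePaths means everything is allowed
  if (includePaths.length === 0) return true;
  return includePaths.some(p => matchesPattern(path, p));
}

console.log(isPathAllowed('/products/item1', ['/products/*'], ['/products/archived/*'])); // true
console.log(isPathAllowed('/products/cat/item1', ['/products/*'], []));                   // false: '*' is one segment
console.log(isPathAllowed('/blog/2024/post', ['/blog/**'], []));                          // true: '**' spans segments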

scrapegraph-js/src/crawl.js

Lines changed: 11 additions & 1 deletion
@@ -23,6 +23,8 @@ import { getMockResponse } from './utils/mockResponse.js';
  * @param {boolean} [options.mock] - Override mock mode for this request
  * @param {boolean} [options.renderHeavyJs=false] - Whether to render heavy JavaScript on the page
  * @param {boolean} [options.stealth=false] - Enable stealth mode to avoid bot detection
+ * @param {Array<string>} [options.includePaths] - List of path patterns to include (e.g., ['/products/*', '/blog/**']). Supports wildcards: * matches any characters, ** matches any path segments
+ * @param {Array<string>} [options.excludePaths] - List of path patterns to exclude (e.g., ['/admin/*', '/api/*']). Supports wildcards and takes precedence over includePaths
  * @returns {Promise<Object>} The crawl job response
  * @throws {Error} Throws an error if the HTTP request fails
  */
@@ -33,7 +35,7 @@ export async function crawl(
   schema,
   options = {}
 ) {
-  const { mock = null, renderHeavyJs = false, stealth = false } = options;
+  const { mock = null, renderHeavyJs = false, stealth = false, includePaths = null, excludePaths = null } = options;
 
   // Check if mock mode is enabled
   const useMock = mock !== null ? mock : isMockEnabled();
@@ -88,6 +90,14 @@ export async function crawl(
     payload.stealth = stealth;
   }
 
+  if (includePaths) {
+    payload.include_paths = includePaths;
+  }
+
+  if (excludePaths) {
+    payload.exclude_paths = excludePaths;
+  }
+
   try {
     const response = await axios.post(endpoint, payload, { headers });
     return response.data;
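
A minimal usage sketch of the updated options, assuming the call signature shown in the JavaScript example above (apiKey, url, prompt, schema, options); the URL, prompt, and schema below are placeholders. As the diff shows, the camelCase options are forwarded to the API as the snake_case payload fields include_paths and exclude_paths.

// Sketch only; assumes an ES module context (top-level await) and a real API key in SGAI_API_KEY.
import { crawl } from 'scrapegraph-js';

const apiKey = process.env.SGAI_API_KEY;

const response = await crawl(
  apiKey,
  'https://example.com',
  'Extract the page titles',
  { type: 'object', properties: { titles: { type: 'array', items: { type: 'string' } } } },
  {
    depth: 2,
    maxPages: 10,
    includePaths: ['/docs/**'],       // sent to the API as include_paths
    excludePaths: ['/docs/archive/*'] // sent to the API as exclude_paths
  }
);

console.log(response.task_id);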
Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
"""
Example of using the async crawl endpoint with path filtering.

This example demonstrates how to use include_paths and exclude_paths
to control which pages are crawled on a website (async version).
"""
import asyncio
import os
from scrapegraph_py import AsyncClient
from pydantic import BaseModel, Field


# Define your output schema
class ProductInfo(BaseModel):
    name: str = Field(description="Product name")
    price: str = Field(description="Product price")
    category: str = Field(description="Product category")


class CrawlResult(BaseModel):
    products: list[ProductInfo] = Field(description="List of products found")
    categories: list[str] = Field(description="List of product categories")


async def main():
    # Initialize the async client
    sgai_api_key = os.getenv("SGAI_API_KEY")

    async with AsyncClient(api_key=sgai_api_key) as client:
        print("🔍 Starting async crawl with path filtering...")
        print("=" * 50)

        # Example: Crawl only product pages, excluding certain sections
        print("\n📝 Crawling e-commerce site with smart path filtering")
        print("-" * 50)

        result = await client.crawl(
            url="https://example-shop.com",
            prompt="Extract all products with their names, prices, and categories",
            data_schema=CrawlResult.model_json_schema(),
            extraction_mode=True,
            depth=3,
            max_pages=50,
            sitemap=True,  # Use sitemap for better coverage
            include_paths=[
                "/products/**",   # Include all product pages
                "/categories/*",  # Include category listings
                "/collections/*"  # Include collection pages
            ],
            exclude_paths=[
                "/products/out-of-stock/*",  # Skip out-of-stock items
                "/products/*/reviews",       # Skip review pages
                "/admin/**",                 # Skip admin pages
                "/api/**",                   # Skip API endpoints
                "/*.pdf"                     # Skip PDF files
            ]
        )

        print(f"Task ID: {result.get('task_id')}")
        print("\n✅ Async crawl job started successfully!")

        # You can then poll for results using get_crawl
        task_id = result.get('task_id')
        if task_id:
            print(f"\n⏳ Polling for results (task: {task_id})...")

            # Poll every 5 seconds until complete
            max_attempts = 60  # 5 minutes max
            for attempt in range(max_attempts):
                await asyncio.sleep(5)
                status = await client.get_crawl(task_id)

                state = status.get('state', 'UNKNOWN')
                print(f"Attempt {attempt + 1}: Status = {state}")

                if state == 'SUCCESS':
                    print("\n✨ Crawl completed successfully!")
                    result_data = status.get('result', {})
                    print(f"Found {len(result_data.get('products', []))} products")
                    break
                elif state in ['FAILURE', 'REVOKED']:
                    print(f"\n❌ Crawl failed with status: {state}")
                    break
            else:
                print("\n⏰ Timeout: Crawl took too long")

        print("\n" + "=" * 50)
        print("💡 Tips for effective path filtering:")
        print("=" * 50)
        print("• Combine with sitemap=True for better page discovery")
        print("• Use include_paths to focus on content-rich sections")
        print("• Use exclude_paths to skip pages with duplicate content")
        print("• Test your patterns on a small max_pages first")
        print("• Remember: exclude_paths overrides include_paths")


if __name__ == "__main__":
    asyncio.run(main())

0 commit comments
