269 changes: 269 additions & 0 deletions scrapegraph-js/examples/crawl_markdown_direct_api_example.js
@@ -0,0 +1,269 @@
#!/usr/bin/env node

/**
* Example script demonstrating the ScrapeGraphAI Crawler markdown conversion mode.
*
* This example shows how to use the crawler in markdown conversion mode:
* - Cost-effective markdown conversion (NO AI/LLM processing)
* - 2 credits per page (80% savings compared to AI mode)
* - Clean HTML to markdown conversion with metadata extraction
*
* Requirements:
* - Node.js 18+ (this script uses the built-in global fetch API)
* - dotenv
* - A .env file with your API_KEY
*
* Example .env file:
* API_KEY=your_api_key_here
*/
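
/*
 * For reference, the response shapes this script relies on, as inferred from
 * the fields it reads below (consult the ScrapeGraphAI API docs for the
 * authoritative schema):
 *
 *   POST /v1/crawl          -> { "task_id": "..." }
 *   GET  /v1/crawl/{taskId} -> {
 *     "status": "success" | "failed" | <in-progress value>,
 *     "error": "...",   // only meaningful when status is "failed"
 *     "result": {
 *       "pages": [{ "url", "title", "markdown",
 *                   "metadata": { "word_count", "headers", "links_count" } }],
 *       "crawled_urls": [...],
 *       "credits_used": ...,
 *       "pages_processed": ...
 *     }
 *   }
 */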

import 'dotenv/config';

// Configuration - API key from environment or fallback
const API_KEY = process.env.API_KEY || "sgai-xxx"; // Loaded from the .env file
const BASE_URL = process.env.BASE_URL || "http://localhost:8001"; // Can be overridden via env

/**
* Make an HTTP request to the API.
* @param {string} url - The URL to make the request to
* @param {Object} data - The data to send in the request body
* @returns {Promise<Object>} The response JSON
*/
async function makeRequest(url, data) {
const headers = {
"Content-Type": "application/json",
"SGAI-APIKEY": API_KEY
};

const response = await fetch(url, {
method: 'POST',
headers: headers,
body: JSON.stringify(data)
});

return await response.json();
}
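
// Example usage (mirrors the call made in markdownCrawlingExample() below):
//   const response = await makeRequest(`${BASE_URL}/v1/crawl`, { url: "https://example.com", extraction_mode: false });
//   console.log(response.task_id);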

/**
* Poll for the result of a crawl job with rate limit handling.
* @param {string} taskId - The task ID to poll for
* @returns {Promise<Object>} The response JSON
*/
async function pollResult(taskId) {
const headers = { "SGAI-APIKEY": API_KEY };
const url = `${BASE_URL}/v1/crawl/${taskId}`;

const response = await fetch(url, {
method: 'GET',
headers: headers
});

if (response.status === 429) {
// Rate limited - return special status to handle in polling loop
return { status: "rate_limited", retry_after: 60 };
}

return await response.json();
}
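
// Besides the synthetic "rate_limited" status above, this script expects the
// job status to be "success", "failed", or an in-progress value that the
// polling loop below simply keeps waiting on.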

/**
* Poll for crawl results with intelligent backoff to avoid rate limits.
* @param {string} taskId - The task ID to poll for
* @param {number} maxAttempts - Maximum number of polling attempts
* @returns {Promise<Object>} The final result or throws an exception on timeout/failure
*/
async function pollWithBackoff(taskId, maxAttempts = 20) {
console.log("⏳ Starting to poll for results with rate-limit protection...");

// Initial wait to give the job time to start processing
await new Promise(resolve => setTimeout(resolve, 15000));

for (let attempt = 0; attempt < maxAttempts; attempt++) {
try {
const result = await pollResult(taskId);
const status = result.status;

if (status === "rate_limited") {
const waitTime = Math.min(90, 30 + (attempt * 10)); // Linear backoff for rate limits, capped at 90s
console.log(`⚠️ Rate limited! Waiting ${waitTime}s before retry...`);
await new Promise(resolve => setTimeout(resolve, waitTime * 1000));
continue;
} else if (status === "success") {
return result;
} else if (status === "failed") {
throw new Error(`Crawl failed: ${result.error || 'Unknown error'}`);
} else {
// Calculate progressive wait time: start at 15s, increase gradually
const baseWait = 15;
const progressiveWait = Math.min(60, baseWait + (attempt * 3)); // Cap at 60s

console.log(`⏳ Status: ${status} (attempt ${attempt + 1}/${maxAttempts}) - waiting ${progressiveWait}s...`);
await new Promise(resolve => setTimeout(resolve, progressiveWait * 1000));
}
} catch (error) {
if (error.message.toLowerCase().includes('rate') || error.message.includes('429')) {
const waitTime = Math.min(90, 45 + (attempt * 10));
console.log(`⚠️ Rate limit detected in error, waiting ${waitTime}s...`);
await new Promise(resolve => setTimeout(resolve, waitTime * 1000));
continue;
} else {
console.log(`❌ Error polling for results: ${error.message}`);
if (attempt < maxAttempts - 1) {
await new Promise(resolve => setTimeout(resolve, 20000)); // Wait before retry
continue;
}
throw error;
}
}
}

throw new Error(`⏰ Timeout: Job did not complete after ${maxAttempts} attempts`);
}
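
// The resulting wait schedule (absent rate limiting) is: attempt 0 -> 15s,
// attempt 5 -> 30s, attempt 10 -> 45s, attempt 15+ -> 60s (the cap). Including
// the initial 15s delay, 20 attempts allow roughly 14 minutes of polling.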

/**
* Markdown Conversion Mode (NO AI/LLM Used)
*
* This example demonstrates cost-effective crawling that converts pages to clean markdown
* WITHOUT any AI processing. Ideal for content archival and for cases where you only need clean markdown.
*/
async function markdownCrawlingExample() {
console.log("=".repeat(60));
console.log("MARKDOWN CONVERSION MODE (NO AI/LLM)");
console.log("=".repeat(60));
console.log("Use case: Get clean markdown content without AI processing");
console.log("Cost: 2 credits per page (80% savings!)");
console.log("Features: Clean markdown conversion, metadata extraction");
console.log("⚠️ NO AI/LLM PROCESSING - Pure HTML to markdown conversion only!");
console.log();

// Markdown conversion request - NO AI/LLM processing
const requestData = {
url: "https://scrapegraphai.com/",
extraction_mode: false, // FALSE = Markdown conversion mode (NO AI/LLM used)
depth: 2,
max_pages: 2,
same_domain_only: true,
sitemap: false, // Set to true to crawl via the sitemap for better coverage
// Note: No prompt needed when extraction_mode = false
};
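
// For contrast, an AI extraction request would set extraction_mode to true and
// supply a prompt. A sketch only - the exact AI-mode fields are an assumption
// here; check the API docs. (2 credits at 80% savings implies AI mode costs
// 10 credits per page.)
//   const aiRequestData = {
//     url: "https://scrapegraphai.com/",
//     extraction_mode: true, // AI/LLM processing enabled
//     prompt: "Extract the main product offerings", // hypothetical prompt
//     depth: 2,
//     max_pages: 2,
//   };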

console.log(`🌐 Target URL: ${requestData.url}`);
console.log("πŸ€– AI Prompt: None (no AI processing)");
console.log(`πŸ“Š Crawl Depth: ${requestData.depth}`);
console.log(`πŸ“„ Max Pages: ${requestData.max_pages}`);
console.log(`πŸ—ΊοΈ Use Sitemap: ${requestData.sitemap}`);
console.log("πŸ’‘ Mode: Pure HTML to markdown conversion");
console.log();

// Start the markdown conversion job
console.log("πŸš€ Starting markdown conversion job...");
const response = await makeRequest(`${BASE_URL}/v1/crawl`, requestData);
const taskId = response.task_id;

if (!taskId) {
console.log("❌ Failed to start markdown conversion job");
return;
}

console.log(`📋 Task ID: ${taskId}`);
console.log("⏳ Polling for results...");
console.log();

// Poll for results with rate-limit protection
try {
const result = await pollWithBackoff(taskId, 20);

console.log("βœ… Markdown conversion completed successfully!");
console.log();

const resultData = result.result || {};
const pages = resultData.pages || [];
const crawledUrls = resultData.crawled_urls || [];
const creditsUsed = resultData.credits_used || 0;
const pagesProcessed = resultData.pages_processed || 0;

console.log("πŸ“Š CONVERSION RESULTS:");
console.log("-".repeat(40));
console.log(`πŸ“„ Pages processed: ${pagesProcessed}`);
console.log(`πŸ’° Credits used: ${creditsUsed}`);
console.log(`πŸ’΅ Cost per page: ${pagesProcessed > 0 ? (creditsUsed / pagesProcessed).toFixed(1) : 0} credits`);
if (crawledUrls.length > 0) {
console.log(`πŸ”— URLs processed: ${JSON.stringify(crawledUrls)}`);
}
console.log();

console.log("πŸ“ MARKDOWN CONTENT:");
console.log("-".repeat(40));
if (pages.length > 0) {
console.log(`📄 Total pages with markdown: ${pages.length}`);
pages.slice(0, 3).forEach((page, i) => { // Show first 3 pages
console.log(`\n📄 Page ${i + 1}:`);
console.log(` URL: ${page.url || 'N/A'}`);
console.log(` Title: ${page.title || 'None'}`);

const metadata = page.metadata || {};
console.log(` 📊 Word count: ${metadata.word_count || 0}`);
console.log(` 📋 Headers: ${JSON.stringify((metadata.headers || []).slice(0, 3))}`); // First 3 headers
console.log(` 🔗 Links: ${metadata.links_count || 0}`);

// Show markdown preview
const markdownContent = page.markdown || "";
let markdownPreview = markdownContent.substring(0, 200);
if (markdownContent.length > 200) {
markdownPreview += "...";
}
console.log(` 📝 Content preview: ${markdownPreview}`);
});

if (pages.length > 3) {
console.log(`\n ... and ${pages.length - 3} more pages with markdown content`);
}
} else {
console.log("No markdown content available");
}

} catch (error) {
console.log(`❌ Markdown conversion failed: ${error.message}`);
}
}
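
/*
 * Since this mode targets content archival, a minimal follow-up sketch for
 * persisting converted pages to disk. It assumes only Node's built-in fs and
 * path modules; the output directory name is hypothetical:
 *
 *   import fs from 'fs';
 *   import path from 'path';
 *
 *   function savePages(pages, outDir = 'markdown_output') {
 *     fs.mkdirSync(outDir, { recursive: true });
 *     pages.forEach((page, i) => {
 *       fs.writeFileSync(path.join(outDir, `page_${i + 1}.md`), page.markdown || '');
 *     });
 *   }
 */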

/**
* Main function to run the markdown crawling example.
*/
async function main() {
console.log("🌐 ScrapeGraphAI Crawler - Markdown Conversion Example");
console.log("Cost-effective HTML to Markdown conversion (NO AI/LLM)");
console.log("=".repeat(60));

// Check if API key is set
if (API_KEY === "sgai-xxx") {
console.log("⚠️ Please set your API key in the .env file");
console.log(" Create a .env file with your API key:");
console.log(" API_KEY=your_api_key_here");
console.log();
console.log(" You can get your API key from: https://dashboard.scrapegraphai.com");
console.log();
console.log(" Example .env file:");
console.log(" API_KEY=sgai-your-actual-api-key-here");
console.log(" BASE_URL=https://api.scrapegraphai.com # Optional");
return;
}

console.log(`🔑 Using API key: ${API_KEY.substring(0, 10)}...`);
console.log(`🌐 Base URL: ${BASE_URL}`);
console.log();

// Run the single example
await markdownCrawlingExample(); // Markdown conversion mode (NO AI)

console.log("\n" + "=".repeat(60));
console.log("πŸŽ‰ Example completed!");
console.log("πŸ’‘ This demonstrates markdown conversion mode:");
console.log(" β€’ Cost-effective: Only 2 credits per page");
console.log(" β€’ No AI/LLM processing - pure HTML to markdown conversion");
console.log(" β€’ Perfect for content archival and documentation");
console.log(" β€’ 80% cheaper than AI extraction modes!");
}

// Run the example
main().catch(console.error);