diff --git a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts
index 34e9fcd693..db3da8a119 100644
--- a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts
+++ b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts
@@ -2,6 +2,32 @@ import axios from "axios";
 import { universalTimeout } from "../global";
 import { Logger } from "../../../lib/logger";
 
+/**
+ * Detects whether a response body is a PDF document.
+ *
+ * Only the `%PDF-` header signature is trusted. Looser heuristics (the
+ * `endobj`/`endstream` keywords, or the share of control characters) also
+ * match HTML pages that merely discuss PDF internals and unrelated binary
+ * payloads, which would then be dropped with a misleading PDF error.
+ *
+ * NOTE(review): scrapers/playwright.ts carries its own copy of this helper;
+ * consider moving both into one shared module.
+ *
+ * @param content The response body to inspect
+ * @returns true if the body starts with a PDF header, ignoring leading whitespace
+ */
+function isPDFContent(content: string): boolean {
+  // The caller passes untyped axios response data, so keep the runtime type guard.
+  if (typeof content !== 'string' || content.length === 0) {
+    return false;
+  }
+
+  // Inspect only a short prefix: the signature sits at the start of the file,
+  // and this avoids copying or scanning multi-megabyte bodies.
+  const head = content.slice(0, 1024).trimStart();
+  return head.startsWith('%PDF-');
+}
+
 /**
  * Scrapes a URL with Axios
  * @param url The URL to scrape
@@ -44,6 +70,19 @@ export async function scrapeWithFetch(
     }
 
     const text = response.data;
+
+    // Check if the content is a PDF file
+    if (isPDFContent(text)) {
+      Logger.debug(`⛏️ fetch: Detected PDF content for ${url}, skipping PDF processing`);
+      logParams.error_message = "PDF content detected - not suitable for text extraction";
+      logParams.response_code = response.status;
+      return {
+        content: "",
+        pageStatusCode: response.status,
+        pageError: "PDF content detected - not suitable for text extraction",
+      };
+    }
+
     logParams.success = true;
     logParams.html = text;
logParams.response_code = response.status;
diff --git a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts
index 9f60bd5c2d..f44e53c6ed 100644
--- a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts
+++ b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts
@@ -3,6 +3,32 @@ import { generateRequestParams } from "../single_url";
 import { universalTimeout } from "../global";
 import { Logger } from "../../../lib/logger";
 
+/**
+ * Detects whether a response body is a PDF document.
+ *
+ * Only the `%PDF-` header signature is trusted. Looser heuristics (the
+ * `endobj`/`endstream` keywords, or the share of control characters) also
+ * match HTML pages that merely discuss PDF internals and unrelated binary
+ * payloads, which would then be dropped with a misleading PDF error.
+ *
+ * NOTE(review): scrapers/fetch.ts carries its own copy of this helper;
+ * consider moving both into one shared module.
+ *
+ * @param content The response body to inspect
+ * @returns true if the body starts with a PDF header, ignoring leading whitespace
+ */
+function isPDFContent(content: string): boolean {
+  // The caller passes a field of parsed JSON, so keep the runtime type guard.
+  if (typeof content !== 'string' || content.length === 0) {
+    return false;
+  }
+
+  // Inspect only a short prefix: the signature sits at the start of the file,
+  // and this avoids copying or scanning multi-megabyte bodies.
+  const head = content.slice(0, 1024).trimStart();
+  return head.startsWith('%PDF-');
+}
+
 /**
  * Scrapes a URL with Playwright
  * @param url The URL to scrape
@@ -64,6 +90,19 @@ export async function scrapeWithPlaywright(
       try {
         const data = JSON.parse(textData);
         const html = data.content;
+
+        // Check if the content is a PDF file
+        if (isPDFContent(html)) {
+          Logger.debug(`⛏️ Playwright: Detected PDF content for ${url}, skipping PDF processing`);
+          logParams.error_message = "PDF content detected - not suitable for text extraction";
+          logParams.response_code = data.pageStatusCode;
+          return {
+            content: "",
+            pageStatusCode: 
data.pageStatusCode, + pageError: "PDF content detected - not suitable for text extraction", + }; + } + logParams.success = true; logParams.html = html; logParams.response_code = data.pageStatusCode;