diff --git a/.eslintrc.json b/.eslintrc.json index 2cad0a3..9450ea5 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -3,18 +3,13 @@ "es2022": true, "node": true }, - "extends": [ - "eslint:recommended", - "@typescript-eslint/recommended" - ], + "extends": ["eslint:recommended", "@typescript-eslint/recommended"], "parser": "@typescript-eslint/parser", "parserOptions": { "ecmaVersion": "latest", "sourceType": "module" }, - "plugins": [ - "@typescript-eslint" - ], + "plugins": ["@typescript-eslint"], "rules": { "@typescript-eslint/no-unused-vars": "error", "@typescript-eslint/no-explicit-any": "warn", @@ -25,4 +20,4 @@ "no-var": "error", "no-console": "warn" } -} \ No newline at end of file +} diff --git a/.prettierrc b/.prettierrc index aa870fa..02c53be 100644 --- a/.prettierrc +++ b/.prettierrc @@ -2,9 +2,9 @@ "semi": true, "trailingComma": "es5", "singleQuote": true, - "printWidth": 80, + "printWidth": 120, "tabWidth": 2, "useTabs": false, "bracketSpacing": true, "arrowParens": "avoid" -} \ No newline at end of file +} diff --git a/README.md b/README.md index 5847de1..c788683 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,9 @@ A TypeScript MCP (Model Context Protocol) server that provides comprehensive web The server provides three specialised tools for different web search needs: ### 1. `full-web-search` (Main Tool) + When a comprehensive search is requested, the server uses an **optimised search strategy**: + 1. **Browser-based Bing Search** - Primary method using dedicated Chromium instance 2. **Browser-based Brave Search** - Secondary option using dedicated Firefox instance 3. **Axios DuckDuckGo Search** - Final fallback using traditional HTTP @@ -25,13 +27,17 @@ When a comprehensive search is requested, the server uses an **optimised search 7. **HTTP/2 error recovery**: Automatically falls back to HTTP/1.1 when protocol errors occur ### 2. `get-web-search-summaries` (Lightweight Alternative) + For quick search results without full content extraction: + 1. Performs the same optimised multi-engine search as `full-web-search` 2. Returns only the search result snippets/descriptions 3. Does not follow links to extract full page content ### 3. `get-single-web-page-content` (Utility Tool) + For extracting content from a specific webpage: + 1. Takes a single URL as input 2. Follows the URL and extracts the main page content 3. Removes navigation, ads, and other non-content elements @@ -41,7 +47,8 @@ For extracting content from a specific webpage: This MCP server has been developed and tested with **LM Studio** and **LibreChat**. It has not been tested with other MCP clients. ### Model Compatibility -**Important:** Prioritise using more recent models designated for tool use. + +**Important:** Prioritise using more recent models designated for tool use. Older models (even those with tool use specified) may not work or may work erratically. This seems to be the case with Llama and Deepseek. Qwen3 and Gemma 3 currently have the best results. @@ -56,20 +63,24 @@ Older models (even those with tool use specified) may not work or may work errat ## Installation (Recommended) **Requirements:** + - Node.js 18.0.0 or higher - npm 8.0.0 or higher 1. Download the latest release zip file from the [Releases page](https://github.com/mrkrsl/web-search-mcp/releases) 2. Extract the zip file to a location on your system (e.g., `~/mcp-servers/web-search-mcp/`) 3. 
**Open a terminal in the extracted folder and run:** + ```bash npm install npx playwright install npm run build ``` + This will create a `node_modules` folder with all required dependencies, install Playwright browsers, and build the project. **Note:** You must run `npm install` in the root of the extracted folder (not in `dist/`). + 4. Configure your `mcp.json` to point to the extracted `dist/index.js` file: ```json @@ -82,33 +93,39 @@ Older models (even those with tool use specified) may not work or may work errat } } ``` + **Example paths:** + - macOS/Linux: `~/mcp-servers/web-search-mcp/dist/index.js` - Windows: `C:\\mcp-servers\\web-search-mcp\\dist\\index.js` In LibreChat, you can include the MCP server in the librechat.yaml. If you are running LibreChat in Docker, you must first mount your local directory in docker-compose.override.yml. in `docker-compose.override.yml`: + ```yaml services: api: volumes: - - type: bind - source: /path/to/your/mcp/directory - target: /app/mcp + - type: bind + source: /path/to/your/mcp/directory + target: /app/mcp ``` + in `librechat.yaml`: + ```yaml mcpServers: web-search: type: stdio command: node args: - - /app/mcp/web-search-mcp/dist/index.js + - /app/mcp/web-search-mcp/dist/index.js serverInstructions: true ``` **Troubleshooting:** + - If `npm install` fails, try updating Node.js to version 18+ and npm to version 8+ - If `npm run build` fails, ensure you have the latest Node.js version installed - For older Node.js versions, you may need to use an older release of this project @@ -151,28 +168,33 @@ The server supports several environment variables for configuration: ## Troubleshooting ### Slow Response Times + - **Optimised timeouts**: Default timeout reduced to 6 seconds with concurrent processing for faster results - **Concurrent extraction**: Content is now extracted from multiple pages simultaneously - **Reduce timeouts further**: Set `DEFAULT_TIMEOUT=4000` for even faster responses (may reduce success rate) - **Use fewer browsers**: Set `MAX_BROWSERS=1` to reduce memory usage ### Search Failures + - **Check browser installation**: Run `npx playwright install` to ensure browsers are available - **Try headless mode**: Ensure `BROWSER_HEADLESS=true` (default) for server environments - **Network restrictions**: Some networks block browser automation - try different network or VPN - **HTTP/2 issues**: The server automatically handles HTTP/2 protocol errors with fallback to HTTP/1.1 ### Search Quality Issues + - **Enable quality checking**: Set `ENABLE_RELEVANCE_CHECKING=true` (enabled by default) - **Adjust quality threshold**: Set `RELEVANCE_THRESHOLD=0.5` for stricter quality requirements - **Force multi-engine search**: Set `FORCE_MULTI_ENGINE_SEARCH=true` to try all engines and return the best results ### Memory Usage + - **Automatic cleanup**: Browsers are automatically cleaned up after each operation to prevent memory leaks - **Limit browsers**: Reduce `MAX_BROWSERS` (default: 3) - **EventEmitter warnings**: Fixed - browsers are properly closed to prevent listener accumulation ## For Development + ```bash git clone https://github.com/mrkrsl/web-search-mcp.git cd web-search-mcp @@ -195,7 +217,9 @@ npm run format # Run Prettier This server provides three specialised tools for different web search needs: ### 1. `full-web-search` (Main Tool) + The most comprehensive web search tool that: + 1. Takes a search query and optional number of results (1-10, default 5) 2. Performs a web search (tries Bing, then Brave, then DuckDuckGo if needed) 3. 
Fetches full page content from each result URL with concurrent processing @@ -203,6 +227,7 @@ The most comprehensive web search tool that: 5. **Enhanced reliability**: HTTP/2 error recovery, reduced timeouts, and better error handling **Example Usage:** + ```json { "name": "full-web-search", @@ -215,13 +240,16 @@ The most comprehensive web search tool that: ``` ### 2. `get-web-search-summaries` (Lightweight Alternative) + A lightweight alternative for quick search results: + 1. Takes a search query and optional number of results (1-10, default 5) 2. Performs the same optimised multi-engine search as `full-web-search` 3. Returns only search result snippets/descriptions (no content extraction) 4. Faster and more efficient for quick research **Example Usage:** + ```json { "name": "get-web-search-summaries", @@ -233,13 +261,16 @@ A lightweight alternative for quick search results: ``` ### 3. `get-single-web-page-content` (Utility Tool) + A utility tool for extracting content from a specific webpage: + 1. Takes a single URL as input 2. Follows the URL and extracts the main page content 3. Removes navigation, ads, and other non-content elements 4. Useful for getting detailed content from a known webpage **Example Usage:** + ```json { "name": "get-single-web-page-content", @@ -253,6 +284,7 @@ A utility tool for extracting content from a specific webpage: ## Standalone Usage You can also run the server directly: + ```bash # If running from source npm start diff --git a/docs/API.md b/docs/API.md index cff64ce..2443014 100644 --- a/docs/API.md +++ b/docs/API.md @@ -5,15 +5,17 @@ The Web Search MCP Server provides three tools for web searching and content extraction: 1. **`full-web-search`** - Comprehensive web search with full content extraction (primary tool) -2. **`get-web-search-summaries`** - Lightweight search returning only result snippets +2. **`get-web-search-summaries`** - Lightweight search returning only result snippets 3. **`get-single-web-page-content`** - Extract content from a single web page URL ## Tool: full-web-search ### Description + Search the web and fetch complete page content from top results. This is the most comprehensive web search tool. It searches the web and then follows the resulting links to extract their full page content, providing the most detailed and complete information available. ### Input Schema + ```json { "type": "object", @@ -45,6 +47,7 @@ Search the web and fetch complete page content from top results. This is the mos ``` ### Output Schema + Returns formatted text content containing search results with full page content: ```json @@ -61,6 +64,7 @@ Returns formatted text content containing search results with full page content: ### Usage Examples #### Basic Search + ```json { "name": "full-web-search", @@ -71,6 +75,7 @@ Returns formatted text content containing search results with full page content: ``` #### Search with Custom Parameters + ```json { "name": "full-web-search", @@ -86,9 +91,11 @@ Returns formatted text content containing search results with full page content: ## Tool: get-web-search-summaries ### Description + Search the web and return only the search result snippets/descriptions without following links to extract full page content. This is a lightweight alternative to full-web-search for when you only need brief search results. 
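For orientation, here is a minimal sketch of calling this tool from a TypeScript MCP client over stdio. It assumes the `@modelcontextprotocol/sdk` client API and uses a placeholder path to the built server entry point; adjust both to your environment.

```typescript
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';

// Placeholder path to the built server — adjust to where dist/index.js lives on your system.
const transport = new StdioClientTransport({
  command: 'node',
  args: ['/path/to/web-search-mcp/dist/index.js'],
});

const client = new Client({ name: 'docs-example-client', version: '0.1.0' }, { capabilities: {} });
await client.connect(transport);

// Lightweight search: returns snippets only, no full-page content extraction.
const result = await client.callTool({
  name: 'get-web-search-summaries',
  arguments: { query: 'TypeScript MCP servers', limit: 3 },
});

console.log(result.content); // Formatted text blocks with title, URL, and description per result
await client.close();
```
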
### Input Schema + ```json { "type": "object", @@ -110,6 +117,7 @@ Search the web and return only the search result snippets/descriptions without f ``` ### Output Schema + Returns formatted text content containing search result summaries: ```json @@ -126,6 +134,7 @@ Returns formatted text content containing search result summaries: ### Usage Examples #### Basic Summary Search + ```json { "name": "get-web-search-summaries", @@ -136,6 +145,7 @@ Returns formatted text content containing search result summaries: ``` #### Summary Search with Custom Limit + ```json { "name": "get-web-search-summaries", @@ -149,9 +159,11 @@ Returns formatted text content containing search result summaries: ## Tool: get-single-web-page-content ### Description + Extract and return the full content from a single web page URL. This tool follows a provided URL and extracts the main page content. Useful for getting detailed content from a specific webpage without performing a search. ### Input Schema + ```json { "type": "object", @@ -172,6 +184,7 @@ Extract and return the full content from a single web page URL. This tool follow ``` ### Output Schema + Returns formatted text content from the specified web page: ```json @@ -188,6 +201,7 @@ Returns formatted text content from the specified web page: ### Usage Examples #### Basic Page Content Extraction + ```json { "name": "get-single-web-page-content", @@ -198,6 +212,7 @@ Returns formatted text content from the specified web page: ``` #### Page Content with Length Limit + ```json { "name": "get-single-web-page-content", @@ -211,6 +226,7 @@ Returns formatted text content from the specified web page: ## Response Examples ### full-web-search Response + ```json { "content": [ @@ -223,6 +239,7 @@ Returns formatted text content from the specified web page: ``` ### get-web-search-summaries Response + ```json { "content": [ @@ -235,6 +252,7 @@ Returns formatted text content from the specified web page: ``` ### get-single-web-page-content Response + ```json { "content": [ @@ -266,6 +284,7 @@ Returns formatted text content from the specified web page: - Malformed HTML ### Error Response Format + ```json { "error": { @@ -287,11 +306,13 @@ The server implements rate limiting to respect Google's terms of service: ## Performance Considerations ### Response Times + - Search execution: 1-5 seconds - Content extraction: 2-10 seconds per URL - Total response time: 3-15 seconds (depending on result count) ### Content Limits + - Maximum content length: 50KB per page - Maximum concurrent requests: 5 - Request timeout: 10 seconds @@ -299,6 +320,7 @@ The server implements rate limiting to respect Google's terms of service: ## Integration Examples ### LM Studio Configuration + ```json { "mcpServers": { @@ -315,6 +337,7 @@ The server implements rate limiting to respect Google's terms of service: ``` ### Claude Desktop Configuration + ```json { "mcpServers": { @@ -329,16 +352,19 @@ The server implements rate limiting to respect Google's terms of service: ## Best Practices ### Query Optimization + - Use specific, descriptive queries - Include relevant keywords - Avoid overly broad searches ### Result Handling + - Check for content extraction errors - Handle partial failures gracefully - Consider result relevance ### Error Recovery + - Implement retry logic for transient errors - Provide fallback content when extraction fails - Log errors for debugging @@ -363,7 +389,9 @@ The server implements rate limiting to respect Google's terms of service: - Check system resources ### Debug Mode + Enable debug 
logging by setting the environment variable: + ```bash export DEBUG=web-search-mcp:* ``` diff --git a/eslint.config.js b/eslint.config.js index 32acd38..793c02f 100644 --- a/eslint.config.js +++ b/eslint.config.js @@ -71,4 +71,4 @@ export default [ 'no-console': 'warn', }, }, -]; \ No newline at end of file +]; diff --git a/mcp.json b/mcp.json index 82c2846..7359fd6 100644 --- a/mcp.json +++ b/mcp.json @@ -2,9 +2,7 @@ "mcpServers": { "web-search": { "command": "node", - "args": [ - "/Users/mark/Projects/web-search-mcp/dist/simple-test.js" - ] + "args": ["/Users/mark/Projects/web-search-mcp/dist/simple-test.js"] } } -} \ No newline at end of file +} diff --git a/package-lock.json b/package-lock.json index 02f58a0..b508adb 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "web-search-mcp-server", - "version": "0.2.2", + "version": "0.3.1", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "web-search-mcp-server", - "version": "0.2.2", + "version": "0.3.1", "license": "MIT", "dependencies": { "@modelcontextprotocol/sdk": "^1.15.0", @@ -14,7 +14,8 @@ "cheerio": "^1.0.0-rc.12", "p-limit": "^6.2.0", "p-retry": "^6.2.1", - "playwright": "^1.48.0" + "playwright": "^1.48.0", + "zod": "^3.22.0" }, "bin": { "web-search-mcp": "dist/index.js" diff --git a/package.json b/package.json index fd7724d..ee786cb 100644 --- a/package.json +++ b/package.json @@ -13,7 +13,8 @@ "dev": "tsx watch src/index.ts", "start": "node ./dist/index.js", "lint": "eslint \"src/**/*.ts\"", - "format": "prettier --write ." + "format": "prettier --write .", + "test": "npx playwright install && node tests/run-all-tests.js" }, "keywords": [ "mcp", diff --git a/scripts/bundle.js b/scripts/bundle.js index 55775ec..82e685f 100644 --- a/scripts/bundle.js +++ b/scripts/bundle.js @@ -51,17 +51,17 @@ try { 'combined-stream', 'mime-types', 'mime-db', - 'axios' + 'axios', ], sourcemap: true, minify: false, // Keep readable for debugging define: { - 'process.env.NODE_ENV': '"production"' - } + 'process.env.NODE_ENV': '"production"', + }, }); - + console.log('โœ… Bundle created: dist/bundle.js'); } catch (error) { console.error('โŒ Bundle failed:', error); process.exit(1); -} \ No newline at end of file +} diff --git a/src/browser-pool.ts b/src/browser-pool.ts index a9a2dde..70347c0 100644 --- a/src/browser-pool.ts +++ b/src/browser-pool.ts @@ -12,12 +12,14 @@ export class BrowserPool { // Read configuration from environment variables this.maxBrowsers = parseInt(process.env.MAX_BROWSERS || '3', 10); this.headless = process.env.BROWSER_HEADLESS !== 'false'; // Default to true - + // Configure browser types based on environment const browserTypesEnv = process.env.BROWSER_TYPES || 'chromium,firefox'; this.browserTypes = browserTypesEnv.split(',').map(type => type.trim()); - - console.log(`[BrowserPool] Configuration: maxBrowsers=${this.maxBrowsers}, headless=${this.headless}, types=${this.browserTypes.join(',')}`); + + console.log( + `[BrowserPool] Configuration: maxBrowsers=${this.maxBrowsers}, headless=${this.headless}, types=${this.browserTypes.join(',')}` + ); } async getBrowser(): Promise { @@ -28,14 +30,15 @@ export class BrowserPool { if (this.browsers.has(browserType)) { const browser = this.browsers.get(browserType)!; - + // Check if browser is still connected and healthy try { if (browser.isConnected()) { // Quick health check by trying to create and close a context // Use minimal options to avoid Firefox isMobile issues const testContext = await browser.newContext({ - userAgent: 
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36' + userAgent: + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', }); await testContext.close(); return browser; @@ -54,7 +57,7 @@ export class BrowserPool { // Launch new browser console.log(`[BrowserPool] Launching new ${browserType} browser`); - + const launchOptions = { headless: this.headless, args: [ @@ -80,6 +83,18 @@ export class BrowserPool { case 'chromium': browser = await chromium.launch(launchOptions); break; + case 'edge': + browser = await chromium.launch({ + ...launchOptions, + channel: 'msedge', + }); + break; + case 'chrome': + browser = await chromium.launch({ + ...launchOptions, + channel: 'chrome', + }); + break; case 'firefox': browser = await firefox.launch(launchOptions); break; @@ -91,7 +106,7 @@ export class BrowserPool { } this.browsers.set(browserType, browser); - + // Clean up old browsers if we have too many if (this.browsers.size > this.maxBrowsers) { const oldestBrowser = this.browsers.entries().next().value; @@ -114,13 +129,11 @@ export class BrowserPool { async closeAll(): Promise { console.log(`[BrowserPool] Closing ${this.browsers.size} browsers`); - - const closePromises = Array.from(this.browsers.values()).map(browser => - browser.close().catch(error => - console.error('Error closing browser:', error) - ) + + const closePromises = Array.from(this.browsers.values()).map(browser => + browser.close().catch(error => console.error('Error closing browser:', error)) ); - + await Promise.all(closePromises); this.browsers.clear(); } @@ -128,4 +141,4 @@ export class BrowserPool { getLastUsedBrowserType(): string { return this.lastUsedBrowserType; } -} \ No newline at end of file +} diff --git a/src/content-extractor.ts b/src/content-extractor.ts index aa14bba..f4002ed 100644 --- a/src/content-extractor.ts +++ b/src/content-extractor.ts @@ -12,7 +12,7 @@ export class ContentExtractor { // Read MAX_CONTENT_LENGTH from environment variable, fallback to 500KB const envMaxLength = process.env.MAX_CONTENT_LENGTH; this.maxContentLength = envMaxLength ? 
parseInt(envMaxLength, 10) : 500000; - + // Validate the parsed value if (isNaN(this.maxContentLength) || this.maxContentLength < 0) { console.warn(`[ContentExtractor] Invalid MAX_CONTENT_LENGTH value: ${envMaxLength}, using default 500000`); @@ -22,16 +22,18 @@ export class ContentExtractor { async extractContent(options: ContentExtractionOptions): Promise { const { url, timeout = this.defaultTimeout, maxContentLength = this.maxContentLength } = options; - + try { const response = await axios.get(url, { headers: { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + 'User-Agent': + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + Accept: + 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'Accept-Language': 'en-US,en;q=0.9', 'Accept-Encoding': 'gzip, deflate, br', - 'DNT': '1', - 'Connection': 'keep-alive', + DNT: '1', + Connection: 'keep-alive', 'Upgrade-Insecure-Requests': '1', 'Sec-Fetch-Dest': 'document', 'Sec-Fetch-Mode': 'navigate', @@ -50,64 +52,68 @@ export class ContentExtractor { return this.parseContent(response.data); } catch (error) { console.error(`Content extraction error for ${url}:`, error); - + // If it's a 403 error, try with different headers if (axios.isAxiosError(error) && error.response?.status === 403) { console.log(`[ContentExtractor] Trying alternative headers for ${url}`); try { const response = await axios.get(url, { headers: { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'User-Agent': + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate', - 'DNT': '1', - 'Connection': 'keep-alive', + DNT: '1', + Connection: 'keep-alive', 'Upgrade-Insecure-Requests': '1', 'Cache-Control': 'no-cache', - 'Pragma': 'no-cache', + Pragma: 'no-cache', }, timeout, maxContentLength, validateStatus: (status: number) => status < 400, }); - + console.log(`[ContentExtractor] Alternative headers worked for ${url}`); return this.parseContent(response.data); } catch (retryError) { console.error(`[ContentExtractor] Alternative headers also failed for ${url}:`, retryError); } } - + throw new Error(`Failed to extract content from ${url}: ${this.getSpecificErrorMessage(error)}`); } } - async extractContentForResults(results: SearchResult[], targetCount: number = results.length): Promise { + async extractContentForResults( + results: SearchResult[], + targetCount: number = results.length + ): Promise { const enhancedResults: SearchResult[] = []; let processedCount = 0; - + console.log(`[ContentExtractor] Processing up to ${results.length} results to get ${targetCount} non-PDF results`); - + for (const result of results) { if (enhancedResults.length >= targetCount) { console.log(`[ContentExtractor] Reached target count of ${targetCount} results`); break; } - + processedCount++; - + // Skip PDF files if (isPdfUrl(result.url)) { 
console.log(`[ContentExtractor] Skipping PDF file: ${result.url}`); continue; } - + try { console.log(`[ContentExtractor] Extracting content from: ${result.url}`); const content = await this.extractContent({ url: result.url }); const cleanedContent = cleanText(content, this.maxContentLength); - + enhancedResults.push({ ...result, fullContent: cleanedContent, @@ -116,10 +122,12 @@ export class ContentExtractor { timestamp: generateTimestamp(), fetchStatus: 'success' as const, }); - + console.log(`[ContentExtractor] Successfully extracted content (${enhancedResults.length}/${targetCount})`); } catch (error) { - console.log(`[ContentExtractor] Failed to extract content from ${result.url}: ${error instanceof Error ? error.message : 'Unknown error'}`); + console.log( + `[ContentExtractor] Failed to extract content from ${result.url}: ${error instanceof Error ? error.message : 'Unknown error'}` + ); enhancedResults.push({ ...result, fullContent: '', @@ -131,42 +139,54 @@ export class ContentExtractor { }); } } - - console.log(`[ContentExtractor] Processed ${processedCount} results, extracted ${enhancedResults.length} non-PDF results`); + + console.log( + `[ContentExtractor] Processed ${processedCount} results, extracted ${enhancedResults.length} non-PDF results` + ); return enhancedResults; } private parseContent(html: string): string { const $ = cheerio.load(html); - + // Remove all script, style, and other non-content elements - $('script, style, noscript, iframe, img, video, audio, canvas, svg, object, embed, applet, form, input, textarea, select, button, label, fieldset, legend, optgroup, option').remove(); - + $( + 'script, style, noscript, iframe, img, video, audio, canvas, svg, object, embed, applet, form, input, textarea, select, button, label, fieldset, legend, optgroup, option' + ).remove(); + // Remove navigation, header, footer, and other non-content elements - $('nav, header, footer, .nav, .header, .footer, .sidebar, .menu, .breadcrumb, aside, .ad, .advertisement, .ads, .advertisement-container, .social-share, .share-buttons, .comments, .comment-section, .related-posts, .recommendations, .newsletter-signup, .cookie-notice, .privacy-notice, .terms-notice, .disclaimer, .legal, .copyright, .meta, .metadata, .author-info, .publish-date, .tags, .categories, .navigation, .pagination, .search-box, .search-form, .login-form, .signup-form, .newsletter, .popup, .modal, .overlay, .tooltip, .toolbar, .ribbon, .banner, .promo, .sponsored, .affiliate, .tracking, .analytics, .pixel, .beacon').remove(); - + $( + 'nav, header, footer, .nav, .header, .footer, .sidebar, .menu, .breadcrumb, aside, .ad, .advertisement, .ads, .advertisement-container, .social-share, .share-buttons, .comments, .comment-section, .related-posts, .recommendations, .newsletter-signup, .cookie-notice, .privacy-notice, .terms-notice, .disclaimer, .legal, .copyright, .meta, .metadata, .author-info, .publish-date, .tags, .categories, .navigation, .pagination, .search-box, .search-form, .login-form, .signup-form, .newsletter, .popup, .modal, .overlay, .tooltip, .toolbar, .ribbon, .banner, .promo, .sponsored, .affiliate, .tracking, .analytics, .pixel, .beacon' + ).remove(); + // Remove elements with common ad/tracking classes - $('[class*="ad"], [class*="ads"], [class*="advertisement"], [class*="tracking"], [class*="analytics"], [class*="pixel"], [class*="beacon"], [class*="sponsored"], [class*="affiliate"], [class*="promo"], [class*="banner"], [class*="popup"], [class*="modal"], [class*="overlay"], [class*="tooltip"], 
[class*="toolbar"], [class*="ribbon"]').remove(); - + $( + '[class*="ad"], [class*="ads"], [class*="advertisement"], [class*="tracking"], [class*="analytics"], [class*="pixel"], [class*="beacon"], [class*="sponsored"], [class*="affiliate"], [class*="promo"], [class*="banner"], [class*="popup"], [class*="modal"], [class*="overlay"], [class*="tooltip"], [class*="toolbar"], [class*="ribbon"]' + ).remove(); + // Remove elements with common non-content IDs - $('[id*="ad"], [id*="ads"], [id*="advertisement"], [id*="tracking"], [id*="analytics"], [id*="pixel"], [id*="beacon"], [id*="sponsored"], [id*="affiliate"], [id*="promo"], [id*="banner"], [id*="popup"], [id*="modal"], [id*="overlay"], [id*="tooltip"], [id*="toolbar"], [id*="ribbon"], [id*="sidebar"], [id*="navigation"], [id*="menu"], [id*="footer"], [id*="header"]').remove(); - + $( + '[id*="ad"], [id*="ads"], [id*="advertisement"], [id*="tracking"], [id*="analytics"], [id*="pixel"], [id*="beacon"], [id*="sponsored"], [id*="affiliate"], [id*="promo"], [id*="banner"], [id*="popup"], [id*="modal"], [id*="overlay"], [id*="tooltip"], [id*="toolbar"], [id*="ribbon"], [id*="sidebar"], [id*="navigation"], [id*="menu"], [id*="footer"], [id*="header"]' + ).remove(); + // Remove image-related elements and attributes - $('picture, source, figure, figcaption, .image, .img, .photo, .picture, .media, .gallery, .slideshow, .carousel').remove(); + $( + 'picture, source, figure, figcaption, .image, .img, .photo, .picture, .media, .gallery, .slideshow, .carousel' + ).remove(); $('[data-src*="image"], [data-src*="img"], [data-src*="photo"], [data-src*="picture"]').remove(); $('[style*="background-image"]').remove(); - + // Remove empty elements and whitespace-only elements - $('*').each(function() { + $('*').each(function () { const $this = $(this); if ($this.children().length === 0 && $this.text().trim() === '') { $this.remove(); } }); - + // Try to find the main content area first let mainContent = ''; - + // Priority selectors for main content const contentSelectors = [ 'article', @@ -184,54 +204,55 @@ export class ContentExtractor { '.body-content', '.copy', '.text', - '.body' + '.body', ]; - + for (const selector of contentSelectors) { const $content = $(selector).first(); if ($content.length > 0) { mainContent = $content.text().trim(); - if (mainContent.length > 100) { // Ensure we have substantial content + if (mainContent.length > 100) { + // Ensure we have substantial content console.log(`[ContentExtractor] Found content with selector: ${selector} (${mainContent.length} chars)`); break; } } } - + // If no main content found, try body content if (!mainContent || mainContent.length < 100) { console.log(`[ContentExtractor] No main content found, using body content`); mainContent = $('body').text().trim(); } - + // Clean up the text const cleanedContent = this.cleanTextContent(mainContent); - + return cleanText(cleanedContent, this.maxContentLength); } - + private cleanTextContent(text: string): string { // Remove excessive whitespace text = text.replace(/\s+/g, ' '); - + // Remove image-related text and data URLs text = text.replace(/data:image\/[^;]+;base64,[A-Za-z0-9+/=]+/g, ''); // Remove base64 image data text = text.replace(/https?:\/\/[^\s]+\.(jpg|jpeg|png|gif|webp|svg|ico|bmp|tiff)(\?[^\s]*)?/gi, ''); // Remove image URLs text = text.replace(/\.(jpg|jpeg|png|gif|webp|svg|ico|bmp|tiff)/gi, ''); // Remove image file extensions text = text.replace(/image|img|photo|picture|gallery|slideshow|carousel/gi, ''); // Remove image-related words text = 
text.replace(/click to enlarge|click for full size|view larger|download image/gi, ''); // Remove image action text - + // Remove common non-content patterns text = text.replace(/cookie|privacy|terms|conditions|disclaimer|legal|copyright|all rights reserved/gi, ''); - + // Remove excessive line breaks and spacing text = text.replace(/\n\s*\n/g, '\n'); text = text.replace(/\r\n/g, '\n'); text = text.replace(/\r/g, '\n'); - + // Remove leading/trailing whitespace text = text.trim(); - + return text; } @@ -254,7 +275,7 @@ export class ContentExtractor { } return `Network error: ${error.message}`; } - + return error instanceof Error ? error.message : 'Unknown error'; } } diff --git a/src/enhanced-content-extractor.ts b/src/enhanced-content-extractor.ts index c5eecaf..2da7847 100644 --- a/src/enhanced-content-extractor.ts +++ b/src/enhanced-content-extractor.ts @@ -13,36 +13,42 @@ export class EnhancedContentExtractor { constructor() { this.defaultTimeout = parseInt(process.env.DEFAULT_TIMEOUT || '6000', 10); - + // Read MAX_CONTENT_LENGTH from environment variable, fallback to 500KB const envMaxLength = process.env.MAX_CONTENT_LENGTH; this.maxContentLength = envMaxLength ? parseInt(envMaxLength, 10) : 500000; - + // Validate the parsed value if (isNaN(this.maxContentLength) || this.maxContentLength < 0) { - console.warn(`[EnhancedContentExtractor] Invalid MAX_CONTENT_LENGTH value: ${envMaxLength}, using default 500000`); + console.warn( + `[EnhancedContentExtractor] Invalid MAX_CONTENT_LENGTH value: ${envMaxLength}, using default 500000` + ); this.maxContentLength = 500000; } - + this.browserPool = new BrowserPool(); this.fallbackThreshold = parseInt(process.env.BROWSER_FALLBACK_THRESHOLD || '3', 10); - - console.log(`[EnhancedContentExtractor] Configuration: timeout=${this.defaultTimeout}, maxContentLength=${this.maxContentLength}, fallbackThreshold=${this.fallbackThreshold}`); + + console.log( + `[EnhancedContentExtractor] Configuration: timeout=${this.defaultTimeout}, maxContentLength=${this.maxContentLength}, fallbackThreshold=${this.fallbackThreshold}` + ); } async extractContent(options: ContentExtractionOptions): Promise { const { url } = options; - + console.log(`[EnhancedContentExtractor] Starting extraction for: ${url}`); - + // First, try with regular HTTP client (faster) try { const content = await this.extractWithAxios(options); console.log(`[EnhancedContentExtractor] Successfully extracted with axios: ${content.length} chars`); return content; } catch (error) { - console.log(`[EnhancedContentExtractor] Axios failed: ${error instanceof Error ? error.message : 'Unknown error'}`); - + console.log( + `[EnhancedContentExtractor] Axios failed: ${error instanceof Error ? 
error.message : 'Unknown error'}` + ); + // Check if this looks like a case where browser would help if (this.shouldUseBrowser(error, url)) { console.log(`[EnhancedContentExtractor] Falling back to headless browser for: ${url}`); @@ -62,7 +68,7 @@ export class EnhancedContentExtractor { private async extractWithAxios(options: ContentExtractionOptions): Promise { const { url, timeout = this.defaultTimeout, maxContentLength = this.maxContentLength } = options; - + const response = await axios.get(url, { headers: this.getRandomHeaders(), timeout, @@ -71,27 +77,29 @@ export class EnhancedContentExtractor { }); let content = this.parseContent(response.data); - + // Truncate content if it exceeds the limit (instead of axios throwing an error) if (maxContentLength && content.length > maxContentLength) { - console.log(`[EnhancedContentExtractor] Content truncated from ${content.length} to ${maxContentLength} characters for ${url}`); + console.log( + `[EnhancedContentExtractor] Content truncated from ${content.length} to ${maxContentLength} characters for ${url}` + ); content = content.substring(0, maxContentLength); } - + // Check if we got a meaningful response if (this.isLowQualityContent(content)) { throw new Error('Low quality content detected - likely bot detection'); } - + return content; } private async extractWithBrowser(options: ContentExtractionOptions): Promise { const { url, timeout = this.defaultTimeout } = options; - + const browser = await this.browserPool.getBrowser(); const browserType = this.browserPool.getLastUsedBrowserType(); - + try { // Create context options based on browser capabilities const baseContextOptions = { @@ -105,13 +113,12 @@ export class EnhancedContentExtractor { }; // Firefox doesn't support isMobile option - check multiple ways to ensure detection - const isFirefox = browserType === 'firefox' || - browserType.includes('firefox') || - browser.constructor.name.toLowerCase().includes('firefox'); - - const contextOptions = isFirefox - ? baseContextOptions - : { ...baseContextOptions, isMobile: Math.random() > 0.8 }; + const isFirefox = + browserType === 'firefox' || + browserType.includes('firefox') || + browser.constructor.name.toLowerCase().includes('firefox'); + + const contextOptions = isFirefox ? baseContextOptions : { ...baseContextOptions, isMobile: Math.random() > 0.8 }; // Create a new context for each request (isolation) const context = await browser.newContext(contextOptions); @@ -135,11 +142,12 @@ export class EnhancedContentExtractor { // Mock permissions const originalQuery = window.navigator.permissions.query; - window.navigator.permissions.query = (parameters) => ( - parameters.name === 'notifications' ? - Promise.resolve({ state: 'default' } as unknown as PermissionStatus) : - originalQuery(parameters) - ); + window.navigator.permissions.query = parameters => + parameters.name === 'notifications' + ? 
Promise.resolve({ + state: 'default', + } as unknown as PermissionStatus) + : originalQuery(parameters); // Remove automation indicators const windowWithChrome = window as any; @@ -150,11 +158,11 @@ export class EnhancedContentExtractor { }); const page = await context.newPage(); - + // Set up request interception to block unnecessary resources - await page.route('**/*', (route) => { + await page.route('**/*', route => { const resourceType = route.request().resourceType(); - + // Block images, fonts, and other non-essential resources for faster loading if (['image', 'font', 'media'].includes(resourceType)) { route.abort(); @@ -165,19 +173,19 @@ export class EnhancedContentExtractor { // Navigate with realistic options and better error handling console.log(`[BrowserExtractor] Navigating to ${url}`); - + try { - await page.goto(url, { + await page.goto(url, { waitUntil: 'domcontentloaded', // Don't wait for all resources - timeout: Math.min(timeout, 8000) // Reduced timeout, max 8 seconds + timeout: Math.min(timeout, 8000), // Reduced timeout, max 8 seconds }); } catch (gotoError) { // Handle specific protocol errors const errorMessage = gotoError instanceof Error ? gotoError.message : String(gotoError); - + if (errorMessage.includes('ERR_HTTP2_PROTOCOL_ERROR') || errorMessage.includes('HTTP2')) { console.log(`[BrowserExtractor] HTTP/2 error detected, trying with HTTP/1.1`); - + // Create a new context with HTTP/1.1 preference await context.close(); const http1Context = await browser.newContext({ @@ -186,15 +194,15 @@ export class EnhancedContentExtractor { locale: 'en-US', timezoneId: this.getRandomTimezone(), extraHTTPHeaders: { - 'Connection': 'keep-alive', - 'Upgrade-Insecure-Requests': '1' - } + Connection: 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + }, }); - + const http1Page = await http1Context.newPage(); - + // Disable HTTP/2 by intercepting requests - await http1Page.route('**/*', (route) => { + await http1Page.route('**/*', route => { const resourceType = route.request().resourceType(); if (['image', 'font', 'media'].includes(resourceType)) { route.abort(); @@ -202,12 +210,12 @@ export class EnhancedContentExtractor { route.continue(); } }); - - await http1Page.goto(url, { + + await http1Page.goto(url, { waitUntil: 'domcontentloaded', - timeout: Math.min(timeout, 6000) + timeout: Math.min(timeout, 6000), }); - + // Quick content extraction const html = await http1Page.content(); const content = this.parseContent(html); @@ -220,14 +228,14 @@ export class EnhancedContentExtractor { // Quick human simulation - reduced time await page.mouse.move(Math.random() * 100, Math.random() * 100); - + // Reduced wait time for dynamic content await page.waitForTimeout(500 + Math.random() * 1000); // Quick check for main content without long wait try { await page.waitForSelector('article, main, .content, .post-content, .entry-content', { - timeout: 2000 + timeout: 2000, }); } catch { console.log(`[BrowserExtractor] No main content selector found, proceeding anyway`); @@ -239,7 +247,6 @@ export class EnhancedContentExtractor { await context.close(); return content; - } catch (error) { console.error(`[BrowserExtractor] Browser extraction failed for ${url}:`, error); throw error; @@ -249,14 +256,11 @@ export class EnhancedContentExtractor { private async simulateHumanBehavior(page: Page): Promise { try { // Random mouse movements - await page.mouse.move( - Math.random() * 800, - Math.random() * 600 - ); + await page.mouse.move(Math.random() * 800, Math.random() * 600); // Random scroll 
(common human behavior) const scrollY = Math.random() * 500; - await page.evaluate((y) => window.scrollTo(0, y), scrollY); + await page.evaluate(y => window.scrollTo(0, y), scrollY); // Small random delay await page.waitForTimeout(500 + Math.random() * 1000); @@ -282,19 +286,19 @@ export class EnhancedContentExtractor { error.response?.status === 403, error.response?.status === 429, error.response?.status === 503, - + // Error messages suggesting JS requirement error.message?.includes('timeout'), error.message?.includes('Access denied'), error.message?.includes('Forbidden'), error.message?.includes('Low quality content detected'), - + // Response content suggesting bot detection error.response?.data?.includes('Please enable JavaScript'), error.response?.data?.includes('captcha'), error.response?.data?.includes('unusual traffic'), error.response?.data?.includes('robot'), - + // Sites known to be JS-heavy url.includes('twitter.com'), url.includes('facebook.com'), @@ -325,31 +329,34 @@ export class EnhancedContentExtractor { private getRandomHeaders(): Record { const browsers = [ { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', + 'User-Agent': + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', 'sec-ch-ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"', 'sec-ch-ua-platform': '"Windows"', }, { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', + 'User-Agent': + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', 'sec-ch-ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"', 'sec-ch-ua-platform': '"macOS"', }, { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', + 'User-Agent': + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', 'sec-ch-ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"', 'sec-ch-ua-platform': '"Linux"', - } + }, ]; const browser = browsers[Math.floor(Math.random() * browsers.length)]; - + return { ...browser, - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8', + Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.9', 'Accept-Encoding': 'gzip, deflate, br', - 'DNT': '1', - 'Connection': 'keep-alive', + DNT: '1', + Connection: 'keep-alive', 'Upgrade-Insecure-Requests': '1', 'Sec-Fetch-Dest': 'document', 'Sec-Fetch-Mode': 'navigate', @@ -368,7 +375,7 @@ export class EnhancedContentExtractor { 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:122.0) Gecko/20100101 Firefox/122.0', ]; - + return userAgents[Math.floor(Math.random() * userAgents.length)]; } @@ -380,7 +387,7 @@ export class EnhancedContentExtractor { { width: 1536, height: 864 }, { width: 1280, height: 720 }, ]; - + return viewports[Math.floor(Math.random() * viewports.length)]; } @@ -393,35 +400,40 @@ export class EnhancedContentExtractor { 'Europe/Berlin', 'Asia/Tokyo', ]; - + return timezones[Math.floor(Math.random() * timezones.length)]; } - async extractContentForResults(results: SearchResult[], 
targetCount: number = results.length): Promise { - console.log(`[EnhancedContentExtractor] Processing up to ${results.length} results to get ${targetCount} non-PDF results`); - + async extractContentForResults( + results: SearchResult[], + targetCount: number = results.length + ): Promise { + console.log( + `[EnhancedContentExtractor] Processing up to ${results.length} results to get ${targetCount} non-PDF results` + ); + // Filter out PDF files first const nonPdfResults = results.filter(result => !isPdfUrl(result.url)); const resultsToProcess = nonPdfResults.slice(0, Math.min(targetCount * 2, 10)); // Process extra to account for failures - + console.log(`[EnhancedContentExtractor] Processing ${resultsToProcess.length} non-PDF results concurrently`); - + // Process results concurrently with timeout const extractionPromises = resultsToProcess.map(async (result): Promise => { try { // Use a race condition with timeout to prevent hanging - const extractionPromise = this.extractContent({ - url: result.url, - timeout: 6000 // Reduced timeout to 6 seconds per page + const extractionPromise = this.extractContent({ + url: result.url, + timeout: 6000, // Reduced timeout to 6 seconds per page }); - + const timeoutPromise = new Promise((_, reject) => { setTimeout(() => reject(new Error('Content extraction timeout')), 8000); }); - + const content = await Promise.race([extractionPromise, timeoutPromise]); const cleanedContent = cleanText(content, this.maxContentLength); - + console.log(`[EnhancedContentExtractor] Successfully extracted: ${result.url}`); return { ...result, @@ -432,7 +444,9 @@ export class EnhancedContentExtractor { fetchStatus: 'success' as const, }; } catch (error) { - console.log(`[EnhancedContentExtractor] Failed to extract: ${result.url} - ${error instanceof Error ? error.message : 'Unknown error'}`); + console.log( + `[EnhancedContentExtractor] Failed to extract: ${result.url} - ${error instanceof Error ? 
error.message : 'Unknown error'}` + ); return { ...result, fullContent: '', @@ -444,55 +458,67 @@ export class EnhancedContentExtractor { }; } }); - + // Wait for all extractions to complete const allResults = await Promise.all(extractionPromises); - + // Return successful results first, up to targetCount const successfulResults = allResults.filter(r => r.fetchStatus === 'success'); const failedResults = allResults.filter(r => r.fetchStatus === 'error'); - + // Combine successful and failed results, prioritizing successful ones const enhancedResults = [ ...successfulResults.slice(0, targetCount), - ...failedResults.slice(0, Math.max(0, targetCount - successfulResults.length)) + ...failedResults.slice(0, Math.max(0, targetCount - successfulResults.length)), ].slice(0, targetCount); - - console.log(`[EnhancedContentExtractor] Completed processing ${resultsToProcess.length} results, extracted ${successfulResults.length} successful/${failedResults.length} failed`); + + console.log( + `[EnhancedContentExtractor] Completed processing ${resultsToProcess.length} results, extracted ${successfulResults.length} successful/${failedResults.length} failed` + ); return enhancedResults; } private parseContent(html: string): string { const $ = cheerio.load(html); - + // Remove all script, style, and other non-content elements - $('script, style, noscript, iframe, img, video, audio, canvas, svg, object, embed, applet, form, input, textarea, select, button, label, fieldset, legend, optgroup, option').remove(); - + $( + 'script, style, noscript, iframe, img, video, audio, canvas, svg, object, embed, applet, form, input, textarea, select, button, label, fieldset, legend, optgroup, option' + ).remove(); + // Remove navigation, header, footer, and other non-content elements - $('nav, header, footer, .nav, .header, .footer, .sidebar, .menu, .breadcrumb, aside, .ad, .advertisement, .ads, .advertisement-container, .social-share, .share-buttons, .comments, .comment-section, .related-posts, .recommendations, .newsletter-signup, .cookie-notice, .privacy-notice, .terms-notice, .disclaimer, .legal, .copyright, .meta, .metadata, .author-info, .publish-date, .tags, .categories, .navigation, .pagination, .search-box, .search-form, .login-form, .signup-form, .newsletter, .popup, .modal, .overlay, .tooltip, .toolbar, .ribbon, .banner, .promo, .sponsored, .affiliate, .tracking, .analytics, .pixel, .beacon').remove(); - + $( + 'nav, header, footer, .nav, .header, .footer, .sidebar, .menu, .breadcrumb, aside, .ad, .advertisement, .ads, .advertisement-container, .social-share, .share-buttons, .comments, .comment-section, .related-posts, .recommendations, .newsletter-signup, .cookie-notice, .privacy-notice, .terms-notice, .disclaimer, .legal, .copyright, .meta, .metadata, .author-info, .publish-date, .tags, .categories, .navigation, .pagination, .search-box, .search-form, .login-form, .signup-form, .newsletter, .popup, .modal, .overlay, .tooltip, .toolbar, .ribbon, .banner, .promo, .sponsored, .affiliate, .tracking, .analytics, .pixel, .beacon' + ).remove(); + // Remove elements with common ad/tracking classes - $('[class*="ad"], [class*="ads"], [class*="advertisement"], [class*="tracking"], [class*="analytics"], [class*="pixel"], [class*="beacon"], [class*="sponsored"], [class*="affiliate"], [class*="promo"], [class*="banner"], [class*="popup"], [class*="modal"], [class*="overlay"], [class*="tooltip"], [class*="toolbar"], [class*="ribbon"]').remove(); - + $( + '[class*="ad"], [class*="ads"], [class*="advertisement"], 
[class*="tracking"], [class*="analytics"], [class*="pixel"], [class*="beacon"], [class*="sponsored"], [class*="affiliate"], [class*="promo"], [class*="banner"], [class*="popup"], [class*="modal"], [class*="overlay"], [class*="tooltip"], [class*="toolbar"], [class*="ribbon"]' + ).remove(); + // Remove elements with common non-content IDs - $('[id*="ad"], [id*="ads"], [id*="advertisement"], [id*="tracking"], [id*="analytics"], [id*="pixel"], [id*="beacon"], [id*="sponsored"], [id*="affiliate"], [id*="promo"], [id*="banner"], [id*="popup"], [id*="modal"], [id*="overlay"], [id*="tooltip"], [id*="toolbar"], [id*="ribbon"], [id*="sidebar"], [id*="navigation"], [id*="menu"], [id*="footer"], [id*="header"]').remove(); - + $( + '[id*="ad"], [id*="ads"], [id*="advertisement"], [id*="tracking"], [id*="analytics"], [id*="pixel"], [id*="beacon"], [id*="sponsored"], [id*="affiliate"], [id*="promo"], [id*="banner"], [id*="popup"], [id*="modal"], [id*="overlay"], [id*="tooltip"], [id*="toolbar"], [id*="ribbon"], [id*="sidebar"], [id*="navigation"], [id*="menu"], [id*="footer"], [id*="header"]' + ).remove(); + // Remove image-related elements and attributes - $('picture, source, figure, figcaption, .image, .img, .photo, .picture, .media, .gallery, .slideshow, .carousel').remove(); + $( + 'picture, source, figure, figcaption, .image, .img, .photo, .picture, .media, .gallery, .slideshow, .carousel' + ).remove(); $('[data-src*="image"], [data-src*="img"], [data-src*="photo"], [data-src*="picture"]').remove(); $('[style*="background-image"]').remove(); - + // Remove empty elements and whitespace-only elements - $('*').each(function() { + $('*').each(function () { const $this = $(this); if ($this.children().length === 0 && $this.text().trim() === '') { $this.remove(); } }); - + // Try to find the main content area first let mainContent = ''; - + // Priority selectors for main content const contentSelectors = [ 'article', @@ -510,54 +536,57 @@ export class EnhancedContentExtractor { '.body-content', '.copy', '.text', - '.body' + '.body', ]; - + for (const selector of contentSelectors) { const $content = $(selector).first(); if ($content.length > 0) { mainContent = $content.text().trim(); - if (mainContent.length > 100) { // Ensure we have substantial content - console.log(`[EnhancedContentExtractor] Found content with selector: ${selector} (${mainContent.length} chars)`); + if (mainContent.length > 100) { + // Ensure we have substantial content + console.log( + `[EnhancedContentExtractor] Found content with selector: ${selector} (${mainContent.length} chars)` + ); break; } } } - + // If no main content found, try body content if (!mainContent || mainContent.length < 100) { console.log(`[EnhancedContentExtractor] No main content found, using body content`); mainContent = $('body').text().trim(); } - + // Clean up the text const cleanedContent = this.cleanTextContent(mainContent); - + return cleanText(cleanedContent, this.maxContentLength); } - + private cleanTextContent(text: string): string { // Remove excessive whitespace text = text.replace(/\s+/g, ' '); - + // Remove image-related text and data URLs text = text.replace(/data:image\/[^;]+;base64,[A-Za-z0-9+/=]+/g, ''); // Remove base64 image data text = text.replace(/https?:\/\/[^\s]+\.(jpg|jpeg|png|gif|webp|svg|ico|bmp|tiff)(\?[^\s]*)?/gi, ''); // Remove image URLs text = text.replace(/\.(jpg|jpeg|png|gif|webp|svg|ico|bmp|tiff)/gi, ''); // Remove image file extensions text = text.replace(/image|img|photo|picture|gallery|slideshow|carousel/gi, ''); // Remove 
image-related words text = text.replace(/click to enlarge|click for full size|view larger|download image/gi, ''); // Remove image action text - + // Remove common non-content patterns text = text.replace(/cookie|privacy|terms|conditions|disclaimer|legal|copyright|all rights reserved/gi, ''); - + // Remove excessive line breaks and spacing text = text.replace(/\n\s*\n/g, '\n'); text = text.replace(/\r\n/g, '\n'); text = text.replace(/\r/g, '\n'); - + // Remove leading/trailing whitespace text = text.trim(); - + return text; } @@ -580,11 +609,11 @@ export class EnhancedContentExtractor { } return `Network error: ${error.message}`; } - + return error instanceof Error ? error.message : 'Unknown error'; } async closeAll(): Promise { await this.browserPool.closeAll(); } -} \ No newline at end of file +} diff --git a/src/index.ts b/src/index.ts index 89c1843..8f2fae6 100644 --- a/src/index.ts +++ b/src/index.ts @@ -34,26 +34,40 @@ class WebSearchMCPServer { 'Search the web and fetch complete page content from top results. This is the most comprehensive web search tool. It searches the web and then follows the resulting links to extract their full page content, providing the most detailed and complete information available. Use get-web-search-summaries for a lightweight alternative.', { query: z.string().describe('Search query to execute (recommended for comprehensive research)'), - limit: z.union([z.number(), z.string()]).transform((val) => { - const num = typeof val === 'string' ? parseInt(val, 10) : val; - if (isNaN(num) || num < 1 || num > 10) { - throw new Error('Invalid limit: must be a number between 1 and 10'); - } - return num; - }).default(5).describe('Number of results to return with full content (1-10)'), - includeContent: z.union([z.boolean(), z.string()]).transform((val) => { - if (typeof val === 'string') { - return val.toLowerCase() === 'true'; - } - return Boolean(val); - }).default(true).describe('Whether to fetch full page content (default: true)'), - maxContentLength: z.union([z.number(), z.string()]).transform((val) => { - const num = typeof val === 'string' ? parseInt(val, 10) : val; - if (isNaN(num) || num < 0) { - throw new Error('Invalid maxContentLength: must be a non-negative number'); - } - return num; - }).optional().describe('Maximum characters per result content (0 = no limit). Usually not needed - content length is automatically optimized.'), + limit: z + .union([z.number(), z.string()]) + .transform(val => { + const num = typeof val === 'string' ? parseInt(val, 10) : val; + if (isNaN(num) || num < 1 || num > 10) { + throw new Error('Invalid limit: must be a number between 1 and 10'); + } + return num; + }) + .default(5) + .describe('Number of results to return with full content (1-10)'), + includeContent: z + .union([z.boolean(), z.string()]) + .transform(val => { + if (typeof val === 'string') { + return val.toLowerCase() === 'true'; + } + return Boolean(val); + }) + .default(true) + .describe('Whether to fetch full page content (default: true)'), + maxContentLength: z + .union([z.number(), z.string()]) + .transform(val => { + const num = typeof val === 'string' ? parseInt(val, 10) : val; + if (isNaN(num) || num < 0) { + throw new Error('Invalid maxContentLength: must be a non-negative number'); + } + return num; + }) + .optional() + .describe( + 'Maximum characters per result content (0 = no limit). Usually not needed - content length is automatically optimized.' 
+ ), }, async (args: unknown) => { console.log(`[MCP] Tool call received: full-web-search`); @@ -62,56 +76,60 @@ class WebSearchMCPServer { try { // Convert and validate arguments const validatedArgs = this.validateAndConvertArgs(args); - + // Auto-detect model types based on parameter formats // Llama models often send string parameters and struggle with large responses - const isLikelyLlama = typeof args === 'object' && args !== null && ( - ('limit' in args && typeof (args as Record).limit === 'string') || - ('includeContent' in args && typeof (args as Record).includeContent === 'string') - ); - + const isLikelyLlama = + typeof args === 'object' && + args !== null && + (('limit' in args && typeof (args as Record).limit === 'string') || + ('includeContent' in args && typeof (args as Record).includeContent === 'string')); + // Detect models that handle large responses well (Qwen, Gemma, recent Deepseek) - const isLikelyRobustModel = typeof args === 'object' && args !== null && ( - ('limit' in args && typeof (args as Record).limit === 'number') && - ('includeContent' in args && typeof (args as Record).includeContent === 'boolean') - ); - + const isLikelyRobustModel = + typeof args === 'object' && + args !== null && + 'limit' in args && + typeof (args as Record).limit === 'number' && + 'includeContent' in args && + typeof (args as Record).includeContent === 'boolean'; + // Only apply auto-limit if maxContentLength is not explicitly set (including 0) const hasExplicitMaxLength = typeof args === 'object' && args !== null && 'maxContentLength' in args; - + if (!hasExplicitMaxLength && isLikelyLlama) { console.log(`[MCP] Detected potential Llama model (string parameters), applying content length limit`); validatedArgs.maxContentLength = 2000; // Reasonable limit for Llama } - + // For robust models (Qwen, Gemma, recent Deepseek), remove maxContentLength if it's set to a low value if (isLikelyRobustModel && validatedArgs.maxContentLength && validatedArgs.maxContentLength < 5000) { console.log(`[MCP] Detected robust model (numeric parameters), removing unnecessary content length limit`); validatedArgs.maxContentLength = undefined; } - + console.log(`[MCP] Validated args:`, JSON.stringify(validatedArgs, null, 2)); - + console.log(`[MCP] Starting web search...`); const result = await this.handleWebSearch(validatedArgs); - + console.log(`[MCP] Search completed, found ${result.results.length} results`); - + // Format the results as a comprehensive text response let responseText = `Search completed for "${result.query}" with ${result.total_results} results:\n\n`; - + // Add status line if available if (result.status) { responseText += `**Status:** ${result.status}\n\n`; } - + const maxLength = validatedArgs.maxContentLength; - + result.results.forEach((searchResult, idx) => { responseText += `**${idx + 1}. 
${searchResult.title}**\n`; responseText += `URL: ${searchResult.url}\n`; responseText += `Description: ${searchResult.description}\n`; - + if (searchResult.fullContent && searchResult.fullContent.trim()) { let content = searchResult.fullContent; if (maxLength && maxLength > 0 && content.length > maxLength) { @@ -127,10 +145,10 @@ class WebSearchMCPServer { } else if (searchResult.fetchStatus === 'error') { responseText += `\n**Content Extraction Failed:** ${searchResult.error}\n`; } - + responseText += `\n---\n\n`; }); - + return { content: [ { @@ -152,13 +170,17 @@ class WebSearchMCPServer { 'Search the web and return only the search result snippets/descriptions without following links to extract full page content. This is a lightweight alternative to full-web-search for when you only need brief search results. For comprehensive information, use full-web-search instead.', { query: z.string().describe('Search query to execute (lightweight alternative)'), - limit: z.union([z.number(), z.string()]).transform((val) => { - const num = typeof val === 'string' ? parseInt(val, 10) : val; - if (isNaN(num) || num < 1 || num > 10) { - throw new Error('Invalid limit: must be a number between 1 and 10'); - } - return num; - }).default(5).describe('Number of search results to return (1-10)'), + limit: z + .union([z.number(), z.string()]) + .transform(val => { + const num = typeof val === 'string' ? parseInt(val, 10) : val; + if (isNaN(num) || num < 1 || num > 10) { + throw new Error('Invalid limit: must be a number between 1 and 10'); + } + return num; + }) + .default(5) + .describe('Number of search results to return (1-10)'), }, async (args: unknown) => { console.log(`[MCP] Tool call received: get-web-search-summaries`); @@ -170,7 +192,7 @@ class WebSearchMCPServer { throw new Error('Invalid arguments: args must be an object'); } const obj = args as Record; - + if (!obj.query || typeof obj.query !== 'string') { throw new Error('Invalid arguments: query is required and must be a string'); } @@ -185,7 +207,7 @@ class WebSearchMCPServer { } console.log(`[MCP] Starting web search summaries...`); - + try { // Use existing search engine to get results with snippets const searchResponse = await this.searchEngine.search({ @@ -204,10 +226,10 @@ class WebSearchMCPServer { })); console.log(`[MCP] Search summaries completed, found ${summaryResults.length} results`); - + // Format the results as text let responseText = `Search summaries for "${obj.query}" with ${summaryResults.length} results:\n\n`; - + summaryResults.forEach((summary, i) => { responseText += `**${i + 1}. ${summary.title}**\n`; responseText += `URL: ${summary.url}\n`; @@ -245,13 +267,19 @@ class WebSearchMCPServer { 'Extract and return the full content from a single web page URL. This tool follows a provided URL and extracts the main page content. Useful for getting detailed content from a specific webpage without performing a search.', { url: z.string().url().describe('The URL of the web page to extract content from'), - maxContentLength: z.union([z.number(), z.string()]).transform((val) => { - const num = typeof val === 'string' ? parseInt(val, 10) : val; - if (isNaN(num) || num < 0) { - throw new Error('Invalid maxContentLength: must be a non-negative number'); - } - return num; - }).optional().describe('Maximum characters for the extracted content (0 = no limit, undefined = use default limit). 
Usually not needed - content length is automatically optimized.'), + maxContentLength: z + .union([z.number(), z.string()]) + .transform(val => { + const num = typeof val === 'string' ? parseInt(val, 10) : val; + if (isNaN(num) || num < 0) { + throw new Error('Invalid maxContentLength: must be a non-negative number'); + } + return num; + }) + .optional() + .describe( + 'Maximum characters for the extracted content (0 = no limit, undefined = use default limit). Usually not needed - content length is automatically optimized.' + ), }, async (args: unknown) => { console.log(`[MCP] Tool call received: get-single-web-page-content`); @@ -263,14 +291,15 @@ class WebSearchMCPServer { throw new Error('Invalid arguments: args must be an object'); } const obj = args as Record; - + if (!obj.url || typeof obj.url !== 'string') { throw new Error('Invalid arguments: url is required and must be a string'); } let maxContentLength: number | undefined; if (obj.maxContentLength !== undefined) { - const maxLengthValue = typeof obj.maxContentLength === 'string' ? parseInt(obj.maxContentLength, 10) : obj.maxContentLength; + const maxLengthValue = + typeof obj.maxContentLength === 'string' ? parseInt(obj.maxContentLength, 10) : obj.maxContentLength; if (typeof maxLengthValue !== 'number' || isNaN(maxLengthValue) || maxLengthValue < 0) { throw new Error('Invalid maxContentLength: must be a non-negative number'); } @@ -279,7 +308,7 @@ class WebSearchMCPServer { } console.log(`[MCP] Starting single page content extraction for: ${obj.url}`); - + // Use existing content extractor to get page content const content = await this.contentExtractor.extractContent({ url: obj.url, @@ -301,7 +330,7 @@ class WebSearchMCPServer { responseText += `**Title:** ${title}\n`; responseText += `**Word Count:** ${wordCount}\n`; responseText += `**Content Length:** ${content.length} characters\n\n`; - + if (maxContentLength && maxContentLength > 0 && content.length > maxContentLength) { responseText += `**Content (truncated at ${maxContentLength} characters):**\n${content.substring(0, maxContentLength)}\n\n[Content truncated at ${maxContentLength} characters]`; } else { @@ -364,46 +393,54 @@ class WebSearchMCPServer { private async handleWebSearch(input: WebSearchToolInput): Promise { const startTime = Date.now(); const { query, limit = 5, includeContent = true } = input; - - console.error(`[web-search-mcp] DEBUG: handleWebSearch called with limit=${limit}, includeContent=${includeContent}`); + + console.error( + `[web-search-mcp] DEBUG: handleWebSearch called with limit=${limit}, includeContent=${includeContent}` + ); try { // Request extra search results to account for potential PDF files that will be skipped // Request up to 2x the limit or at least 5 extra results, capped at 10 (Google's max) const searchLimit = includeContent ? 
Math.min(limit * 2 + 2, 10) : limit; - - console.log(`[web-search-mcp] DEBUG: Requesting ${searchLimit} search results to get ${limit} non-PDF content results`); - + + console.log( + `[web-search-mcp] DEBUG: Requesting ${searchLimit} search results to get ${limit} non-PDF content results` + ); + // Perform the search const searchResponse = await this.searchEngine.search({ query, numResults: searchLimit, }); const searchResults = searchResponse.results; - + // Log search summary const pdfCount = searchResults.filter(result => isPdfUrl(result.url)).length; const followedCount = searchResults.length - pdfCount; - console.error(`[web-search-mcp] DEBUG: Search engine: ${searchResponse.engine}; ${limit} requested/${searchResults.length} obtained; PDF: ${pdfCount}; ${followedCount} followed.`); + console.error( + `[web-search-mcp] DEBUG: Search engine: ${searchResponse.engine}; ${limit} requested/${searchResults.length} obtained; PDF: ${pdfCount}; ${followedCount} followed.` + ); // Extract content from each result if requested, with target count - const enhancedResults = includeContent + const enhancedResults = includeContent ? await this.contentExtractor.extractContentForResults(searchResults, limit) : searchResults.slice(0, limit); // If not extracting content, just take the first 'limit' results - + // Log extraction summary with failure reasons and generate combined status let combinedStatus = `Search engine: ${searchResponse.engine}; ${limit} result requested/${searchResults.length} obtained; PDF: ${pdfCount}; ${followedCount} followed`; - + if (includeContent) { const successCount = enhancedResults.filter(r => r.fetchStatus === 'success').length; const failedResults = enhancedResults.filter(r => r.fetchStatus === 'error'); const failedCount = failedResults.length; - + const failureReasons = this.categorizeFailureReasons(failedResults); const failureReasonText = failureReasons.length > 0 ? ` (${failureReasons.join(', ')})` : ''; - - console.error(`[web-search-mcp] DEBUG: Links requested: ${limit}; Successfully extracted: ${successCount}; Failed: ${failedCount}${failureReasonText}; Results: ${enhancedResults.length}.`); - + + console.error( + `[web-search-mcp] DEBUG: Links requested: ${limit}; Successfully extracted: ${successCount}; Failed: ${failedCount}${failureReasonText}; Results: ${enhancedResults.length}.` + ); + // Add extraction info to combined status combinedStatus += `; Successfully extracted: ${successCount}; Failed: ${failedCount}; Results: ${enhancedResults.length}`; } @@ -425,22 +462,20 @@ class WebSearchMCPServer { private categorizeFailureReasons(failedResults: SearchResult[]): string[] { const reasonCounts = new Map(); - + failedResults.forEach(result => { if (result.error) { const category = this.categorizeError(result.error); reasonCounts.set(category, (reasonCounts.get(category) || 0) + 1); } }); - - return Array.from(reasonCounts.entries()).map(([reason, count]) => - count > 1 ? `${reason} (${count})` : reason - ); + + return Array.from(reasonCounts.entries()).map(([reason, count]) => (count > 1 ? 
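In the over-fetch sizing above, the code requests `limit * 2 + 2` results, capped at 10, whenever full content extraction is on, so that skipped PDFs do not starve the final output. A quick worked example (illustrative only):

```typescript
// searchLimit for a few requested limits when includeContent is true.
const sizes = [1, 3, 5, 10].map(limit => Math.min(limit * 2 + 2, 10));
console.log(sizes); // [ 4, 8, 10, 10 ]
```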
`${reason} (${count})` : reason)); } private categorizeError(errorMessage: string): string { const lowerError = errorMessage.toLowerCase(); - + if (lowerError.includes('timeout') || lowerError.includes('timed out')) { return 'Timeout'; } @@ -453,7 +488,11 @@ class WebSearchMCPServer { if (lowerError.includes('bot') || lowerError.includes('captcha') || lowerError.includes('unusual traffic')) { return 'Bot detection'; } - if (lowerError.includes('too large') || lowerError.includes('content length') || lowerError.includes('maxcontentlength')) { + if ( + lowerError.includes('too large') || + lowerError.includes('content length') || + lowerError.includes('maxcontentlength') + ) { return 'Content too long'; } if (lowerError.includes('ssl') || lowerError.includes('certificate') || lowerError.includes('tls')) { @@ -465,7 +504,7 @@ class WebSearchMCPServer { if (lowerError.includes('dns') || lowerError.includes('hostname')) { return 'DNS error'; } - + return 'Other error'; } @@ -477,7 +516,7 @@ class WebSearchMCPServer { }); // Handle uncaught exceptions - process.on('uncaughtException', (error) => { + process.on('uncaughtException', error => { console.error('Uncaught Exception:', error); // Don't exit on uncaught exceptions in MCP context }); @@ -486,10 +525,7 @@ class WebSearchMCPServer { process.on('SIGINT', async () => { console.log('Shutting down gracefully...'); try { - await Promise.all([ - this.contentExtractor.closeAll(), - this.searchEngine.closeAll() - ]); + await Promise.all([this.contentExtractor.closeAll(), this.searchEngine.closeAll()]); } catch (error) { console.error('Error during graceful shutdown:', error); } @@ -499,10 +535,7 @@ class WebSearchMCPServer { process.on('SIGTERM', async () => { console.log('Shutting down gracefully...'); try { - await Promise.all([ - this.contentExtractor.closeAll(), - this.searchEngine.closeAll() - ]); + await Promise.all([this.contentExtractor.closeAll(), this.searchEngine.closeAll()]); } catch (error) { console.error('Error during graceful shutdown:', error); } @@ -513,7 +546,7 @@ class WebSearchMCPServer { async run(): Promise { console.log('Setting up MCP server...'); const transport = new StdioServerTransport(); - + console.log('Connecting to transport...'); await this.server.connect(transport); console.log('Web Search MCP Server started'); diff --git a/src/rate-limiter.ts b/src/rate-limiter.ts index 12f0185..7ffe06a 100644 --- a/src/rate-limiter.ts +++ b/src/rate-limiter.ts @@ -35,11 +35,15 @@ export class RateLimiter { return result; } - getStatus(): { requestCount: number; maxRequests: number; resetTime: number } { + getStatus(): { + requestCount: number; + maxRequests: number; + resetTime: number; + } { return { requestCount: this.requestCount, maxRequests: this.maxRequestsPerMinute, resetTime: this.lastResetTime + this.resetIntervalMs, }; } -} \ No newline at end of file +} diff --git a/src/search-engine.ts b/src/search-engine.ts index 7c8ea3b..eff8acd 100644 --- a/src/search-engine.ts +++ b/src/search-engine.ts @@ -17,87 +17,104 @@ export class SearchEngine { async search(options: SearchOptions): Promise { const { query, numResults = 5, timeout = 10000 } = options; const sanitizedQuery = sanitizeQuery(query); - + console.log(`[SearchEngine] Starting search for query: "${sanitizedQuery}"`); - + try { return await this.rateLimiter.execute(async () => { console.log(`[SearchEngine] Starting search with multiple engines...`); - + // Configuration from environment variables const enableQualityCheck = process.env.ENABLE_RELEVANCE_CHECKING !== 
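To make the combined status string above easier to review, here is a condensed, self-contained restatement of how failure reasons are bucketed and counted. It is not the actual class methods, and the sample error messages are invented for the example:

```typescript
// Sketch of categorizeError + categorizeFailureReasons (abridged to the branches shown above).
function bucket(message: string): string {
  const m = message.toLowerCase();
  if (m.includes('timeout') || m.includes('timed out')) return 'Timeout';
  if (m.includes('bot') || m.includes('captcha') || m.includes('unusual traffic')) return 'Bot detection';
  return 'Other error';
}

const errors = [
  'Navigation timeout of 6000 ms exceeded',
  'Request timed out while fetching page',
  'Blocked by captcha challenge',
];

const counts = new Map<string, number>();
for (const e of errors) counts.set(bucket(e), (counts.get(bucket(e)) ?? 0) + 1);

const reasons = Array.from(counts.entries()).map(([reason, count]) => (count > 1 ? `${reason} (${count})` : reason));
console.log(reasons.join(', ')); // "Timeout (2), Bot detection" -> status ends with "Failed: 3 (Timeout (2), Bot detection)"
```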
'false'; const qualityThreshold = parseFloat(process.env.RELEVANCE_THRESHOLD || '0.3'); const forceMultiEngine = process.env.FORCE_MULTI_ENGINE_SEARCH === 'true'; const debugBrowsers = process.env.DEBUG_BROWSER_LIFECYCLE === 'true'; - - console.log(`[SearchEngine] Quality checking: ${enableQualityCheck}, threshold: ${qualityThreshold}, multi-engine: ${forceMultiEngine}, debug: ${debugBrowsers}`); + + console.log( + `[SearchEngine] Quality checking: ${enableQualityCheck}, threshold: ${qualityThreshold}, multi-engine: ${forceMultiEngine}, debug: ${debugBrowsers}` + ); // Try multiple approaches to get search results, starting with most reliable const approaches = [ - { method: this.tryBrowserBingSearch.bind(this), name: 'Browser Bing' }, - { method: this.tryBrowserBraveSearch.bind(this), name: 'Browser Brave' }, - { method: this.tryDuckDuckGoSearch.bind(this), name: 'Axios DuckDuckGo' } + { + method: this.tryBrowserBingSearch.bind(this), + name: 'Browser Bing', + }, + { + method: this.tryBrowserBraveSearch.bind(this), + name: 'Browser Brave', + }, + { + method: this.tryDuckDuckGoSearch.bind(this), + name: 'Axios DuckDuckGo', + }, ]; - + let bestResults: SearchResult[] = []; let bestEngine = 'None'; let bestQuality = 0; - + for (let i = 0; i < approaches.length; i++) { const approach = approaches[i]; try { console.log(`[SearchEngine] Attempting ${approach.name} (${i + 1}/${approaches.length})...`); - + // Use more aggressive timeouts for faster fallback const approachTimeout = Math.min(timeout / 3, 4000); // Max 4 seconds per approach for faster fallback const results = await approach.method(sanitizedQuery, numResults, approachTimeout); if (results.length > 0) { console.log(`[SearchEngine] Found ${results.length} results with ${approach.name}`); - + // Validate result quality to detect irrelevant results const qualityScore = enableQualityCheck ? 
this.assessResultQuality(results, sanitizedQuery) : 1.0; console.log(`[SearchEngine] ${approach.name} quality score: ${qualityScore.toFixed(2)}/1.0`); - + // Track the best results so far if (qualityScore > bestQuality) { bestResults = results; bestEngine = approach.name; bestQuality = qualityScore; } - + // If quality is excellent, return immediately (unless forcing multi-engine) if (qualityScore >= 0.8 && !forceMultiEngine) { console.log(`[SearchEngine] Excellent quality results from ${approach.name}, returning immediately`); return { results, engine: approach.name }; } - + // If quality is acceptable and this isn't Bing (first engine), return if (qualityScore >= qualityThreshold && approach.name !== 'Browser Bing' && !forceMultiEngine) { console.log(`[SearchEngine] Good quality results from ${approach.name}, using as primary`); return { results, engine: approach.name }; } - + // If this is the last engine or quality is acceptable, prepare to return if (i === approaches.length - 1) { if (bestQuality >= qualityThreshold || !enableQualityCheck) { - console.log(`[SearchEngine] Using best results from ${bestEngine} (quality: ${bestQuality.toFixed(2)})`); + console.log( + `[SearchEngine] Using best results from ${bestEngine} (quality: ${bestQuality.toFixed(2)})` + ); return { results: bestResults, engine: bestEngine }; } else if (bestResults.length > 0) { - console.log(`[SearchEngine] Warning: Low quality results from all engines, using best available from ${bestEngine}`); + console.log( + `[SearchEngine] Warning: Low quality results from all engines, using best available from ${bestEngine}` + ); return { results: bestResults, engine: bestEngine }; } } else { - console.log(`[SearchEngine] ${approach.name} results quality: ${qualityScore.toFixed(2)}, continuing to try other engines...`); + console.log( + `[SearchEngine] ${approach.name} results quality: ${qualityScore.toFixed(2)}, continuing to try other engines...` + ); } } } catch (error) { console.error(`[SearchEngine] ${approach.name} approach failed:`, error); - + // Handle browser-specific errors (no cleanup needed since each engine uses dedicated browsers) await this.handleBrowserError(error, approach.name); } } - + console.log(`[SearchEngine] All approaches failed, returning empty results`); return { results: [], engine: 'None' }; }); } catch (error) { @@ -114,12 +131,9 @@ export class SearchEngine { } } - - - private async tryBrowserBraveSearch(query: string, numResults: number, timeout: number): Promise<SearchResult[]> { console.log(`[SearchEngine] Trying browser-based Brave search with dedicated browser...`); - + // Try with retry mechanism for (let attempt = 1; attempt <= 2; attempt++) { let browser; @@ -128,12 +142,9 @@ export class SearchEngine { const { firefox } = await import('playwright'); browser = await firefox.launch({ headless: process.env.BROWSER_HEADLESS !== 'false', - args: [ - '--no-sandbox', - '--disable-dev-shm-usage', - ], + args: ['--no-sandbox', '--disable-dev-shm-usage'], }); - + console.log(`[SearchEngine] Brave search attempt ${attempt}/2 with fresh browser`); const results = await this.tryBrowserBraveSearchInternal(browser, query, numResults, timeout); return results; @@ -155,19 +166,25 @@ export class SearchEngine { } } } - + throw new Error('All Brave search attempts failed'); } - private async tryBrowserBraveSearchInternal(browser: any, query: string, numResults: number, timeout: number): Promise<SearchResult[]> { + private async tryBrowserBraveSearchInternal( + browser: any, + query: string, + numResults: number, + timeout: number + ): Promise<SearchResult[]> { // 
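The early-return branches in the engine loop above are easy to lose track of; the following is a condensed restatement of the stop conditions, for review purposes only (the function name and parameter object are mine, not part of the implementation):

```typescript
// When does the multi-engine loop stop early instead of trying the next engine?
function shouldStopEarly(opts: {
  qualityScore: number; // assessResultQuality(...) for the current engine, 0..1
  qualityThreshold: number; // RELEVANCE_THRESHOLD, default 0.3
  engineName: string; // 'Browser Bing' | 'Browser Brave' | 'Axios DuckDuckGo'
  forceMultiEngine: boolean; // FORCE_MULTI_ENGINE_SEARCH
}): boolean {
  if (opts.forceMultiEngine) return false; // always sample every engine, keep the best
  if (opts.qualityScore >= 0.8) return true; // excellent results: return immediately
  // acceptable results from a fallback engine (anything but the primary Bing pass) are used as-is
  return opts.qualityScore >= opts.qualityThreshold && opts.engineName !== 'Browser Bing';
}
```

Otherwise the loop falls through to the last-engine handling, which returns the best results tracked so far.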
Validate browser is still functional before proceeding if (!browser.isConnected()) { throw new Error('Browser is not connected'); } - + try { const context = await browser.newContext({ - userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', + userAgent: + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', viewport: { width: 1366, height: 768 }, locale: 'en-US', timezoneId: 'America/New_York', @@ -175,14 +192,14 @@ export class SearchEngine { try { const page = await context.newPage(); - + // Navigate to Brave search const searchUrl = `https://search.brave.com/search?q=${encodeURIComponent(query)}&source=web`; console.log(`[SearchEngine] Browser navigating to Brave: ${searchUrl}`); - - await page.goto(searchUrl, { + + await page.goto(searchUrl, { waitUntil: 'domcontentloaded', - timeout: timeout + timeout: timeout, }); // Wait for search results to load @@ -194,12 +211,12 @@ export class SearchEngine { // Get the page content const html = await page.content(); - + console.log(`[SearchEngine] Browser Brave got HTML with length: ${html.length}`); - + const results = this.parseBraveResults(html, numResults); console.log(`[SearchEngine] Browser Brave parsed ${results.length} results`); - + await context.close(); return results; } catch (error) { @@ -216,16 +233,21 @@ export class SearchEngine { private async tryBrowserBingSearch(query: string, numResults: number, timeout: number): Promise { const debugBing = process.env.DEBUG_BING_SEARCH === 'true'; console.error(`[SearchEngine] BING: Starting browser-based search with dedicated browser for query: "${query}"`); - + // Try with retry mechanism for (let attempt = 1; attempt <= 2; attempt++) { let browser; try { console.error(`[SearchEngine] BING: Attempt ${attempt}/2 - Launching Chromium browser...`); - + // Create a dedicated browser instance for Bing search only const { chromium } = await import('playwright'); const startTime = Date.now(); + // Support launching Chrome and Edge as branded channels + let channel: string | undefined; + const browserType = process.env.BROWSER_TYPE || null; + if (browserType === 'chrome') channel = 'chrome'; + if (browserType === 'edge') channel = 'msedge'; browser = await chromium.launch({ headless: process.env.BROWSER_HEADLESS !== 'false', args: [ @@ -234,22 +256,25 @@ export class SearchEngine { '--disable-dev-shm-usage', '--disable-gpu', ], + ...(channel ? { channel } : {}), }); - + const launchTime = Date.now() - startTime; - console.error(`[SearchEngine] BING: Browser launched successfully in ${launchTime}ms, connected: ${browser.isConnected()}`); - + console.error( + `[SearchEngine] BING: Browser launched successfully in ${launchTime}ms, connected: ${browser.isConnected()}` + ); + const results = await this.tryBrowserBingSearchInternal(browser, query, numResults, timeout); console.error(`[SearchEngine] BING: Search completed successfully with ${results.length} results`); return results; } catch (error) { const errorMessage = error instanceof Error ? 
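The Bing path above now honours a `BROWSER_TYPE` environment variable to launch a branded Chrome or Edge channel instead of the bundled Chromium (the Brave path is unaffected, since it launches Firefox). A small usage sketch follows; the helper names are mine, and the branded browsers are assumed to be installed on the machine (for example via `npx playwright install chrome` or `npx playwright install msedge`):

```typescript
import { chromium, type Browser } from 'playwright';

// Maps the new BROWSER_TYPE variable onto a Playwright channel; unset keeps the bundled Chromium.
function resolveChannel(): 'chrome' | 'msedge' | undefined {
  if (process.env.BROWSER_TYPE === 'chrome') return 'chrome';
  if (process.env.BROWSER_TYPE === 'edge') return 'msedge';
  return undefined;
}

async function launchChromiumForSearch(): Promise<Browser> {
  const channel = resolveChannel();
  return chromium.launch({
    headless: process.env.BROWSER_HEADLESS !== 'false',
    // Only pass `channel` when a branded browser was requested.
    ...(channel ? { channel } : {}),
  });
}
```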
error.message : 'Unknown error'; console.error(`[SearchEngine] BING: Attempt ${attempt}/2 FAILED with error: ${errorMessage}`); - + if (debugBing) { console.error(`[SearchEngine] BING: Full error details:`, error); } - + if (attempt === 2) { console.error(`[SearchEngine] BING: All attempts exhausted, giving up`); throw error; // Re-throw on final attempt @@ -271,25 +296,31 @@ export class SearchEngine { } } } - + throw new Error('All Bing search attempts failed'); } - private async tryBrowserBingSearchInternal(browser: any, query: string, numResults: number, timeout: number): Promise { + private async tryBrowserBingSearchInternal( + browser: any, + query: string, + numResults: number, + timeout: number + ): Promise { const debugBing = process.env.DEBUG_BING_SEARCH === 'true'; - + // Validate browser is still functional before proceeding if (!browser.isConnected()) { console.error(`[SearchEngine] BING: Browser is not connected`); throw new Error('Browser is not connected'); } - + console.error(`[SearchEngine] BING: Creating browser context with enhanced fingerprinting...`); - + try { // Enhanced browser context with more realistic fingerprinting const context = await browser.newContext({ - userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', + userAgent: + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', viewport: { width: 1366, height: 768 }, locale: 'en-US', timezoneId: 'America/New_York', @@ -298,21 +329,21 @@ export class SearchEngine { hasTouch: false, isMobile: false, extraHTTPHeaders: { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', + Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.9', 'Accept-Encoding': 'gzip, deflate, br', - 'DNT': '1', + DNT: '1', 'Upgrade-Insecure-Requests': '1', 'Sec-Fetch-Dest': 'document', 'Sec-Fetch-Mode': 'navigate', - 'Sec-Fetch-Site': 'none' - } + 'Sec-Fetch-Site': 'none', + }, }); console.error(`[SearchEngine] BING: Context created, opening new page...`); const page = await context.newPage(); console.error(`[SearchEngine] BING: Page opened successfully`); - + try { // Try enhanced Bing search with proper web interface flow try { @@ -324,13 +355,13 @@ export class SearchEngine { } catch (enhancedError) { const errorMessage = enhancedError instanceof Error ? enhancedError.message : 'Unknown error'; console.error(`[SearchEngine] BING: Enhanced search failed: ${errorMessage}`); - + if (debugBing) { console.error(`[SearchEngine] BING: Enhanced search error details:`, enhancedError); } - + console.error(`[SearchEngine] BING: Falling back to direct URL search...`); - + // Fallback to direct URL approach with enhanced parameters const results = await this.tryDirectBingSearch(page, query, numResults, timeout); console.error(`[SearchEngine] BING: Direct search succeeded with ${results.length} results`); @@ -346,64 +377,73 @@ export class SearchEngine { } catch (error) { const errorMessage = error instanceof Error ? 
error.message : 'Unknown error'; console.error(`[SearchEngine] BING: Internal search failed: ${errorMessage}`); - + if (debugBing) { console.error(`[SearchEngine] BING: Internal search error details:`, error); } - + throw error; } } - private async tryEnhancedBingSearch(page: any, query: string, numResults: number, timeout: number): Promise { + private async tryEnhancedBingSearch( + page: any, + query: string, + numResults: number, + timeout: number + ): Promise { const debugBing = process.env.DEBUG_BING_SEARCH === 'true'; console.error(`[SearchEngine] BING: Enhanced search - navigating to Bing homepage...`); - + // Navigate to Bing homepage first to establish proper session const startTime = Date.now(); - await page.goto('https://www.bing.com', { + await page.goto('https://www.bing.com', { waitUntil: 'domcontentloaded', - timeout: timeout / 2 + timeout: timeout / 2, }); - + const loadTime = Date.now() - startTime; const pageTitle = await page.title(); const currentUrl = page.url(); console.error(`[SearchEngine] BING: Homepage loaded in ${loadTime}ms, title: "${pageTitle}", URL: ${currentUrl}`); - + // Wait a moment for page to fully load await page.waitForTimeout(500); - + // Find and use the search box (more realistic than direct URL) try { console.error(`[SearchEngine] BING: Looking for search form elements...`); await page.waitForSelector('#sb_form_q', { timeout: 2000 }); console.error(`[SearchEngine] BING: Search box found, filling with query: "${query}"`); await page.fill('#sb_form_q', query); - + console.error(`[SearchEngine] BING: Clicking search button and waiting for navigation...`); // Submit the search form await Promise.all([ - page.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: timeout }), - page.click('#search_icon') + page.waitForNavigation({ + waitUntil: 'domcontentloaded', + timeout: timeout, + }), + page.click('#search_icon'), ]); - + const searchLoadTime = Date.now() - startTime; const searchPageTitle = await page.title(); const searchPageUrl = page.url(); - console.error(`[SearchEngine] BING: Search completed in ${searchLoadTime}ms total, title: "${searchPageTitle}", URL: ${searchPageUrl}`); - + console.error( + `[SearchEngine] BING: Search completed in ${searchLoadTime}ms total, title: "${searchPageTitle}", URL: ${searchPageUrl}` + ); } catch (formError) { const errorMessage = formError instanceof Error ? 
formError.message : 'Unknown error'; console.error(`[SearchEngine] BING: Search form submission failed: ${errorMessage}`); - + if (debugBing) { console.error(`[SearchEngine] BING: Form error details:`, formError); } - + throw formError; } - + // Wait for search results to load try { console.error(`[SearchEngine] BING: Waiting for search results to appear...`); @@ -415,47 +455,54 @@ export class SearchEngine { const html = await page.content(); console.error(`[SearchEngine] BING: Got page HTML with length: ${html.length} characters`); - + if (debugBing && html.length < 10000) { console.error(`[SearchEngine] BING: WARNING - HTML seems short, possible bot detection or error page`); } - + const results = this.parseBingResults(html, numResults); console.error(`[SearchEngine] BING: Enhanced search parsed ${results.length} results`); - + if (results.length === 0) { console.error(`[SearchEngine] BING: WARNING - No results found, possible parsing failure or empty search`); - + if (debugBing) { const sampleHtml = html.substring(0, 1000); console.error(`[SearchEngine] BING: Sample HTML for debugging:`, sampleHtml); } } - + return results; } - private async tryDirectBingSearch(page: any, query: string, numResults: number, timeout: number): Promise { + private async tryDirectBingSearch( + page: any, + query: string, + numResults: number, + timeout: number + ): Promise { const debugBing = process.env.DEBUG_BING_SEARCH === 'true'; console.error(`[SearchEngine] BING: Direct search with enhanced parameters...`); - + // Generate a conversation ID (cvid) similar to what Bing uses const cvid = this.generateConversationId(); - + // Construct URL with enhanced parameters based on successful manual searches const searchUrl = `https://www.bing.com/search?q=${encodeURIComponent(query)}&count=${Math.min(numResults, 10)}&form=QBLH&sp=-1&qs=n&cvid=${cvid}`; console.error(`[SearchEngine] BING: Navigating to direct URL: ${searchUrl}`); - + const startTime = Date.now(); - await page.goto(searchUrl, { + await page.goto(searchUrl, { waitUntil: 'domcontentloaded', - timeout: timeout + timeout: timeout, }); - + const loadTime = Date.now() - startTime; const pageTitle = await page.title(); const currentUrl = page.url(); - console.error(`[SearchEngine] BING: Direct page loaded in ${loadTime}ms, title: "${pageTitle}", URL: ${currentUrl}`); + console.error( + `[SearchEngine] BING: Direct page loaded in ${loadTime}ms, title: "${pageTitle}", URL: ${currentUrl}` + ); // Wait for search results to load try { @@ -468,23 +515,23 @@ export class SearchEngine { const html = await page.content(); console.error(`[SearchEngine] BING: Got page HTML with length: ${html.length} characters`); - + if (debugBing && html.length < 10000) { console.error(`[SearchEngine] BING: WARNING - HTML seems short, possible bot detection or error page`); } - + const results = this.parseBingResults(html, numResults); console.error(`[SearchEngine] BING: Direct search parsed ${results.length} results`); - + if (results.length === 0) { console.error(`[SearchEngine] BING: WARNING - No results found, possible parsing failure or empty search`); - + if (debugBing) { const sampleHtml = html.substring(0, 1000); console.error(`[SearchEngine] BING: Sample HTML for debugging:`, sampleHtml); } } - + return results; } @@ -498,22 +545,22 @@ export class SearchEngine { return cvid; } - private async tryDuckDuckGoSearch(query: string, numResults: number, timeout: number): Promise { console.log(`[SearchEngine] Trying DuckDuckGo as fallback...`); - + try { const response = await 
axios.get('https://html.duckduckgo.com/html/', { params: { q: query, }, headers: { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'User-Agent': + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate', - 'DNT': '1', - 'Connection': 'keep-alive', + DNT: '1', + Connection: 'keep-alive', 'Upgrade-Insecure-Requests': '1', }, timeout, @@ -521,10 +568,10 @@ export class SearchEngine { }); console.log(`[SearchEngine] DuckDuckGo got response with status: ${response.status}`); - + const results = this.parseDuckDuckGoResults(response.data, numResults); console.log(`[SearchEngine] DuckDuckGo parsed ${results.length} results`); - + return results; } catch { console.error(`[SearchEngine] DuckDuckGo search failed`); @@ -534,7 +581,7 @@ export class SearchEngine { private parseSearchResults(html: string, maxResults: number): SearchResult[] { console.log(`[SearchEngine] Parsing HTML with length: ${html.length}`); - + const $ = cheerio.load(html); const results: SearchResult[] = []; const timestamp = generateTimestamp(); @@ -547,7 +594,7 @@ export class SearchEngine { const vedElements = $('[data-ved]'); const h3Elements = $('h3'); const linkElements = $('a[href]'); - + console.log(`[SearchEngine] Found elements:`); console.log(` - div.g: ${gElements.length}`); console.log(` - div[data-sokoban-container]: ${sokobanElements.length}`); @@ -556,7 +603,7 @@ export class SearchEngine { console.log(` - [data-ved]: ${vedElements.length}`); console.log(` - h3: ${h3Elements.length}`); console.log(` - a[href]: ${linkElements.length}`); - + // Try multiple approaches to find search results const searchResultSelectors = [ 'div.g', @@ -564,34 +611,34 @@ export class SearchEngine { '.tF2Cxc', '.rc', '[data-ved]', - 'div[jscontroller]' + 'div[jscontroller]', ]; - + let foundResults = false; - + for (const selector of searchResultSelectors) { if (foundResults) break; - + console.log(`[SearchEngine] Trying selector: ${selector}`); const elements = $(selector); console.log(`[SearchEngine] Found ${elements.length} elements with selector ${selector}`); - + elements.each((_index, element) => { if (results.length >= maxResults) return false; - + const $element = $(element); - + // Try multiple title selectors const titleSelectors = ['h3', '.LC20lb', '.DKV0Md', 'a[data-ved]', '.r', '.s']; let title = ''; let url = ''; - + for (const titleSelector of titleSelectors) { const $title = $element.find(titleSelector).first(); if ($title.length) { title = $title.text().trim(); console.log(`[SearchEngine] Found title with ${titleSelector}: "${title}"`); - + // Try to find the link const $link = $title.closest('a'); if ($link.length) { @@ -608,11 +655,20 @@ export class SearchEngine { break; } } - + // Try multiple snippet selectors - const snippetSelectors = ['.VwiC3b', '.st', '.aCOpRe', '.IsZvec', '.s3v9rd', '.MUxGbd', '.aCOpRe', '.snippet-content']; + const snippetSelectors = [ + '.VwiC3b', + '.st', + '.aCOpRe', + '.IsZvec', + '.s3v9rd', + '.MUxGbd', + '.aCOpRe', + '.snippet-content', + ]; let snippet = ''; - + for (const snippetSelector of snippetSelectors) { const $snippet = $element.find(snippetSelector).first(); if ($snippet.length) { @@ -621,7 +677,7 
@@ export class SearchEngine { break; } } - + if (title && url && this.isValidSearchUrl(url)) { console.log(`[SearchEngine] Adding result: ${title}`); results.push({ @@ -636,7 +692,9 @@ export class SearchEngine { }); foundResults = true; } else { - console.log(`[SearchEngine] Skipping result: title="${title}", url="${url}", isValid=${this.isValidSearchUrl(url)}`); + console.log( + `[SearchEngine] Skipping result: title="${title}", url="${url}", isValid=${this.isValidSearchUrl(url)}` + ); } }); } @@ -648,15 +706,15 @@ export class SearchEngine { console.log(`[SearchEngine] No results found, trying aggressive h3 search...`); $('h3').each((_index, element) => { if (results.length >= maxResults) return false; - + const $h3 = $(element); const title = $h3.text().trim(); const $link = $h3.closest('a'); - + if ($link.length && title) { const url = $link.attr('href') || ''; console.log(`[SearchEngine] Aggressive search found: "${title}" -> "${url}"`); - + if (this.isValidSearchUrl(url)) { results.push({ title, @@ -671,7 +729,7 @@ export class SearchEngine { } } }); - + console.log(`[SearchEngine] Aggressive search found ${results.length} results`); } @@ -680,44 +738,44 @@ export class SearchEngine { private parseBraveResults(html: string, maxResults: number): SearchResult[] { console.log(`[SearchEngine] Parsing Brave HTML with length: ${html.length}`); - + const $ = cheerio.load(html); const results: SearchResult[] = []; const timestamp = generateTimestamp(); // Brave result selectors const resultSelectors = [ - '[data-type="web"]', // Main Brave results - '.result', // Alternative format - '.fdb' // Brave specific format + '[data-type="web"]', // Main Brave results + '.result', // Alternative format + '.fdb', // Brave specific format ]; - + let foundResults = false; - + for (const selector of resultSelectors) { if (foundResults && results.length >= maxResults) break; - + console.log(`[SearchEngine] Trying Brave selector: ${selector}`); const elements = $(selector); console.log(`[SearchEngine] Found ${elements.length} elements with selector ${selector}`); - + elements.each((_index, element) => { if (results.length >= maxResults) return false; const $element = $(element); - + // Try multiple title selectors for Brave const titleSelectors = [ - '.title a', // Brave specific - 'h2 a', // Common format - '.result-title a', // Alternative format - 'a[href*="://"]', // Any external link - '.snippet-title a' // Snippet title + '.title a', // Brave specific + 'h2 a', // Common format + '.result-title a', // Alternative format + 'a[href*="://"]', // Any external link + '.snippet-title a', // Snippet title ]; - + let title = ''; let url = ''; - + for (const titleSelector of titleSelectors) { const $titleElement = $element.find(titleSelector).first(); if ($titleElement.length) { @@ -729,7 +787,7 @@ export class SearchEngine { } } } - + // If still no title, try getting it from any text content if (!title) { const textContent = $element.text().trim(); @@ -739,15 +797,15 @@ export class SearchEngine { console.log(`[SearchEngine] Brave found title from text content: "${title}"`); } } - + // Try multiple snippet selectors for Brave const snippetSelectors = [ - '.snippet-content', // Brave specific - '.snippet', // Generic - '.description', // Alternative - 'p' // Fallback paragraph + '.snippet-content', // Brave specific + '.snippet', // Generic + '.description', // Alternative + 'p', // Fallback paragraph ]; - + let snippet = ''; for (const snippetSelector of snippetSelectors) { const $snippetElement = 
$element.find(snippetSelector).first(); @@ -756,7 +814,7 @@ export class SearchEngine { break; } } - + if (title && url && this.isValidSearchUrl(url)) { console.log(`[SearchEngine] Brave found: "${title}" -> "${url}"`); results.push({ @@ -781,7 +839,7 @@ export class SearchEngine { private parseBingResults(html: string, maxResults: number): SearchResult[] { const debugBing = process.env.DEBUG_BING_SEARCH === 'true'; console.error(`[SearchEngine] BING: Parsing HTML with length: ${html.length}`); - + const $ = cheerio.load(html); const results: SearchResult[] = []; const timestamp = generateTimestamp(); @@ -789,49 +847,49 @@ export class SearchEngine { // Check for common Bing error indicators const pageTitle = $('title').text(); console.error(`[SearchEngine] BING: Page title: "${pageTitle}"`); - + if (pageTitle.includes('Access Denied') || pageTitle.includes('blocked') || pageTitle.includes('captcha')) { console.error(`[SearchEngine] BING: ERROR - Bot detection or access denied detected in page title`); } // Bing result selectors const resultSelectors = [ - '.b_algo', // Main Bing results - '.b_result', // Alternative Bing format - '.b_card' // Card format + '.b_algo', // Main Bing results + '.b_result', // Alternative Bing format + '.b_card', // Card format ]; - + console.error(`[SearchEngine] BING: Checking for result elements...`); - + // Log counts for all selectors first for (const selector of resultSelectors) { const elements = $(selector); console.error(`[SearchEngine] BING: Found ${elements.length} elements with selector "${selector}"`); } - + let foundResults = false; - + for (const selector of resultSelectors) { if (foundResults && results.length >= maxResults) break; - + const elements = $(selector); if (elements.length === 0) continue; - + elements.each((_index, element) => { if (results.length >= maxResults) return false; const $element = $(element); - + // Try multiple title selectors for Bing const titleSelectors = [ - 'h2 a', // Standard Bing format - '.b_title a', // Alternative format - 'a[data-seid]' // Bing specific + 'h2 a', // Standard Bing format + '.b_title a', // Alternative format + 'a[data-seid]', // Bing specific ]; - + let title = ''; let url = ''; - + for (const titleSelector of titleSelectors) { const $titleElement = $element.find(titleSelector).first(); if ($titleElement.length) { @@ -841,21 +899,21 @@ export class SearchEngine { break; } } - + // Try multiple snippet selectors for Bing const snippetSelectors = [ - '.b_caption p', // Standard Bing snippet - '.b_snippet', // Alternative format - '.b_descript', // Description format - '.b_caption', // Caption without p tag - '.b_caption > span', // Caption span - '.b_excerpt', // Excerpt format - 'p', // Any paragraph in the result - '.b_algo_content p', // Content paragraph - '.b_algo_content', // Full content area - '.b_context' // Context information + '.b_caption p', // Standard Bing snippet + '.b_snippet', // Alternative format + '.b_descript', // Description format + '.b_caption', // Caption without p tag + '.b_caption > span', // Caption span + '.b_excerpt', // Excerpt format + 'p', // Any paragraph in the result + '.b_algo_content p', // Content paragraph + '.b_algo_content', // Full content area + '.b_context', // Context information ]; - + let snippet = ''; for (const snippetSelector of snippetSelectors) { const $snippetElement = $element.find(snippetSelector).first(); @@ -864,12 +922,14 @@ export class SearchEngine { // Skip very short snippets or those that look like metadata if 
(candidateSnippet.length > 20 && !candidateSnippet.match(/^\d+\s*(min|sec|hour|day|week|month|year)/i)) { snippet = candidateSnippet; - console.log(`[SearchEngine] Bing found snippet with ${snippetSelector}: "${snippet.substring(0, 100)}..."`); + console.log( + `[SearchEngine] Bing found snippet with ${snippetSelector}: "${snippet.substring(0, 100)}..."` + ); break; } } } - + if (title && url && this.isValidSearchUrl(url)) { console.log(`[SearchEngine] Bing found: "${title}" -> "${url}"`); results.push({ @@ -893,7 +953,7 @@ export class SearchEngine { private parseDuckDuckGoResults(html: string, maxResults: number): SearchResult[] { console.log(`[SearchEngine] Parsing DuckDuckGo HTML with length: ${html.length}`); - + const $ = cheerio.load(html); const results: SearchResult[] = []; const timestamp = generateTimestamp(); @@ -903,15 +963,15 @@ export class SearchEngine { if (results.length >= maxResults) return false; const $element = $(element); - + // Extract title and URL const $titleElement = $element.find('.result__title a'); const title = $titleElement.text().trim(); const url = $titleElement.attr('href'); - + // Extract snippet const snippet = $element.find('.result__snippet').text().trim(); - + if (title && url) { console.log(`[SearchEngine] DuckDuckGo found: "${title}" -> "${url}"`); results.push({ @@ -933,14 +993,16 @@ export class SearchEngine { private isValidSearchUrl(url: string): boolean { // Google search results URLs can be in various formats - return url.startsWith('/url?') || - url.startsWith('http://') || - url.startsWith('https://') || - url.startsWith('//') || - url.startsWith('/search?') || - url.startsWith('/') || - url.includes('google.com') || - url.length > 10; // Accept any reasonably long URL + return ( + url.startsWith('/url?') || + url.startsWith('http://') || + url.startsWith('https://') || + url.startsWith('//') || + url.startsWith('/search?') || + url.startsWith('/') || + url.includes('google.com') || + url.length > 10 + ); // Accept any reasonably long URL } private cleanGoogleUrl(url: string): string { @@ -970,12 +1032,12 @@ export class SearchEngine { if (url.startsWith('//')) { return 'https:' + url; } - + // If it's already a full URL, return as-is if (url.startsWith('http://') || url.startsWith('https://')) { return url; } - + return url; } @@ -984,12 +1046,12 @@ export class SearchEngine { if (url.startsWith('//')) { return 'https:' + url; } - + // If it's already a full URL, return as-is if (url.startsWith('http://') || url.startsWith('https://')) { return url; } - + return url; } @@ -1010,12 +1072,12 @@ export class SearchEngine { console.log(`[SearchEngine] Failed to decode DuckDuckGo URL: ${url}`); } } - + // If it's a protocol-relative URL, add https: if (url.startsWith('//')) { return 'https:' + url; } - + return url; } @@ -1023,8 +1085,46 @@ export class SearchEngine { if (results.length === 0) return 0; // Extract keywords from the original query (ignore common words) - const commonWords = new Set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'group', 'members']); - const queryWords = originalQuery.toLowerCase() + const commonWords = new Set([ + 'the', + 'a', + 'an', + 'and', + 'or', + 'but', + 'in', + 'on', + 'at', + 'to', + 'for', + 'of', + 'with', + 'by', + 'is', + 'are', + 'was', + 'were', + 'be', + 'been', + 'have', + 'has', + 'had', + 'do', + 
'does', + 'did', + 'will', + 'would', + 'could', + 'should', + 'may', + 'might', + 'must', + 'can', + 'group', + 'members', + ]); + const queryWords = originalQuery + .toLowerCase() .replace(/[^\w\s]/g, ' ') .split(/\s+/) .filter(word => word.length > 2 && !commonWords.has(word)); @@ -1077,15 +1177,41 @@ export class SearchEngine { // Penalty for obvious irrelevant content const irrelevantPatterns = [ - /recipe/i, /cooking/i, /food/i, /restaurant/i, /menu/i, - /weather/i, /temperature/i, /forecast/i, - /shopping/i, /sale/i, /price/i, /buy/i, /store/i, - /movie/i, /film/i, /tv show/i, /entertainment/i, - /sports/i, /game/i, /score/i, /team/i, - /fashion/i, /clothing/i, /style/i, - /travel/i, /hotel/i, /flight/i, /vacation/i, - /car/i, /vehicle/i, /automotive/i, - /real estate/i, /property/i, /house/i, /apartment/i + /recipe/i, + /cooking/i, + /food/i, + /restaurant/i, + /menu/i, + /weather/i, + /temperature/i, + /forecast/i, + /shopping/i, + /sale/i, + /price/i, + /buy/i, + /store/i, + /movie/i, + /film/i, + /tv show/i, + /entertainment/i, + /sports/i, + /game/i, + /score/i, + /team/i, + /fashion/i, + /clothing/i, + /style/i, + /travel/i, + /hotel/i, + /flight/i, + /vacation/i, + /car/i, + /vehicle/i, + /automotive/i, + /real estate/i, + /property/i, + /house/i, + /apartment/i, ]; let penalty = 0; @@ -1096,9 +1222,11 @@ export class SearchEngine { } const finalScore = Math.max(0, resultScore - penalty); - - console.log(`[SearchEngine] Result "${result.title.substring(0, 50)}..." - Score: ${finalScore.toFixed(2)} (keywords: ${keywordMatches}/${queryWords.length}, phrases: ${phraseMatches}, penalty: ${penalty.toFixed(2)})`); - + + console.log( + `[SearchEngine] Result "${result.title.substring(0, 50)}..." - Score: ${finalScore.toFixed(2)} (keywords: ${keywordMatches}/${queryWords.length}, phrases: ${phraseMatches}, penalty: ${penalty.toFixed(2)})` + ); + totalScore += finalScore; scoredResults++; } @@ -1109,24 +1237,26 @@ export class SearchEngine { private async validateBrowserHealth(browser: any): Promise { const debugBrowsers = process.env.DEBUG_BROWSER_LIFECYCLE === 'true'; - + try { if (debugBrowsers) console.log(`[SearchEngine] Validating browser health...`); - + // Check if browser is still connected if (!browser.isConnected()) { if (debugBrowsers) console.log(`[SearchEngine] Browser is not connected`); return false; } - + // Try to create a simple context to test browser responsiveness const testContext = await browser.newContext(); await testContext.close(); - + if (debugBrowsers) console.log(`[SearchEngine] Browser health check passed`); return true; } catch (error) { - console.log(`[SearchEngine] Browser health check failed: ${error instanceof Error ? error.message : 'Unknown error'}`); + console.log( + `[SearchEngine] Browser health check failed: ${error instanceof Error ? error.message : 'Unknown error'}` + ); return false; } } @@ -1134,20 +1264,23 @@ export class SearchEngine { private async handleBrowserError(error: any, engineName: string, attemptNumber: number = 1): Promise { const errorMessage = error instanceof Error ? 
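Since the stop-word list above is long, a quick worked example of the keyword extraction used by the quality scorer may help (abridged stop-word set; the sample query is invented for illustration):

```typescript
// Same lowercasing / punctuation-stripping / filtering as assessResultQuality above.
const commonWords = new Set(['the', 'a', 'an', 'and', 'or', 'for', 'of', 'with', 'is', 'are']); // abridged
const originalQuery = 'What are the effects of climate change?';

const queryWords = originalQuery
  .toLowerCase()
  .replace(/[^\w\s]/g, ' ')
  .split(/\s+/)
  .filter(word => word.length > 2 && !commonWords.has(word));

console.log(queryWords); // [ 'what', 'effects', 'climate', 'change' ]
```

These keywords are then counted against the result text to produce the per-result score logged above.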
error.message : 'Unknown error'; console.error(`[SearchEngine] ${engineName} browser error (attempt ${attemptNumber}): ${errorMessage}`); - + // Check for specific browser-related errors - if (errorMessage.includes('Target page, context or browser has been closed') || - errorMessage.includes('Browser has been closed') || - errorMessage.includes('Session has been closed')) { - + if ( + errorMessage.includes('Target page, context or browser has been closed') || + errorMessage.includes('Browser has been closed') || + errorMessage.includes('Session has been closed') + ) { console.log(`[SearchEngine] Detected browser session closure, attempting to refresh browser pool`); - + // Try to refresh the browser pool for subsequent attempts try { await this.browserPool.closeAll(); console.log(`[SearchEngine] Browser pool refreshed for ${engineName}`); } catch (refreshError) { - console.error(`[SearchEngine] Failed to refresh browser pool: ${refreshError instanceof Error ? refreshError.message : 'Unknown error'}`); + console.error( + `[SearchEngine] Failed to refresh browser pool: ${refreshError instanceof Error ? refreshError.message : 'Unknown error'}` + ); } } } diff --git a/src/utils.ts b/src/utils.ts index 3f457bb..909ed69 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -11,7 +11,10 @@ export function cleanText(text: string, maxLength: number = 10000): string { } export function getWordCount(text: string): number { - return text.trim().split(/\s+/).filter(word => word.length > 0).length; + return text + .trim() + .split(/\s+/) + .filter(word => word.length > 0).length; } export function getContentPreview(text: string, maxLength: number = 500): string { @@ -58,4 +61,4 @@ export function isPdfUrl(url: string): boolean { // If URL parsing fails, check the raw string as fallback return url.toLowerCase().endsWith('.pdf'); } -} \ No newline at end of file +} diff --git a/tests/browser-installed.js b/tests/browser-installed.js new file mode 100644 index 0000000..a6006fa --- /dev/null +++ b/tests/browser-installed.js @@ -0,0 +1,11 @@ +// Utility to check if a Playwright browser channel is installed +import { chromium } from 'playwright'; + +export async function isBrowserInstalled(channel) { + try { + await chromium.executablePath({ channel }); + return true; + } catch { + return false; + } +} diff --git a/tests/run-all-tests.js b/tests/run-all-tests.js new file mode 100644 index 0000000..78983df --- /dev/null +++ b/tests/run-all-tests.js @@ -0,0 +1,30 @@ +// Utility to run all test modules in sequence +import { execSync } from 'child_process'; + +const testFiles = [ + 'tests/test-all-engines.js', + 'tests/test-bing.js', + 'tests/test-brave.js', + 'tests/test-duckduckgo.js', + 'tests/test-search.js', +]; + +let allPassed = true; + +for (const file of testFiles) { + console.log(`\n[run-all-tests] Running: ${file}`); + try { + execSync(`node ${file}`, { stdio: 'inherit' }); + } catch (err) { + allPassed = false; + console.error(`[run-all-tests] Test failed: ${file}`); + } +} + +if (allPassed) { + console.log('\nAll tests passed โœ…'); + process.exit(0); +} else { + console.log('\nSome tests failed โŒ'); + process.exit(1); +} diff --git a/tests/test-all-engines.js b/tests/test-all-engines.js index c53bb46..640ba07 100644 --- a/tests/test-all-engines.js +++ b/tests/test-all-engines.js @@ -6,22 +6,23 @@ */ import { SearchEngine } from '../dist/search-engine.js'; +import { runMultiBrowserTest } from './test-utilities.js'; -async function testSearchEngine(query = 'javascript programming', numResults = 3) { - 
console.log('๐Ÿ” Testing Web Search MCP Server - All Engines'); +async function testSearchEngine(channel, query = 'javascript programming', numResults = 3) { + console.log(`๐Ÿ” Testing Web Search MCP Server - All Engines (${channel || 'chromium'})`); console.log('==============================================='); console.log(`Query: "${query}"`); console.log(`Expected results: ${numResults}`); console.log(''); - const searchEngine = new SearchEngine(); + const searchEngine = new SearchEngine({ channel }); try { const startTime = Date.now(); const result = await searchEngine.search({ query, numResults, - timeout: 15000 // 15 second timeout + timeout: 15000, // 15 second timeout }); const endTime = Date.now(); @@ -37,7 +38,7 @@ async function testSearchEngine(query = 'javascript programming', numResults = 3 console.log('๐Ÿ“‹ Results:'); console.log('==========='); - + result.results.forEach((item, index) => { console.log(`${index + 1}. ${item.title}`); console.log(` ๐Ÿ”— ${item.url}`); @@ -46,24 +47,24 @@ async function testSearchEngine(query = 'javascript programming', numResults = 3 }); // Validate results - const validResults = result.results.filter(r => - r.title && - r.title !== 'No title' && - r.url && - r.url.startsWith('http') && - r.description && - r.description !== 'No description available' + const validResults = result.results.filter( + r => + r.title && + r.title !== 'No title' && + r.url && + r.url.startsWith('http') && + r.description && + r.description !== 'No description available' ); console.log(`โœ… Valid results: ${validResults.length}/${result.results.length}`); - + if (validResults.length === 0) { console.log('โŒ No valid results found!'); return false; } return true; - } catch (error) { console.error('โŒ Search failed:', error.message); return false; @@ -72,15 +73,11 @@ async function testSearchEngine(query = 'javascript programming', numResults = 3 } } -async function runTests() { - console.log('๐Ÿงช Running comprehensive search engine tests...'); +async function runTests(channel) { + console.log(`๐Ÿงช Running comprehensive search engine tests (${channel || 'chromium'})...`); console.log('================================================'); - const testQueries = [ - 'javascript programming', - 'climate change effects', - 'machine learning basics' - ]; + const testQueries = ['javascript programming', 'climate change effects', 'machine learning basics']; let passedTests = 0; const totalTests = testQueries.length; @@ -89,15 +86,15 @@ async function runTests() { const query = testQueries[i]; console.log(`\n๐Ÿ” Test ${i + 1}/${totalTests}: "${query}"`); console.log('โ”€'.repeat(50)); - - const success = await testSearchEngine(query, 5); + + const success = await testSearchEngine(channel, query, 5); if (success) { passedTests++; console.log('โœ… Test PASSED'); } else { console.log('โŒ Test FAILED'); } - + if (i < testQueries.length - 1) { console.log('\nโณ Waiting 2 seconds before next test...'); await new Promise(resolve => setTimeout(resolve, 2000)); @@ -108,19 +105,17 @@ async function runTests() { console.log('==============='); console.log(`Tests passed: ${passedTests}/${totalTests}`); console.log(`Success rate: ${Math.round((passedTests / totalTests) * 100)}%`); - + if (passedTests === totalTests) { console.log('๐ŸŽ‰ All tests passed!'); - process.exit(0); + return true; } else { console.log('โš ๏ธ Some tests failed'); - process.exit(1); + return false; } } -// Run tests if this script is executed directly -if (import.meta.url === `file://${process.argv[1]}`) 
{ - runTests().catch(console.error); -} +// Run tests across all browsers +runMultiBrowserTest(runTests, 'ALL ENGINES'); -export { testSearchEngine, runTests }; \ No newline at end of file +export { testSearchEngine, runTests }; diff --git a/tests/test-bing.js b/tests/test-bing.js index 8f20f56..1153e4b 100644 --- a/tests/test-bing.js +++ b/tests/test-bing.js @@ -2,76 +2,81 @@ // Test Bing search independently import { chromium } from 'playwright'; +import { runMultiBrowserTest } from './test-utilities.js'; -async function testBing() { - console.log('=== TESTING BING SEARCH ==='); - - const browser = await chromium.launch({ headless: true }); +async function testBingWithChannel(channel) { + const label = channel || 'chromium'; + console.log(`=== TESTING BING SEARCH (${label}) ===`); + + const launchOptions = { headless: true }; + if (channel) launchOptions.channel = channel; + + const browser = await chromium.launch(launchOptions); const context = await browser.newContext({ - userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', + userAgent: + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', viewport: { width: 1366, height: 768 }, }); - + const page = await context.newPage(); - + try { const query = 'javascript tutorial'; const searchUrl = `https://www.bing.com/search?q=${encodeURIComponent(query)}&count=5`; console.log(`Navigating to: ${searchUrl}`); - + const startTime = Date.now(); - await page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 10000 }); + await page.goto(searchUrl, { + waitUntil: 'domcontentloaded', + timeout: 10000, + }); const loadTime = Date.now() - startTime; - + const html = await page.content(); console.log(`โœ“ Page loaded successfully in ${loadTime}ms`); console.log(`โœ“ HTML length: ${html.length} characters`); - + // Check for bot detection const title = await page.title(); console.log(`โœ“ Page title: ${title}`); - + if (title.includes('Access Denied') || title.includes('Captcha') || html.includes('unusual traffic')) { console.log('โŒ Bot detection detected'); return false; } - + // Parse results const resultElements = await page.$$('.b_algo'); console.log(`โœ“ Found ${resultElements.length} .b_algo elements`); - + if (resultElements.length > 0) { console.log('\n--- SAMPLE RESULTS ---'); for (let i = 0; i < Math.min(3, resultElements.length); i++) { const titleElement = await resultElements[i].$('h2 a'); const snippetElement = await resultElements[i].$('.b_caption p'); - + const title = titleElement ? await titleElement.textContent() : 'No title'; const url = titleElement ? await titleElement.getAttribute('href') : 'No URL'; const snippet = snippetElement ? await snippetElement.textContent() : 'No snippet'; - + console.log(`${i + 1}. ${title?.trim()}`); console.log(` URL: ${url}`); console.log(` Snippet: ${snippet?.trim().substring(0, 100)}...`); console.log(''); } - - console.log('โœ… BING SEARCH: SUCCESS'); + + console.log(`โœ… BING SEARCH (${label}): SUCCESS`); return true; } else { console.log('โŒ No results found'); return false; } - } catch (error) { - console.log(`โŒ BING SEARCH FAILED: ${error.message}`); + console.log(`โŒ BING SEARCH FAILED (${label}): ${error.message}`); return false; } finally { await browser.close(); } } -testBing().then(success => { - console.log(`\nBING RESULT: ${success ? 'WORKING โœ…' : 'FAILED โŒ'}`); - process.exit(success ? 
-});
\ No newline at end of file
+runMultiBrowserTest(testBingWithChannel, 'BING');
diff --git a/tests/test-brave.js b/tests/test-brave.js
index 376a36b..e79d97b 100644
--- a/tests/test-brave.js
+++ b/tests/test-brave.js
@@ -2,57 +2,68 @@
 // Test Brave search independently
 import { chromium } from 'playwright';
+import { runMultiBrowserTest } from './test-utilities.js';

-async function testBrave() {
-  console.log('=== TESTING BRAVE SEARCH ===');
-
-  const browser = await chromium.launch({ headless: true });
+async function testBrave(channel) {
+  console.log(`=== TESTING BRAVE SEARCH (${channel || 'chromium'}) ===`);
+  const launchOptions = { headless: true };
+  if (channel) launchOptions.channel = channel;
+
+  const browser = await chromium.launch(launchOptions);
   const context = await browser.newContext({
-    userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
+    userAgent:
+      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
     viewport: { width: 1366, height: 768 },
   });
-
+
   const page = await context.newPage();
-
+
   try {
     const query = 'javascript tutorial';
     const searchUrl = `https://search.brave.com/search?q=${encodeURIComponent(query)}&source=web`;
     console.log(`Navigating to: ${searchUrl}`);
-
+
     const startTime = Date.now();
-    await page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 10000 });
+    await page.goto(searchUrl, {
+      waitUntil: 'domcontentloaded',
+      timeout: 10000,
+    });
     const loadTime = Date.now() - startTime;
-
+
     await page.waitForTimeout(2000); // Wait for any dynamic content
-
+
     const html = await page.content();
     console.log(`✓ Page loaded successfully in ${loadTime}ms`);
     console.log(`✓ HTML length: ${html.length} characters`);
-
+
     // Check for bot detection
     const title = await page.title();
     console.log(`✓ Page title: ${title}`);
-
-    if (title.includes('Access Denied') || title.includes('Captcha') ||
-        html.includes('unusual traffic') || html.includes('blocked') ||
-        html.length < 1000) {
+
+    if (
+      title.includes('Access Denied') ||
+      title.includes('Captcha') ||
+      html.includes('unusual traffic') ||
+      html.includes('blocked') ||
+      html.length < 1000
+    ) {
       console.log('❌ Bot detection detected');
       console.log('Sample HTML:', html.substring(0, 500));
       return false;
     }
-
+
     // Try multiple selectors for Brave results
     const resultSelectors = [
-      '[data-type="web"]', // Brave specific
-      '.result', // Generic
-      '.fdb', // Brave format
-      '.snippet', // Alternative
-      'div[data-pos]' // Position-based
+      '[data-type="web"]', // Brave specific
+      '.result', // Generic
+      '.fdb', // Brave format
+      '.snippet', // Alternative
+      'div[data-pos]', // Position-based
     ];
-
+
     let resultElements = [];
     let workingSelector = '';
-
+
     for (const selector of resultSelectors) {
       resultElements = await page.$$(selector);
       console.log(`✓ Found ${resultElements.length} elements with selector: ${selector}`);
@@ -61,53 +72,53 @@ async function testBrave() {
         break;
       }
     }
-
+
     if (resultElements.length > 0) {
       console.log('\n--- SAMPLE RESULTS ---');
       for (let i = 0; i < Math.min(3, resultElements.length); i++) {
         // Try multiple title selectors for Brave
         const titleSelectors = [
-          'h2 a', // Common format
-          '.title a', // Brave specific
-          '.result-title a', // Alternative
-          'a[data-testid]', // Test ID format
-          'h3 a' // Fallback
+          'h2 a', // Common format
+          '.title a', // Brave specific
+          '.result-title a', // Alternative
+          'a[data-testid]', // Test ID format
+          'h3 a', // Fallback
         ];
-
+
         const snippetSelectors = [
-          '.snippet-content', // Brave specific
-          '.snippet', // Generic
-          '.description', // Alternative
-          'p' // Fallback
+          '.snippet-content', // Brave specific
+          '.snippet', // Generic
+          '.description', // Alternative
+          'p', // Fallback
         ];
-
+
         let title = 'No title';
         let url = 'No URL';
         let snippet = 'No snippet';
-
+
         for (const titleSel of titleSelectors) {
           const titleElement = await resultElements[i].$(titleSel);
           if (titleElement) {
-            title = await titleElement.textContent() || 'No title';
-            url = await titleElement.getAttribute('href') || 'No URL';
+            title = (await titleElement.textContent()) || 'No title';
+            url = (await titleElement.getAttribute('href')) || 'No URL';
             break;
           }
         }
-
+
         for (const snippetSel of snippetSelectors) {
           const snippetElement = await resultElements[i].$(snippetSel);
           if (snippetElement) {
-            snippet = await snippetElement.textContent() || 'No snippet';
+            snippet = (await snippetElement.textContent()) || 'No snippet';
             break;
           }
         }
-
+
         console.log(`${i + 1}. ${title.trim()}`);
         console.log(`   URL: ${url}`);
         console.log(`   Snippet: ${snippet.trim().substring(0, 100)}...`);
         console.log('');
       }
-
+
       console.log('✅ BRAVE SEARCH: SUCCESS');
       return true;
     } else {
@@ -115,7 +126,6 @@ async function testBrave() {
       console.log('Sample HTML:', html.substring(0, 1000));
       return false;
     }
-
   } catch (error) {
     console.log(`❌ BRAVE SEARCH FAILED: ${error.message}`);
     return false;
@@ -124,7 +134,4 @@ async function testBrave() {
   }
 }

-testBrave().then(success => {
-  console.log(`\nBRAVE RESULT: ${success ? 'WORKING ✅' : 'FAILED ❌'}`);
-  process.exit(success ? 0 : 1);
-});
\ No newline at end of file
+runMultiBrowserTest(testBrave, 'BRAVE');
diff --git a/tests/test-duckduckgo.js b/tests/test-duckduckgo.js
index ce23aac..f526c6b 100644
--- a/tests/test-duckduckgo.js
+++ b/tests/test-duckduckgo.js
@@ -2,68 +2,74 @@
 // Test DuckDuckGo search independently
 import { chromium } from 'playwright';
+import { runMultiBrowserTest } from './test-utilities.js';

-async function testDuckDuckGo() {
-  console.log('=== TESTING DUCKDUCKGO SEARCH ===');
-
-  const browser = await chromium.launch({ headless: true });
+async function testDuckDuckGo(channel) {
+  console.log(`=== TESTING DUCKDUCKGO SEARCH (${channel || 'chromium'}) ===`);
+  const launchOptions = { headless: true };
+  if (channel) launchOptions.channel = channel;
+
+  const browser = await chromium.launch(launchOptions);
   const context = await browser.newContext({
-    userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
+    userAgent:
+      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
     viewport: { width: 1366, height: 768 },
   });
-
+
   const page = await context.newPage();
-
+
   try {
     const query = 'javascript tutorial';
-
+
     // Test both URLs
     const urls = [
       `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&ia=web`,
-      `https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`
+      `https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`,
    ];
-
+
    for (let i = 0; i < urls.length; i++) {
      const searchUrl = urls[i];
      const urlType = i === 0 ? 'Main DDG' : 'HTML DDG';
-
+
      console.log(`\n--- Testing ${urlType} ---`);
      console.log(`Navigating to: ${searchUrl}`);
-
+
      try {
        const startTime = Date.now();
-        await page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 10000 });
+        await page.goto(searchUrl, {
+          waitUntil: 'domcontentloaded',
+          timeout: 10000,
+        });
        const loadTime = Date.now() - startTime;
-
+
        await page.waitForTimeout(2000); // Wait for any dynamic content
-
+
        const html = await page.content();
        console.log(`✓ Page loaded successfully in ${loadTime}ms`);
        console.log(`✓ HTML length: ${html.length} characters`);
-
+
        // Check for bot detection or error messages
        const title = await page.title();
        console.log(`✓ Page title: ${title}`);
-
-        if (html.includes('error-lite') || html.includes('email us') ||
-            html.length < 1000 || title === '') {
+
+        if (html.includes('error-lite') || html.includes('email us') || html.length < 1000 || title === '') {
          console.log('❌ Error page or bot detection detected');
          console.log('Sample HTML:', html.substring(0, 500));
          continue;
        }
-
+
        // Try multiple selectors for results
        const resultSelectors = [
          '[data-result="result"]',
          '.result',
          '.web-result',
          'article[data-testid="result"]',
-          '.results_links'
+          '.results_links',
        ];
-
+
        let resultElements = [];
        let workingSelector = '';
-
+
        for (const selector of resultSelectors) {
          resultElements = await page.$$(selector);
          console.log(`✓ Found ${resultElements.length} elements with selector: ${selector}`);
@@ -72,55 +78,53 @@ async function testDuckDuckGo() {
            break;
          }
        }
-
+
        if (resultElements.length > 0) {
          console.log(`\n--- SAMPLE RESULTS (${urlType}) ---`);
          for (let j = 0; j < Math.min(3, resultElements.length); j++) {
            // Try multiple title selectors
            const titleSelectors = ['h2 a', '.result__title a', 'a[data-testid="result-title-a"]', 'h3 a'];
            const snippetSelectors = ['[data-result="snippet"]', '.result__snippet', '.result-snippet'];
-
+
            let title = 'No title';
            let url = 'No URL';
            let snippet = 'No snippet';
-
+
            for (const titleSel of titleSelectors) {
              const titleElement = await resultElements[j].$(titleSel);
              if (titleElement) {
-                title = await titleElement.textContent() || 'No title';
-                url = await titleElement.getAttribute('href') || 'No URL';
+                title = (await titleElement.textContent()) || 'No title';
+                url = (await titleElement.getAttribute('href')) || 'No URL';
                break;
              }
            }
-
+
            for (const snippetSel of snippetSelectors) {
              const snippetElement = await resultElements[j].$(snippetSel);
              if (snippetElement) {
-                snippet = await snippetElement.textContent() || 'No snippet';
+                snippet = (await snippetElement.textContent()) || 'No snippet';
                break;
              }
            }
-
+
            console.log(`${j + 1}. ${title.trim()}`);
            console.log(`   URL: ${url}`);
            console.log(`   Snippet: ${snippet.trim().substring(0, 100)}...`);
            console.log('');
          }
-
+
          console.log(`✅ DUCKDUCKGO SEARCH (${urlType}): SUCCESS`);
          return true;
        } else {
          console.log(`❌ No results found with ${urlType}`);
        }
-
      } catch (error) {
        console.log(`❌ ${urlType} failed: ${error.message}`);
      }
    }
-
+
    console.log('❌ All DuckDuckGo variants failed');
    return false;
-
  } catch (error) {
    console.log(`❌ DUCKDUCKGO SEARCH FAILED: ${error.message}`);
    return false;
@@ -129,7 +133,4 @@ async function testDuckDuckGo() {
  }
 }

-testDuckDuckGo().then(success => {
-  console.log(`\nDUCKDUCKGO RESULT: ${success ? 'WORKING ✅' : 'FAILED ❌'}`);
-  process.exit(success ? 0 : 1);
-});
\ No newline at end of file
+runMultiBrowserTest(testDuckDuckGo, 'DUCKDUCKGO');
diff --git a/tests/test-search.js b/tests/test-search.js
index 8f7f037..b5bf339 100644
--- a/tests/test-search.js
+++ b/tests/test-search.js
@@ -2,40 +2,40 @@
 // Simple test script to verify search functionality
 import { SearchEngine } from '../dist/search-engine.js';
+import { runMultiBrowserTest } from './test-utilities.js';

 const searchEngine = new SearchEngine();

-async function testSearch() {
-  console.log('Testing search functionality...');
-
+async function testSearch(channel) {
+  console.log(`Testing search functionality (${channel || 'chromium'})...`);
+  const searchEngine = new SearchEngine({ channel });
+
   try {
     const result = await searchEngine.search({
       query: 'test search',
       numResults: 3,
-      timeout: 15000 // 15 second timeout for testing
+      timeout: 15000, // 15 second timeout for testing
     });
-
+
     console.log(`Search completed with engine: ${result.engine}`);
     console.log(`Found ${result.results.length} results:`);
-
+
     result.results.forEach((r, i) => {
       console.log(`${i + 1}. ${r.title}`);
       console.log(`   URL: ${r.url}`);
       console.log(`   Description: ${r.description.substring(0, 100)}...`);
       console.log('');
     });
-
+
     // Clean up
     await searchEngine.closeAll();
-
+
+    return result.results.length > 0;
   } catch (error) {
     console.error('Search test failed:', error);
     await searchEngine.closeAll();
-    process.exit(1);
+    return false;
   }
 }

-testSearch().then(() => {
-  console.log('Test completed successfully');
-  process.exit(0);
-});
\ No newline at end of file
+runMultiBrowserTest(testSearch, 'SEARCH');
diff --git a/tests/test-utilities.js b/tests/test-utilities.js
new file mode 100644
index 0000000..53e5fea
--- /dev/null
+++ b/tests/test-utilities.js
@@ -0,0 +1,58 @@
+/**
+ * Utility to run a test function across Chromium, Chrome, and Edge channels.
+ * Also includes isBrowserInstalled utility (inlined from browser-installed.js).
+ */
+import { chromium } from 'playwright';
+
+// Utility to check if a Playwright browser channel is installed
+/**
+ * Checks if a specific Playwright browser channel is installed.
+ * @param {string} channel - The name of the browser channel to check (e.g., 'chrome', 'msedge').
+ * @returns {Promise<boolean>} Resolves to true if the browser channel is installed, false otherwise.
+ */
+async function isBrowserInstalled(channel) {
+  try {
+    await chromium.executablePath({ channel });
+    return true;
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Runs a test function across multiple browser channels (Chromium, Chrome, Edge).
+ * For each channel, checks if the browser is installed, runs the test if so, and reports results.
+ * Skips tests for browsers that are not installed.
+ * Exits the process with code 0 if all tests pass, or 1 if any test fails.
+ *
+ * @param {(channel: string|undefined) => Promise<boolean>} testFn -
+ *   An async callback that receives the browser channel name (or undefined for default Chromium)
+ *   and returns a Promise resolving to true (pass) or false (fail).
+ * @param {string} label - A label describing the test, used in output messages.
+ */
+export async function runMultiBrowserTest(testFn, label) {
+  const channels = [
+    { channel: undefined, label: 'chromium' },
+    { channel: 'chrome', label: 'chrome' },
+    { channel: 'msedge', label: 'edge' },
+  ];
+
+  let allPassed = true;
+
+  for (const { channel, label: channelLabel } of channels) {
+    let installed = true;
+    if (channel) {
+      installed = await isBrowserInstalled(channel);
+    }
+    if (!installed) {
+      console.log(`\nSKIPPING ${channelLabel.toUpperCase()} TEST: Browser not installed (xfail)`);
+      continue;
+    }
+    console.log(`\n[runMultiBrowserTest] Running "${label}" on browser: ${channelLabel}`);
+    const success = await testFn(channel);
+    allPassed = allPassed && success;
+  }
+
+  console.log(`\n${label} RESULT: ${allPassed ? 'WORKING ✅' : 'FAILED ❌'}`);
+  process.exit(allPassed ? 0 : 1);
+}
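
For context on how the new helper is meant to be consumed, here is a minimal sketch of an engine test wired through `runMultiBrowserTest`. The file name `tests/test-example.js`, the target URL, and the trivial pass condition are illustrative assumptions, not part of the change above; only the import path, the `channel` convention, and the true/false pass contract come from the diff.

```js
// tests/test-example.js (hypothetical) — sketch of a test that plugs into runMultiBrowserTest
import { chromium } from 'playwright';
import { runMultiBrowserTest } from './test-utilities.js';

// The test function receives a channel (undefined = bundled Chromium, 'chrome', or 'msedge')
// and must resolve to true (pass) or false (fail), matching the other refactored tests.
async function testExample(channel) {
  const launchOptions = { headless: true };
  if (channel) launchOptions.channel = channel; // same channel convention as test-bing/test-brave

  const browser = await chromium.launch(launchOptions);
  try {
    const page = await browser.newPage();
    await page.goto('https://example.com', { waitUntil: 'domcontentloaded', timeout: 10000 });
    return (await page.title()).length > 0; // illustrative pass condition
  } catch {
    return false;
  } finally {
    await browser.close();
  }
}

// Runs on Chromium, then Chrome and Edge if installed, and exits 0/1 with the aggregate result.
runMultiBrowserTest(testExample, 'EXAMPLE');
```

Because `runMultiBrowserTest` calls `process.exit` itself, a test script needs no `.then(...)`/`process.exit` footer of its own, which is exactly the boilerplate this change removes from each existing test.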