Skip to content

Commit 92279bf

Browse files
committed
Fix for Bing serving irrelevant results
1 parent 6ffd8dc commit 92279bf

File tree

2 files changed

+261
-28
lines changed

2 files changed

+261
-28
lines changed

README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,12 @@ The server supports several environment variables for configuration:
119119
- **`BROWSER_TYPES`**: Comma-separated list of browser types to use (default: 'chromium,firefox', options: chromium, firefox, webkit)
120120
- **`BROWSER_FALLBACK_THRESHOLD`**: Number of axios failures before using browser fallback (default: 3)
121121

122+
### Search Quality and Engine Selection
123+
124+
- **`ENABLE_RELEVANCE_CHECKING`**: Enable/disable search result quality validation (default: true)
125+
- **`RELEVANCE_THRESHOLD`**: Minimum quality score for search results (0.0-1.0, default: 0.3)
126+
- **`FORCE_MULTI_ENGINE_SEARCH`**: Query every search engine instead of stopping at the first acceptable one, then return the result set with the highest quality score (default: false)
127+
122128
## Troubleshooting
123129

124130
### Slow Response Times
@@ -133,6 +139,13 @@ The server supports several environment variables for configuration:
133139
- **Network restrictions**: Some networks block browser automation - try different network or VPN
134140
- **HTTP/2 issues**: The server automatically handles HTTP/2 protocol errors with fallback to HTTP/1.1
135141

142+
### Search Quality Issues
143+
- **Irrelevant results**: The server now includes automatic quality validation to detect irrelevant results
144+
- **Enable quality checking**: Set `ENABLE_RELEVANCE_CHECKING=true` (enabled by default)
145+
- **Adjust quality threshold**: Set `RELEVANCE_THRESHOLD=0.5` (default: 0.3) to require a higher quality score before an engine's results are accepted
146+
- **Force multi-engine search**: Set `FORCE_MULTI_ENGINE_SEARCH=true` to try all engines and return the best results
147+
- **Disable quality checking**: Set `ENABLE_RELEVANCE_CHECKING=false` to disable validation (not recommended)
148+
136149
### Memory Usage
137150
- **Automatic cleanup**: Browsers are automatically cleaned up after each operation to prevent memory leaks
138151
- **Limit browsers**: Reduce `MAX_BROWSERS` (default: 3)

src/search-engine.ts

Lines changed: 248 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -24,21 +24,67 @@ export class SearchEngine {
2424
return await this.rateLimiter.execute(async () => {
2525
console.log(`[SearchEngine] Starting search with multiple engines...`);
2626

27+
// Configuration from environment variables
28+
const enableQualityCheck = process.env.ENABLE_RELEVANCE_CHECKING !== 'false';
29+
const qualityThreshold = parseFloat(process.env.RELEVANCE_THRESHOLD || '0.3');
30+
const forceMultiEngine = process.env.FORCE_MULTI_ENGINE_SEARCH === 'true';
31+
32+
console.log(`[SearchEngine] Quality checking: ${enableQualityCheck}, threshold: ${qualityThreshold}, multi-engine: ${forceMultiEngine}`);
33+
2734
// Try multiple approaches to get search results, starting with most reliable
2835
const approaches = [
2936
{ method: this.tryBrowserBingSearch.bind(this), name: 'Browser Bing' },
3037
{ method: this.tryBrowserBraveSearch.bind(this), name: 'Browser Brave' },
3138
{ method: this.tryDuckDuckGoSearch.bind(this), name: 'Axios DuckDuckGo' }
3239
];
3340

41+
let bestResults: SearchResult[] = [];
42+
let bestEngine = 'None';
43+
let bestQuality = 0;
44+
3445
for (const approach of approaches) {
3546
try {
36-
// Use shorter timeout per approach to allow trying all methods
37-
const approachTimeout = Math.min(timeout / 2, 6000); // Max 6 seconds per approach
47+
// Use more aggressive timeouts for faster fallback
48+
const approachTimeout = Math.min(timeout / 3, 4000); // Max 4 seconds per approach for faster fallback
3849
const results = await approach.method(sanitizedQuery, numResults, approachTimeout);
3950
if (results.length > 0) {
4051
console.log(`[SearchEngine] Found ${results.length} results with ${approach.name}`);
41-
return { results, engine: approach.name };
52+
53+
// Validate result quality to detect irrelevant results
54+
const qualityScore = enableQualityCheck ? this.assessResultQuality(results, sanitizedQuery) : 1.0;
55+
console.log(`[SearchEngine] ${approach.name} quality score: ${qualityScore.toFixed(2)}/1.0`);
56+
57+
// Track the best results so far
58+
if (qualityScore > bestQuality) {
59+
bestResults = results;
60+
bestEngine = approach.name;
61+
bestQuality = qualityScore;
62+
}
63+
64+
// If quality is excellent, return immediately (unless forcing multi-engine)
65+
if (qualityScore >= 0.8 && !forceMultiEngine) {
66+
console.log(`[SearchEngine] Excellent quality results from ${approach.name}, returning immediately`);
67+
return { results, engine: approach.name };
68+
}
69+
70+
// If quality is acceptable and this isn't Bing (first engine), return
71+
if (qualityScore >= qualityThreshold && approach.name !== 'Browser Bing' && !forceMultiEngine) {
72+
console.log(`[SearchEngine] Good quality results from ${approach.name}, using as primary`);
73+
return { results, engine: approach.name };
74+
}
75+
76+
// If this is the last engine or quality is acceptable, prepare to return
77+
if (approach === approaches[approaches.length - 1]) {
78+
if (bestQuality >= qualityThreshold || !enableQualityCheck) {
79+
console.log(`[SearchEngine] Using best results from ${bestEngine} (quality: ${bestQuality.toFixed(2)})`);
80+
return { results: bestResults, engine: bestEngine };
81+
} else if (bestResults.length > 0) {
82+
console.log(`[SearchEngine] Warning: Low quality results from all engines, using best available from ${bestEngine}`);
83+
return { results: bestResults, engine: bestEngine };
84+
}
85+
} else {
86+
console.log(`[SearchEngine] ${approach.name} results quality: ${qualityScore.toFixed(2)}, continuing to try other engines...`);
87+
}
4288
}
4389
} catch (error) {
4490
console.error(`[SearchEngine] ${approach.name} approach failed:`, error);
@@ -118,48 +164,134 @@ export class SearchEngine {
118164
const browser = await this.browserPool.getBrowser();
119165

120166
try {
167+
// Enhanced browser context with more realistic fingerprinting
121168
const context = await browser.newContext({
122169
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
123170
viewport: { width: 1366, height: 768 },
124171
locale: 'en-US',
125172
timezoneId: 'America/New_York',
173+
colorScheme: 'light',
174+
deviceScaleFactor: 1,
175+
hasTouch: false,
176+
isMobile: false,
177+
extraHTTPHeaders: {
178+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
179+
'Accept-Language': 'en-US,en;q=0.9',
180+
'Accept-Encoding': 'gzip, deflate, br',
181+
'DNT': '1',
182+
'Upgrade-Insecure-Requests': '1',
183+
'Sec-Fetch-Dest': 'document',
184+
'Sec-Fetch-Mode': 'navigate',
185+
'Sec-Fetch-Site': 'none'
186+
}
126187
});
127188

128189
const page = await context.newPage();
129190

130-
// Navigate to Bing search
131-
const searchUrl = `https://www.bing.com/search?q=${encodeURIComponent(query)}&count=${Math.min(numResults, 10)}`;
132-
console.log(`[SearchEngine] Browser navigating to Bing: ${searchUrl}`);
133-
134-
await page.goto(searchUrl, {
135-
waitUntil: 'domcontentloaded',
136-
timeout: timeout
137-
});
138-
139-
// Wait for search results to load
191+
// Try enhanced Bing search with proper web interface flow
140192
try {
141-
await page.waitForSelector('.b_algo, .b_result', { timeout: 3000 });
142-
} catch {
143-
console.log(`[SearchEngine] Browser Bing results selector not found, proceeding anyway`);
193+
const results = await this.tryEnhancedBingSearch(page, query, numResults, timeout);
194+
await context.close();
195+
return results;
196+
} catch (enhancedError) {
197+
console.log(`[SearchEngine] Enhanced Bing search failed, trying fallback: ${enhancedError instanceof Error ? enhancedError.message : 'Unknown error'}`);
198+
199+
// Fallback to direct URL approach with enhanced parameters
200+
const results = await this.tryDirectBingSearch(page, query, numResults, timeout);
201+
await context.close();
202+
return results;
144203
}
145-
146-
// Get the page content
147-
const html = await page.content();
148-
149-
await context.close();
150-
151-
console.log(`[SearchEngine] Browser Bing got HTML with length: ${html.length}`);
152-
153-
const results = this.parseBingResults(html, numResults);
154-
console.log(`[SearchEngine] Browser Bing parsed ${results.length} results`);
155-
156-
return results;
157204
} catch (error) {
158205
console.error(`[SearchEngine] Browser Bing search failed:`, error);
159206
throw error;
160207
}
161208
}
162209

210+
/**
 * Runs a Bing search by driving the web interface instead of opening a
 * search URL directly: loads the Bing homepage, types the query into the
 * search box, clicks the search icon and parses the resulting page.
 *
 * @param page - Browser page to drive (untyped here; the calls used are
 *   `goto`, `waitForTimeout`, `waitForSelector`, `fill`, `waitForNavigation`,
 *   `click` and `content`).
 * @param query - Search text typed into the search box.
 * @param numResults - Passed through to `parseBingResults`.
 * @param timeout - Navigation timeout in milliseconds; the homepage load
 *   gets half of it, the post-submit navigation gets all of it.
 * @returns Whatever `parseBingResults` extracts from the result page HTML.
 * @throws Rethrows any error from locating, filling or submitting the search
 *   form; errors from the initial homepage navigation propagate as well.
 */
private async tryEnhancedBingSearch(page: any, query: string, numResults: number, timeout: number): Promise<SearchResult[]> {
  const searchBoxSelector = '#sb_form_q';
  const resultItemSelector = '.b_algo, .b_result';

  console.log(`[SearchEngine] Trying enhanced Bing search via web interface...`);

  // Open the homepage first so the search starts from a normal session.
  const homepageOptions = { waitUntil: 'domcontentloaded', timeout: timeout / 2 };
  await page.goto('https://www.bing.com', homepageOptions);

  // Short fixed pause before touching the page.
  await page.waitForTimeout(500);

  try {
    await page.waitForSelector(searchBoxSelector, { timeout: 2000 });
    await page.fill(searchBoxSelector, query);

    // Start listening for the navigation before clicking so it is not missed.
    const navigation = page.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: timeout });
    const submit = page.click('#search_icon');
    await Promise.all([navigation, submit]);
  } catch (formError) {
    // Log, then let the caller decide how to recover.
    console.log(`[SearchEngine] Search form submission failed, falling back to URL navigation`);
    throw formError;
  }

  // Best effort: a missing result selector is logged, not treated as fatal.
  try {
    await page.waitForSelector(resultItemSelector, { timeout: 3000 });
  } catch {
    console.log(`[SearchEngine] Enhanced Bing results selector not found, proceeding anyway`);
  }

  const pageHtml = await page.content();
  console.log(`[SearchEngine] Enhanced Bing got HTML with length: ${pageHtml.length}`);

  const parsed = this.parseBingResults(pageHtml, numResults);
  console.log(`[SearchEngine] Enhanced Bing parsed ${parsed.length} results`);

  return parsed;
}
253+
254+
/**
 * Runs a Bing search by navigating straight to a search URL that carries
 * extra query parameters (`form`, `sp`, `qs`, and a freshly generated `cvid`).
 *
 * @param page - Browser page to drive (untyped here; the calls used are
 *   `goto`, `waitForSelector` and `content`).
 * @param query - Search text; URI-encoded before being placed in the URL.
 * @param numResults - Requested result count; the `count` URL parameter is
 *   capped at 10, and the uncapped value is passed to `parseBingResults`.
 * @param timeout - Navigation timeout in milliseconds.
 * @returns Whatever `parseBingResults` extracts from the result page HTML.
 * @throws Propagates navigation errors from `page.goto`.
 */
private async tryDirectBingSearch(page: any, query: string, numResults: number, timeout: number): Promise<SearchResult[]> {
  console.log(`[SearchEngine] Trying direct Bing search with enhanced parameters...`);

  // 32-hex-digit identifier sent as the `cvid` parameter.
  const cvid = this.generateConversationId();

  const encodedQuery = encodeURIComponent(query);
  const requestedCount = Math.min(numResults, 10);
  const searchUrl = `https://www.bing.com/search?q=${encodedQuery}&count=${requestedCount}&form=QBLH&sp=-1&qs=n&cvid=${cvid}`;
  console.log(`[SearchEngine] Browser navigating to enhanced Bing URL: ${searchUrl}`);

  const navigationOptions = { waitUntil: 'domcontentloaded', timeout: timeout };
  await page.goto(searchUrl, navigationOptions);

  // Best effort: a missing result selector is logged, not treated as fatal.
  try {
    await page.waitForSelector('.b_algo, .b_result', { timeout: 3000 });
  } catch {
    console.log(`[SearchEngine] Direct Bing results selector not found, proceeding anyway`);
  }

  const pageHtml = await page.content();
  console.log(`[SearchEngine] Direct Bing got HTML with length: ${pageHtml.length}`);

  const parsed = this.parseBingResults(pageHtml, numResults);
  console.log(`[SearchEngine] Direct Bing parsed ${parsed.length} results`);

  return parsed;
}
284+
285+
/**
 * Builds a random 32-character string of uppercase hexadecimal digits,
 * used as the `cvid` parameter of a Bing search URL.
 *
 * Uses `Math.random`, so the value is not cryptographically strong.
 *
 * @returns A 32-character string drawn from `0-9` and `A-F`.
 */
private generateConversationId(): string {
  const hexDigits = '0123456789ABCDEF';
  const idLength = 32;

  return Array.from({ length: idLength }, () =>
    hexDigits.charAt(Math.floor(Math.random() * hexDigits.length))
  ).join('');
}
294+
163295

164296
private async tryDuckDuckGoSearch(query: string, numResults: number, timeout: number): Promise<SearchResult[]> {
165297
console.log(`[SearchEngine] Trying DuckDuckGo as fallback...`);
@@ -665,6 +797,94 @@ export class SearchEngine {
665797
return url;
666798
}
667799

800+
/**
 * Estimates how relevant a batch of search results is to the query.
 *
 * Each result is scored as
 * `min(1, matchedKeywords / totalKeywords + 0.3 * matchedPhrases)`, minus 0.2
 * for every off-topic term found in its title, description or URL (floored
 * at 0). The return value is the mean of the per-result scores.
 *
 * Keywords are the query's words longer than two characters that are not
 * stop words. Phrases are each pair of adjacent keywords, plus the first
 * three keywords when there are at least three.
 *
 * @param results - Results returned by a single search engine.
 * @param originalQuery - The query the results were fetched for.
 * @returns 0 when `results` is empty, 0.5 when the query yields no keywords,
 *   otherwise the mean per-result score in the range 0..1.
 */
private assessResultQuality(results: SearchResult[], originalQuery: string): number {
  if (results.length === 0) return 0;

  // Generic English stop words only. Topic-bearing words such as "group" or
  // "members" are deliberately not listed: dropping them would discard
  // meaningful query terms.
  const commonWords = new Set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can']);

  const normalizedQuery = originalQuery.toLowerCase();
  const queryWords = normalizedQuery
    .replace(/[^\w\s]/g, ' ')
    .split(/\s+/)
    .filter(word => word.length > 2 && !commonWords.has(word));

  if (queryWords.length === 0) return 0.5; // Default score if no meaningful keywords

  console.log(`[SearchEngine] Quality assessment - Query keywords: [${queryWords.join(', ')}]`);

  // Phrases depend only on the query, so build them once rather than per result.
  const queryPhrases: string[] = [];
  for (let i = 0; i < queryWords.length - 1; i++) {
    queryPhrases.push(queryWords.slice(i, i + 2).join(' '));
  }
  if (queryWords.length >= 3) {
    queryPhrases.push(queryWords.slice(0, 3).join(' '));
  }

  // Terms that usually signal an unrelated page.
  const offTopicTerms = [
    'recipe', 'cooking', 'food', 'restaurant', 'menu',
    'weather', 'temperature', 'forecast',
    'shopping', 'sale', 'price', 'buy', 'store',
    'movie', 'film', 'tv show', 'entertainment',
    'sport', 'game', 'score', 'team',
    'fashion', 'clothing', 'style',
    'travel', 'hotel', 'flight', 'vacation',
    'car', 'vehicle', 'automotive',
    'real estate', 'property', 'house', 'apartment'
  ];

  // Match whole words (with an optional plural "s") so that e.g. "car" does
  // not fire on "care" or "card", and skip any term the query itself asks
  // about: a "weather forecast" query must not be penalised for weather pages.
  const irrelevantPatterns = offTopicTerms
    .map(term => new RegExp(`\\b${term}s?\\b`, 'i'))
    .filter(pattern => !pattern.test(normalizedQuery));

  let totalScore = 0;

  for (const result of results) {
    // SearchResult is declared elsewhere; guard against a parser leaving a
    // field undefined so one malformed result cannot throw and fail the engine.
    const title = result.title ?? '';
    const combinedText = `${title} ${result.description ?? ''} ${result.url ?? ''}`.toLowerCase();

    const phraseMatches = queryPhrases.filter(phrase => combinedText.includes(phrase)).length;
    const keywordMatches = queryWords.filter(keyword => combinedText.includes(keyword)).length;

    const keywordRatio = keywordMatches / queryWords.length;
    const phraseBonus = phraseMatches * 0.3; // Bonus for phrase matches
    const resultScore = Math.min(1.0, keywordRatio + phraseBonus);

    const penalty = irrelevantPatterns.filter(pattern => pattern.test(combinedText)).length * 0.2;
    const finalScore = Math.max(0, resultScore - penalty);

    console.log(`[SearchEngine] Result "${title.substring(0, 50)}..." - Score: ${finalScore.toFixed(2)} (keywords: ${keywordMatches}/${queryWords.length}, phrases: ${phraseMatches}, penalty: ${penalty.toFixed(2)})`);

    totalScore += finalScore;
  }

  // results is non-empty here, so the division is safe.
  return totalScore / results.length;
}
887+
668888
/**
 * Releases browser resources by delegating to the browser pool's
 * `closeAll()` and waiting for it to finish.
 */
async closeAll(): Promise<void> {
  const pool = this.browserPool;
  await pool.closeAll();
}

0 commit comments

Comments
 (0)