Skip to content

Commit c15bc58

Browse files
committed
fix: web search filtering for empty domain lists and remove SearXNG
1 parent 610c303 commit c15bc58

File tree

1 file changed

+55
-72
lines changed

1 file changed

+55
-72
lines changed

src/main/kotlin/ee/carlrobert/codegpt/agent/tools/WebSearchTool.kt

Lines changed: 55 additions & 72 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,8 @@ import kotlinx.coroutines.withContext
88
import kotlinx.serialization.SerialName
99
import kotlinx.serialization.Serializable
1010
import org.jsoup.Jsoup
11+
import java.net.URI
12+
import java.net.URLDecoder
1113
import java.net.URLEncoder
1214
import java.nio.charset.StandardCharsets
1315
import java.time.LocalDate
@@ -62,7 +64,6 @@ IMPORTANT - Use the correct year in search queries:
6264
hookManager = hookManager,
6365
sessionId = sessionId,
6466
) {
65-
6667
@Serializable
6768
data class Args(
6869
@property:LLMDescription(
@@ -96,38 +97,19 @@ IMPORTANT - Use the correct year in search queries:
9697
)
9798

9899
override suspend fun doExecute(args: Args): Result = withContext(Dispatchers.IO) {
99-
try {
100-
val searxResults = searchWithSearxNG(args.query)
101-
if (searxResults.isNotEmpty()) {
102-
val filteredResults =
103-
filterResults(
104-
searxResults,
105-
args.allowedDomains,
106-
args.blockedDomains
107-
)
108-
return@withContext Result(
109-
query = args.query,
110-
results = filteredResults.take(10),
111-
sources = filteredResults.take(10).map { "[${it.title}](${it.url})" }
112-
)
113-
}
114-
} catch (_: Exception) {
115-
// Fall back to DuckDuckGo
116-
}
117-
118100
try {
119101
val duckduckgoResults = searchWithDuckDuckGo(args.query)
120-
val filteredResults =
121-
filterResults(
122-
duckduckgoResults,
123-
args.allowedDomains,
124-
args.blockedDomains
125-
)
126-
return@withContext Result(
102+
val filteredResults = filterResults(
103+
duckduckgoResults,
104+
args.allowedDomains,
105+
args.blockedDomains
106+
).take(10)
107+
val result = Result(
127108
query = args.query,
128-
results = filteredResults.take(10),
129-
sources = filteredResults.take(10).map { "[${it.title}](${it.url})" }
109+
results = filteredResults,
110+
sources = filteredResults.map { "[${it.title}](${it.url})" }
130111
)
112+
return@withContext result
131113
} catch (_: Exception) {
132114
return@withContext Result(
133115
query = args.query,
@@ -148,34 +130,6 @@ IMPORTANT - Use the correct year in search queries:
148130
)
149131
}
150132

151-
private suspend fun searchWithSearxNG(query: String): List<SearchResult> =
152-
withContext(Dispatchers.IO) {
153-
val encodedQuery = URLEncoder.encode(query, StandardCharsets.UTF_8.toString())
154-
val url = "https://searx.space/search?q=$encodedQuery&format=json"
155-
156-
val doc = Jsoup.connect(url)
157-
.userAgent(userAgent)
158-
.timeout(10000)
159-
.ignoreContentType(true)
160-
.get()
161-
162-
val jsonResponse = doc.body().text()
163-
if (jsonResponse.isEmpty()) return@withContext emptyList()
164-
165-
try {
166-
val searxResponse = json.decodeFromString<SearxResponse>(jsonResponse)
167-
searxResponse.results.map { result ->
168-
SearchResult(
169-
title = result.title,
170-
url = result.url,
171-
content = result.content
172-
)
173-
}
174-
} catch (_: Exception) {
175-
emptyList()
176-
}
177-
}
178-
179133
private suspend fun searchWithDuckDuckGo(query: String): List<SearchResult> =
180134
withContext(Dispatchers.IO) {
181135
val encodedQuery = URLEncoder.encode(query, StandardCharsets.UTF_8.toString())
@@ -193,10 +147,11 @@ IMPORTANT - Use the correct year in search queries:
193147
val snippet = resultDiv.selectFirst("a.result__snippet")
194148

195149
if (titleLink != null) {
150+
val resolvedUrl = normalizeSearchResultUrl(titleLink.attr("href"))
196151
results.add(
197152
SearchResult(
198153
title = titleLink.text(),
199-
url = titleLink.attr("href"),
154+
url = resolvedUrl,
200155
content = snippet?.text() ?: ""
201156
)
202157
)
@@ -211,21 +166,61 @@ IMPORTANT - Use the correct year in search queries:
211166
allowedDomains: List<String>?,
212167
blockedDomains: List<String>?
213168
): List<SearchResult> {
169+
val effectiveAllowedDomains = normalizeDomains(allowedDomains)
170+
val effectiveBlockedDomains = normalizeDomains(blockedDomains)
214171
return results.filter { result ->
215172
val urlLower = result.url.lowercase()
216173

217-
blockedDomains?.any { domain ->
174+
effectiveBlockedDomains?.any { domain ->
218175
urlLower.contains(domain.lowercase())
219176
}?.let { if (it) return@filter false }
220177

221-
allowedDomains?.let { allowed ->
178+
effectiveAllowedDomains?.let { allowed ->
222179
allowed.any { domain ->
223180
urlLower.contains(domain.lowercase())
224181
}
225182
} ?: true
226183
}
227184
}
228185

186+
companion object {
187+
/**
 * Converts an href scraped from a search results page into the URL of the actual result.
 *
 * - Blank input is returned unchanged.
 * - Surrounding whitespace is removed and protocol-relative links (`//host/...`) are
 *   given an `https:` scheme.
 * - Links whose host is `duckduckgo.com` or `www.duckduckgo.com` are treated as redirect
 *   links: the first non-blank `uddg` query parameter is URL-decoded and returned.
 * - Anything else, including links that cannot be parsed or whose `uddg` value is missing,
 *   blank, or malformed, falls back to the absolute form of the input.
 *
 * @param url raw `href` value taken from a result link
 * @return the resolved target URL, or the (absolute) input when nothing can be resolved
 */
internal fun normalizeSearchResultUrl(url: String): String {
    if (url.isBlank()) return url

    // Stray whitespace around a scraped attribute would defeat both the
    // protocol-relative check and URI parsing below.
    val trimmedUrl = url.trim()
    val absoluteUrl = if (trimmedUrl.startsWith("//")) "https:$trimmedUrl" else trimmedUrl

    // Best effort: any parsing/decoding failure yields the absolute URL as-is.
    return runCatching {
        val uri = URI(absoluteUrl)
        val host = uri.host?.lowercase()
        if (host != "duckduckgo.com" && host != "www.duckduckgo.com") {
            return@runCatching absoluteUrl
        }

        uri.rawQuery
            ?.split("&")
            ?.firstNotNullOfOrNull { segment ->
                // Segments without '=' (or with an empty key) produce an empty key here.
                val key = segment.substringBefore('=', missingDelimiterValue = "")
                if (key != "uddg") return@firstNotNullOfOrNull null
                // An empty `uddg=` must not turn the result URL into an empty string.
                URLDecoder.decode(segment.substringAfter('='), StandardCharsets.UTF_8)
                    .takeIf { it.isNotBlank() }
            }
            ?: absoluteUrl
    }.getOrDefault(absoluteUrl)
}
215+
216+
/**
 * Cleans up a caller-supplied domain list.
 *
 * Every entry is trimmed and entries that are empty after trimming are dropped.
 * A `null` list, or a list with no usable entries left, yields `null`.
 *
 * @param domains raw domain list, possibly `null`, empty, or containing blank entries
 * @return the trimmed, non-empty entries in their original order, or `null` if there are none
 */
internal fun normalizeDomains(domains: List<String>?): List<String>? {
    if (domains == null) return null

    val cleaned = domains.mapNotNull { raw ->
        raw.trim().takeIf { trimmed -> trimmed.isNotEmpty() }
    }
    return if (cleaned.isEmpty()) null else cleaned
}
222+
}
223+
229224
override fun encodeResultToString(result: Result): String =
230225
buildString {
231226
if (result.results.isEmpty()) {
@@ -243,16 +238,4 @@ IMPORTANT - Use the correct year in search queries:
243238
appendLine()
244239
appendLine("*Click on any result to view the full content*")
245240
}.trimEnd().truncateToolResult()
246-
247-
@Serializable
248-
private data class SearxResponse(
249-
val results: List<SearxResult>
250-
)
251-
252-
@Serializable
253-
private data class SearxResult(
254-
val title: String,
255-
val url: String,
256-
val content: String
257-
)
258241
}

0 commit comments

Comments
 (0)