Skip to content

Commit 6a71c29

Browse files
committed
feat: add continue_on_failure option to SitemapExtractor and related components
1 parent bb1e27c commit 6a71c29

File tree

7 files changed

+45
-5
lines changed

7 files changed

+45
-5
lines changed

libs/README.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -328,6 +328,7 @@ For sitemap sources, additional parameters can be provided, e.g.:
328328
- `web_path`: The URL of the XML sitemap to crawl
329329
- `filter_urls`: JSON array of URL patterns to filter pages (optional)
330330
- `header_template`: JSON object for custom HTTP headers (optional)
331+
- `continue_on_failure`: Whether to skip pages that fail to load instead of aborting the crawl (optional, default: `true`)
331332

332333
Technically, all parameters of the `SitemapLoader` from LangChain can be provided.
333334

libs/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -118,6 +118,9 @@ async def aextract_content(
118118
if meta_function is not None:
119119
sitemap_loader_parameters["meta_function"] = meta_function
120120

121+
if "continue_on_failure" not in sitemap_loader_parameters:
122+
sitemap_loader_parameters["continue_on_failure"] = True
123+
121124
document_loader = SitemapLoader(**sitemap_loader_parameters)
122125
documents = []
123126
try:
@@ -162,6 +165,19 @@ def _parse_sitemap_loader_parameters(
162165
sitemap_loader_parameters[x.key] = json.loads(x.value)
163166
except (json.JSONDecodeError, TypeError):
164167
sitemap_loader_parameters[x.key] = x.value
168+
elif x.key == "continue_on_failure":
169+
if isinstance(x.value, bool):
170+
sitemap_loader_parameters[x.key] = x.value
171+
elif isinstance(x.value, str):
172+
normalized = x.value.strip().lower()
173+
if normalized in ("true", "1", "yes", "y", "on"):
174+
sitemap_loader_parameters[x.key] = True
175+
elif normalized in ("false", "0", "no", "n", "off"):
176+
sitemap_loader_parameters[x.key] = False
177+
else:
178+
sitemap_loader_parameters[x.key] = x.value
179+
else:
180+
sitemap_loader_parameters[x.key] = x.value
165181
else:
166182
sitemap_loader_parameters[x.key] = int(x.value) if x.value.isdigit() else x.value
167183
return sitemap_loader_parameters, parser_override

libs/extractor-api-lib/tests/sitemap_extractor_test.py

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -269,7 +269,10 @@ async def test_aextract_content_minimal_parameters(self, mock_sitemap_loader_cla
269269

270270
# Verify
271271
assert len(result) == 1
272-
mock_sitemap_loader_class.assert_called_once_with(web_path="https://example.com/sitemap.xml")
272+
mock_sitemap_loader_class.assert_called_once_with(
273+
web_path="https://example.com/sitemap.xml",
274+
continue_on_failure=True,
275+
)
273276

274277
@pytest.mark.asyncio
275278
@patch("extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader")
@@ -428,8 +431,8 @@ async def test_aextract_content_edge_case_empty_kwargs(self, mock_sitemap_loader
428431

429432
# Verify
430433
assert result == []
431-
# Should still call SitemapLoader but with no additional parameters
432-
mock_sitemap_loader_class.assert_called_once_with()
434+
# Should still call SitemapLoader but with default failure handling
435+
mock_sitemap_loader_class.assert_called_once_with(continue_on_failure=True)
433436

434437
@pytest.mark.asyncio
435438
@patch("extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader")
@@ -441,7 +444,7 @@ async def test_aextract_content_mixed_parameter_types(self, mock_sitemap_loader_
441444
kwargs=[
442445
KeyValuePair(key="web_path", value="https://example.com/sitemap.xml"),
443446
KeyValuePair(key="max_depth", value="3"), # Will be converted to int
444-
KeyValuePair(key="continue_on_failure", value="true"), # Will remain string
447+
KeyValuePair(key="continue_on_failure", value="true"), # Will be converted to bool
445448
KeyValuePair(key="filter_urls", value='["pattern1", "pattern2"]'), # Will be parsed as JSON
446449
KeyValuePair(
447450
key="header_template", value='{"Authorization": "Bearer token123"}'
@@ -462,7 +465,7 @@ async def test_aextract_content_mixed_parameter_types(self, mock_sitemap_loader_
462465
call_args = mock_sitemap_loader_class.call_args[1]
463466
assert call_args["web_path"] == "https://example.com/sitemap.xml"
464467
assert call_args["max_depth"] == 3 # Converted to int
465-
assert call_args["continue_on_failure"] == "true" # Remained string
468+
assert call_args["continue_on_failure"] is True # Converted to bool
466469
assert call_args["filter_urls"] == ["pattern1", "pattern2"] # Parsed JSON
467470
assert call_args["header_template"] == {"Authorization": "Bearer token123"} # Parsed JSON
468471
assert call_args["custom_param"] == "custom_value" # Remained string

services/frontend/libs/admin-app/data-access/document.api.ts

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@ export interface SitemapConfig {
2424
headerTemplate: string;
2525
name: string;
2626
parser?: 'docusaurus' | 'astro' | 'generic';
27+
continueOnFailure?: boolean;
2728
}
2829

2930
export class DocumentAPI {
@@ -99,6 +100,10 @@ export class DocumentAPI {
99100
payload.push({ key: 'sitemap_parser', value: config.parser });
100101
}
101102

103+
if (typeof config.continueOnFailure === 'boolean') {
104+
payload.push({ key: 'continue_on_failure', value: String(config.continueOnFailure) });
105+
}
106+
102107
// add filter_urls only if provided
103108
if (config.filterUrls && config.filterUrls.trim()) {
104109
// Convert multiline string to array and filter out empty lines

services/frontend/libs/admin-app/feature-document/DocumentUploadContainer.vue

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ const confluenceCql = ref('');
3333
const sitemapFilterUrls = ref('');
3434
const sitemapHeaderTemplate = ref('');
3535
const sitemapParser = ref<'docusaurus' | 'astro' | 'generic' | undefined>(undefined);
36+
const sitemapContinueOnFailure = ref(true);
3637
3738
const error = computed(() => store.error);
3839
@@ -102,6 +103,7 @@ const handleConfluenceUpload = () => {
102103
filterUrls: sitemapFilterUrls.value,
103104
headerTemplate: sitemapHeaderTemplate.value,
104105
parser,
106+
continueOnFailure: sitemapContinueOnFailure.value,
105107
});
106108
}
107109
@@ -237,6 +239,15 @@ const getErrorMessage = (errorType: string) => {
237239
<textarea v-model="sitemapFilterUrls" placeholder="Filter URLs (optional) - one regex pattern per line" class="textarea textarea-bordered w-full" rows="3"></textarea>
238240
<label for="sitemapHeaderTemplate" class="sr-only">Headers JSON</label>
239241
<textarea v-model="sitemapHeaderTemplate" placeholder="Headers (optional) - JSON format: {&quot;Authorization&quot;: &quot;Bearer token&quot;}" class="textarea textarea-bordered w-full" rows="2"></textarea>
242+
<label class="flex items-center justify-between text-sm">
243+
<span>{{ t('documents.sitemapContinueOnFailure') }}</span>
244+
<input
245+
v-model="sitemapContinueOnFailure"
246+
type="checkbox"
247+
class="checkbox checkbox-sm"
248+
:title="t('documents.sitemapContinueOnFailureHint')"
249+
/>
250+
</label>
240251
</div>
241252
<p class="text-xs opacity-50 mb-4">{{ t('documents.sitemapLoadDescription') }}</p>
242253
<button class="btn btn-sm btn-accent" @click="handleSitemapUpload">

services/frontend/libs/i18n/admin/de.json

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,8 @@
2121
"sitemapParserAstro": "Parser: Astro / Starlight",
2222
"sitemapParserDocusaurus": "Parser: Docusaurus",
2323
"sitemapParserGeneric": "Parser: Generisch",
24+
"sitemapContinueOnFailure": "Bei Fehlern fortfahren",
25+
"sitemapContinueOnFailureHint": "Fehlgeschlagene Seiten überspringen, statt den Crawl abzubrechen",
2426
"loadSitemap": "Sitemap laden",
2527
"fileTypeNotAllowedTitle": "Dateityp nicht erlaubt",
2628
"fileTypeNotAllowedDescription": "Erlaubte Dateitypen:",

services/frontend/libs/i18n/admin/en.json

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,8 @@
2323
"sitemapParserAstro": "Parser: Astro / Starlight",
2424
"sitemapParserDocusaurus": "Parser: Docusaurus",
2525
"sitemapParserGeneric": "Parser: Generic",
26+
"sitemapContinueOnFailure": "Continue on failure",
27+
"sitemapContinueOnFailureHint": "Skip pages that fail to load instead of aborting the crawl",
2628
"loadSitemap": "Load Sitemap",
2729
"select": "Select",
2830
"chat": "Start chat",

0 commit comments

Comments
 (0)