diff --git a/libs/README.md b/libs/README.md
index bd6c5f71..a917214c 100644
--- a/libs/README.md
+++ b/libs/README.md
@@ -328,6 +328,7 @@ For sitemap sources, additional parameters can be provided, e.g.:
 - `web_path`: The URL of the XML sitemap to crawl
 - `filter_urls`: JSON array of URL patterns to filter pages (optional)
 - `header_template`: JSON object for custom HTTP headers (optional)
+- `continue_on_failure`: Whether to skip pages that fail to load instead of aborting the crawl (optional, default: `true`)
 
 Technically, all parameters of the `SitemapLoader` from LangChain can be provided.
 
diff --git a/libs/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py b/libs/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py
index 94c72dbb..88a87b5c 100644
--- a/libs/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py
+++ b/libs/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py
@@ -118,6 +118,9 @@ async def aextract_content(
         if meta_function is not None:
             sitemap_loader_parameters["meta_function"] = meta_function
 
+        if "continue_on_failure" not in sitemap_loader_parameters:
+            sitemap_loader_parameters["continue_on_failure"] = True
+
         document_loader = SitemapLoader(**sitemap_loader_parameters)
         documents = []
         try:
@@ -162,6 +165,21 @@ def _parse_sitemap_loader_parameters(
                     sitemap_loader_parameters[x.key] = json.loads(x.value)
                 except (json.JSONDecodeError, TypeError):
                     sitemap_loader_parameters[x.key] = x.value
+            elif x.key == "continue_on_failure":
+                sitemap_loader_parameters[x.key] = self._normalize_boolean(x.value)
             else:
                 sitemap_loader_parameters[x.key] = int(x.value) if x.value.isdigit() else x.value
         return sitemap_loader_parameters, parser_override
+
+    def _normalize_boolean(self, value: str) -> Optional[bool]:
+        if isinstance(value, bool):
+            return value
+        if isinstance(value, str):
+            normalized = value.strip().lower()
+            if normalized in ("true", "1", "yes", "y", "on"):
+                return True
+            if normalized in ("false", "0", "no", "n", "off"):
+                return False
+        if isinstance(value, (int, float)):
+            return bool(value)
+        return None
diff --git a/libs/extractor-api-lib/tests/sitemap_extractor_test.py b/libs/extractor-api-lib/tests/sitemap_extractor_test.py
index aac6812d..2cf7b0c5 100644
--- a/libs/extractor-api-lib/tests/sitemap_extractor_test.py
+++ b/libs/extractor-api-lib/tests/sitemap_extractor_test.py
@@ -269,7 +269,10 @@ async def test_aextract_content_minimal_parameters(self, mock_sitemap_loader_cla
 
         # Verify
         assert len(result) == 1
-        mock_sitemap_loader_class.assert_called_once_with(web_path="https://example.com/sitemap.xml")
+        mock_sitemap_loader_class.assert_called_once_with(
+            web_path="https://example.com/sitemap.xml",
+            continue_on_failure=True,
+        )
 
     @pytest.mark.asyncio
     @patch("extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader")
@@ -428,8 +431,8 @@ async def test_aextract_content_edge_case_empty_kwargs(self, mock_sitemap_loader
 
         # Verify
         assert result == []
-        # Should still call SitemapLoader but with no additional parameters
-        mock_sitemap_loader_class.assert_called_once_with()
+        # Should still call SitemapLoader but with default failure handling
+        mock_sitemap_loader_class.assert_called_once_with(continue_on_failure=True)
 
     @pytest.mark.asyncio
     @patch("extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader")
@@ -441,7 +444,7 @@ async def test_aextract_content_mixed_parameter_types(self, mock_sitemap_loader_
             kwargs=[
                 KeyValuePair(key="web_path", value="https://example.com/sitemap.xml"),
                 KeyValuePair(key="max_depth", value="3"),  # Will be converted to int
-                KeyValuePair(key="continue_on_failure", value="true"),  # Will remain string
+                KeyValuePair(key="continue_on_failure", value="true"),  # Will be converted to bool
                 KeyValuePair(key="filter_urls", value='["pattern1", "pattern2"]'),  # Will be parsed as JSON
                 KeyValuePair(
                     key="header_template", value='{"Authorization": "Bearer token123"}'
@@ -462,7 +465,7 @@ async def test_aextract_content_mixed_parameter_types(self, mock_sitemap_loader_
         call_args = mock_sitemap_loader_class.call_args[1]
         assert call_args["web_path"] == "https://example.com/sitemap.xml"
         assert call_args["max_depth"] == 3  # Converted to int
-        assert call_args["continue_on_failure"] == "true"  # Remained string
+        assert call_args["continue_on_failure"] is True  # Converted to bool
         assert call_args["filter_urls"] == ["pattern1", "pattern2"]  # Parsed JSON
         assert call_args["header_template"] == {"Authorization": "Bearer token123"}  # Parsed JSON
         assert call_args["custom_param"] == "custom_value"  # Remained string
diff --git a/services/frontend/libs/admin-app/data-access/document.api.ts b/services/frontend/libs/admin-app/data-access/document.api.ts
index 809e5517..cf691984 100644
--- a/services/frontend/libs/admin-app/data-access/document.api.ts
+++ b/services/frontend/libs/admin-app/data-access/document.api.ts
@@ -24,6 +24,7 @@ export interface SitemapConfig {
   headerTemplate: string;
   name: string;
   parser?: 'docusaurus' | 'astro' | 'generic';
+  continueOnFailure?: boolean;
 }
 
 export class DocumentAPI {
@@ -99,6 +100,10 @@ export class DocumentAPI {
       payload.push({ key: 'sitemap_parser', value: config.parser });
     }
 
+    if (typeof config.continueOnFailure === 'boolean') {
+      payload.push({ key: 'continue_on_failure', value: String(config.continueOnFailure) });
+    }
+
     // add filter_urls only if provided
     if (config.filterUrls && config.filterUrls.trim()) {
       // Convert multiline string to array and filter out empty lines
diff --git a/services/frontend/libs/admin-app/feature-document/DocumentUploadContainer.vue b/services/frontend/libs/admin-app/feature-document/DocumentUploadContainer.vue
index 8f51d020..b01ee8fd 100644
--- a/services/frontend/libs/admin-app/feature-document/DocumentUploadContainer.vue
+++ b/services/frontend/libs/admin-app/feature-document/DocumentUploadContainer.vue
@@ -33,6 +33,7 @@ const confluenceCql = ref('');
 const sitemapFilterUrls = ref('');
 const sitemapHeaderTemplate = ref('');
 const sitemapParser = ref<'docusaurus' | 'astro' | 'generic' | undefined>(undefined);
+const sitemapContinueOnFailure = ref(true);
 
 const error = computed(() => store.error);
 
@@ -102,6 +103,7 @@ const handleConfluenceUpload = () => {
       filterUrls: sitemapFilterUrls.value,
       headerTemplate: sitemapHeaderTemplate.value,
       parser,
+      continueOnFailure: sitemapContinueOnFailure.value,
     });
   }
 
@@ -237,6 +239,15 @@ const getErrorMessage = (errorType: string) => {
+          {{ t('documents.sitemapLoadDescription') }}
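
Below is a minimal, standalone sketch (not part of the diff) of how the new `continue_on_failure` value is resolved end to end. The coercion logic mirrors `_normalize_boolean` from the extractor change above; `resolve_continue_on_failure` is a hypothetical helper added here only to illustrate the default that `aextract_content` injects when the key is absent, and the input strings match what the frontend sends via `String(config.continueOnFailure)`.

```python
# Sketch only: reproduces the diff's boolean coercion outside the extractor class.
from typing import Optional


def normalize_boolean(value) -> Optional[bool]:
    """Mirror of the extractor's _normalize_boolean helper from the diff above."""
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        normalized = value.strip().lower()
        if normalized in ("true", "1", "yes", "y", "on"):
            return True
        if normalized in ("false", "0", "no", "n", "off"):
            return False
    if isinstance(value, (int, float)):
        return bool(value)
    return None


def resolve_continue_on_failure(kwargs: dict) -> Optional[bool]:
    """Hypothetical wrapper: apply the default the extractor uses when the key is missing."""
    if "continue_on_failure" not in kwargs:
        return True
    return normalize_boolean(kwargs["continue_on_failure"])


if __name__ == "__main__":
    print(resolve_continue_on_failure({}))                                # True (default)
    print(resolve_continue_on_failure({"continue_on_failure": "false"}))  # False
    print(resolve_continue_on_failure({"continue_on_failure": "YES"}))    # True
    print(resolve_continue_on_failure({"continue_on_failure": "maybe"}))  # None (unrecognized)
```

One observation from the diff: an unrecognized string normalizes to `None`, and because the key is then present in `sitemap_loader_parameters`, the default-injection check in `aextract_content` does not fire, so `continue_on_failure=None` is what reaches `SitemapLoader` in that case.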