diff --git a/haystack_experimental/components/preprocessors/__init__.py b/haystack_experimental/components/preprocessors/__init__.py index c377db50..d9c66e64 100644 --- a/haystack_experimental/components/preprocessors/__init__.py +++ b/haystack_experimental/components/preprocessors/__init__.py @@ -9,10 +9,12 @@ _import_structure = { "embedding_based_document_splitter": ["EmbeddingBasedDocumentSplitter"], + "md_header_level_inferrer": ["MarkdownHeaderLevelInferrer"], } if TYPE_CHECKING: from .embedding_based_document_splitter import EmbeddingBasedDocumentSplitter + from .md_header_level_inferrer import MarkdownHeaderLevelInferrer else: sys.modules[__name__] = LazyImporter(name=__name__, module_file=__file__, import_structure=_import_structure) diff --git a/haystack_experimental/components/preprocessors/md_header_level_inferrer.py b/haystack_experimental/components/preprocessors/md_header_level_inferrer.py index 004d4d35..763fed2a 100644 --- a/haystack_experimental/components/preprocessors/md_header_level_inferrer.py +++ b/haystack_experimental/components/preprocessors/md_header_level_inferrer.py @@ -24,7 +24,7 @@ class MarkdownHeaderLevelInferrer: from haystack_experimental.components.preprocessors import MarkdownHeaderLevelInferrer # Create a document with uniform header levels - text = "## Title\nSome content\n## Section\nMore content\n## Subsection\nFinal content" + text = "## Title\n## Subheader\nSection\n## Subheader\nMore Content" doc = Document(content=text) # Initialize the inferrer and process the document @@ -33,7 +33,7 @@ class MarkdownHeaderLevelInferrer: # The headers are now normalized with proper hierarchy print(result["documents"][0].content) - > # Title\nSome content\n## Section\nMore content\n### Subsection\nFinal content + > # Title\n## Subheader\nSection\n## Subheader\nMore Content ``` """ diff --git a/test/components/preprocessors/test_markdown_header_level_inferrer.py b/test/components/preprocessors/test_markdown_header_level_inferrer.py index 220f5cde..5aeaadf8 100644 --- a/test/components/preprocessors/test_markdown_header_level_inferrer.py +++ b/test/components/preprocessors/test_markdown_header_level_inferrer.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from haystack import Document -from haystack_experimental.components.preprocessors.md_header_level_inferrer import MarkdownHeaderLevelInferrer +from haystack_experimental.components.preprocessors import MarkdownHeaderLevelInferrer def test_single_header_level_inference(): @@ -124,7 +124,7 @@ def test_multiple_documents(): inferrer = MarkdownHeaderLevelInferrer() docs = [Document(content=text1), Document(content=text2)] result = inferrer.run(docs) - + assert len(result["documents"]) == 2 assert result["documents"][0].content == "# Title 1\nContent 1" assert result["documents"][1].content == "# Title 2\nContent 2" @@ -154,7 +154,7 @@ def test_very_long_content(): inferrer = MarkdownHeaderLevelInferrer() doc = Document(content=text) result = inferrer.run([doc]) - + # verify first header becomes level 1, others follow the pattern content = result["documents"][0].content assert content.startswith("# Header 0")