Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions haystack_experimental/components/preprocessors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@

_import_structure = {
"embedding_based_document_splitter": ["EmbeddingBasedDocumentSplitter"],
"md_header_level_inferrer": ["MarkdownHeaderLevelInferrer"],
}

if TYPE_CHECKING:
from .embedding_based_document_splitter import EmbeddingBasedDocumentSplitter
from .md_header_level_inferrer import MarkdownHeaderLevelInferrer

else:
sys.modules[__name__] = LazyImporter(name=__name__, module_file=__file__, import_structure=_import_structure)
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class MarkdownHeaderLevelInferrer:
from haystack_experimental.components.preprocessors import MarkdownHeaderLevelInferrer

# Create a document with uniform header levels
text = "## Title\nSome content\n## Section\nMore content\n## Subsection\nFinal content"
text = "## Title\n## Subheader\nSection\n## Subheader\nMore Content"
doc = Document(content=text)

# Initialize the inferrer and process the document
Expand All @@ -33,7 +33,7 @@ class MarkdownHeaderLevelInferrer:

# The headers are now normalized with proper hierarchy
print(result["documents"][0].content)
> # Title\nSome content\n## Section\nMore content\n### Subsection\nFinal content
> # Title\n## Subheader\nSection\n## Subheader\nMore Content
```
"""

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

from haystack import Document
from haystack_experimental.components.preprocessors.md_header_level_inferrer import MarkdownHeaderLevelInferrer
from haystack_experimental.components.preprocessors import MarkdownHeaderLevelInferrer


def test_single_header_level_inference():
Expand Down Expand Up @@ -124,7 +124,7 @@ def test_multiple_documents():
inferrer = MarkdownHeaderLevelInferrer()
docs = [Document(content=text1), Document(content=text2)]
result = inferrer.run(docs)

assert len(result["documents"]) == 2
assert result["documents"][0].content == "# Title 1\nContent 1"
assert result["documents"][1].content == "# Title 2\nContent 2"
Expand Down Expand Up @@ -154,7 +154,7 @@ def test_very_long_content():
inferrer = MarkdownHeaderLevelInferrer()
doc = Document(content=text)
result = inferrer.run([doc])

# verify first header becomes level 1, others follow the pattern
content = result["documents"][0].content
assert content.startswith("# Header 0")
Expand Down
Loading