Add the ability to choose between a text (sentence) processor and a Markdown processor when adding a file to the RAG database. This commit was created as part of https://issues.redhat.com/browse/MGMT-21899.

andrej1991 · andrej1991 · commit 1ffe30cce64a · 2025-10-13T11:53:04.000+02:00
diff --git a/src/lightspeed_rag_content/document_processor.py b/src/lightspeed_rag_content/document_processor.py
@@ -25,6 +25,7 @@
 import faiss
 from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
 from llama_index.core.llms.utils import resolve_llm
+from llama_index.core.node_parser import MarkdownNodeParser
 from llama_index.core.readers.base import BaseReader
 from llama_index.core.schema import Document, TextNode
 from llama_index.core.storage.storage_context import StorageContext
@@ -72,6 +73,8 @@ def __init__(self, config: _Config):
                 model_name=str(self.config.embeddings_model_dir)
             )
             Settings.llm = resolve_llm(None)
+        if config.doc_type == "markdown":
+            Settings.node_parser = MarkdownNodeParser()
 
     @staticmethod
     def _got_whitespace(text: str) -> bool:
@@ -391,6 +394,7 @@ def __init__(
         vector_store_type: str = "faiss",
         table_name: Optional[str] = None,
         manual_chunking: bool = True,
+        doc_type: str = "text",
     ):
         """Initialize instance."""
         if vector_store_type == "postgres" and not table_name:
@@ -406,6 +410,7 @@ def __init__(
             vector_store_type=vector_store_type,
             table_name=table_name,
             manual_chunking=manual_chunking,
+            doc_type=doc_type,
         )
 
         self._check_config(self.config)
diff --git a/src/lightspeed_rag_content/utils.py b/src/lightspeed_rag_content/utils.py
@@ -68,4 +68,11 @@ def get_common_arg_parser() -> argparse.ArgumentParser:
         help="How to do the chunking for llama-stack, manually like in "
         "llama-index or automatically using the RAG runtime tool.",
     )
+    parser.add_argument(
+        "-dt",
+        "--document-type",
+        dest="doc_type",
+        default="text",
+        help="The type of the document which is to be added to the RAG",
+    )
     return parser
diff --git a/tests/test_document_processor.py b/tests/test_document_processor.py
@@ -52,6 +52,7 @@ def setUp(self):
             "model_name": "sentence-transformers/all-mpnet-base-v2",
             "embeddings_model_dir": "embeddings_model",
             "num_workers": 10,
+            "doc_type": "text",
         }
         self.log = self.patch_object(document_processor, "LOG")
         self.indexdb = self.patch_object(document_processor, "_LlamaIndexDB")
diff --git a/tests/test_document_processor_llama_stack.py b/tests/test_document_processor_llama_stack.py
@@ -54,6 +54,7 @@ def setUp(self):
             vector_store_type="llamastack-faiss",
             embedding_dimension=None,
             manual_chunking=True,
+            doc_type="text",
         )
 
     @mock.patch.object(document_processor.tempfile, "TemporaryDirectory")

Original file line number	Diff line number	Diff line change
`@@ -52,6 +52,7 @@ def setUp(self):`
`52`	`52`	`"model_name": "sentence-transformers/all-mpnet-base-v2",`
`53`	`53`	`"embeddings_model_dir": "embeddings_model",`
`54`	`54`	`"num_workers": 10,`
	`55`	`+ "doc_type": "text",`
`55`	`56`	`}`
`56`	`57`	`self.log = self.patch_object(document_processor, "LOG")`
`57`	`58`	`self.indexdb = self.patch_object(document_processor, "_LlamaIndexDB")`
Original file line number	Diff line number	Diff line change
`@@ -54,6 +54,7 @@ def setUp(self):`
`54`	`54`	`vector_store_type="llamastack-faiss",`
`55`	`55`	`embedding_dimension=None,`
`56`	`56`	`manual_chunking=True,`
	`57`	`+ doc_type="text",`
`57`	`58`	`)`
`58`	`59`
`59`	`60`	`@mock.patch.object(document_processor.tempfile, "TemporaryDirectory")`