Skip to content

Commit 1ffe30c

Browse files
committed
Add the ability to choose between a text (sentence) processor and a Markdown processor when adding a file to the RAG database. This commit was created as part of https://issues.redhat.com/browse/MGMT-21899.
1 parent 218898f commit 1ffe30c

File tree

4 files changed

+14
-0
lines changed

4 files changed

+14
-0
lines changed

src/lightspeed_rag_content/document_processor.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import faiss
2626
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
2727
from llama_index.core.llms.utils import resolve_llm
28+
from llama_index.core.node_parser import MarkdownNodeParser
2829
from llama_index.core.readers.base import BaseReader
2930
from llama_index.core.schema import Document, TextNode
3031
from llama_index.core.storage.storage_context import StorageContext
@@ -72,6 +73,8 @@ def __init__(self, config: _Config):
7273
model_name=str(self.config.embeddings_model_dir)
7374
)
7475
Settings.llm = resolve_llm(None)
76+
if config.doc_type == "markdown":
77+
Settings.node_parser = MarkdownNodeParser()
7578

7679
@staticmethod
7780
def _got_whitespace(text: str) -> bool:
@@ -391,6 +394,7 @@ def __init__(
391394
vector_store_type: str = "faiss",
392395
table_name: Optional[str] = None,
393396
manual_chunking: bool = True,
397+
doc_type: str = "text",
394398
):
395399
"""Initialize instance."""
396400
if vector_store_type == "postgres" and not table_name:
@@ -406,6 +410,7 @@ def __init__(
406410
vector_store_type=vector_store_type,
407411
table_name=table_name,
408412
manual_chunking=manual_chunking,
413+
doc_type=doc_type,
409414
)
410415

411416
self._check_config(self.config)

src/lightspeed_rag_content/utils.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,4 +68,11 @@ def get_common_arg_parser() -> argparse.ArgumentParser:
6868
help="How to do the chunking for llama-stack, manually like in "
6969
"llama-index or automatically using the RAG runtime tool.",
7070
)
71+
parser.add_argument(
72+
"-dt",
73+
"--document-type",
74+
dest="doc_type",
75+
default="text",
76+
help="The type of the document which is to be added to the RAG",
77+
)
7178
return parser

tests/test_document_processor.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ def setUp(self):
5252
"model_name": "sentence-transformers/all-mpnet-base-v2",
5353
"embeddings_model_dir": "embeddings_model",
5454
"num_workers": 10,
55+
"doc_type": "text",
5556
}
5657
self.log = self.patch_object(document_processor, "LOG")
5758
self.indexdb = self.patch_object(document_processor, "_LlamaIndexDB")

tests/test_document_processor_llama_stack.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ def setUp(self):
5454
vector_store_type="llamastack-faiss",
5555
embedding_dimension=None,
5656
manual_chunking=True,
57+
doc_type="text",
5758
)
5859

5960
@mock.patch.object(document_processor.tempfile, "TemporaryDirectory")

0 commit comments

Comments
 (0)