diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py
index 696cf6397f..420b4af39f 100644
--- a/app/backend/prepdocs.py
+++ b/app/backend/prepdocs.py
@@ -10,6 +10,7 @@
 
 from load_azd_env import load_azd_env
 from prepdocslib.blobmanager import BlobManager
+from prepdocslib.csvparser import CsvParser
 from prepdocslib.embeddings import (
     AzureOpenAIEmbeddingService,
     ImageEmbeddings,
@@ -190,6 +191,7 @@ def setup_file_processors(
         ".json": FileProcessor(JsonParser(), SimpleTextSplitter()),
         ".md": FileProcessor(TextParser(), sentence_text_splitter),
         ".txt": FileProcessor(TextParser(), sentence_text_splitter),
+        ".csv": FileProcessor(CsvParser(), sentence_text_splitter),
     }
     # These require either a Python package or Document Intelligence
     if pdf_parser is not None:
diff --git a/app/backend/prepdocslib/csvparser.py b/app/backend/prepdocslib/csvparser.py
new file mode 100644
index 0000000000..cc8cb6d90d
--- /dev/null
+++ b/app/backend/prepdocslib/csvparser.py
@@ -0,0 +1,34 @@
+import csv
+from typing import IO, AsyncGenerator
+
+from .page import Page
+from .parser import Parser
+
+
+class CsvParser(Parser):
+    """
+    Concrete parser that can parse CSV into Page objects. Each data row becomes one Page;
+    the header row is skipped. Page offsets count characters from the first data row,
+    with one character per row separator (newline).
+    """
+
+    async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
+        # Normalize the incoming content to a text string before parsing.
+        content_str: str
+        if isinstance(content, (bytes, bytearray)):
+            content_str = content.decode("utf-8")
+        elif hasattr(content, "read"):  # File-like object (binary or text mode)
+            data = content.read()
+            # Only decode when the file was opened in binary mode; text mode already yields str.
+            content_str = data.decode("utf-8") if isinstance(data, bytes) else data
+        else:
+            # Fall back to treating the input as plain text so content_str is always bound.
+            content_str = str(content)
+
+        # Create a CSV reader from the text content
+        reader = csv.reader(content_str.splitlines())
+        offset = 0
+
+        # Skip the header row
+        next(reader, None)
+
+        for i, row in enumerate(reader):
+            page_text = ",".join(row)
+            yield Page(i, offset, page_text)
+            offset += len(page_text) + 1  # Account for newline character
diff --git a/tests/test_app_config.py b/tests/test_app_config.py
index 29139d2a02..cc7f440083 100644
--- a/tests/test_app_config.py
+++ b/tests/test_app_config.py
@@ -63,7 +63,7 @@ async def test_app_user_upload_processors(monkeypatch, minimal_env):
     async with quart_app.test_app():
         ingester = quart_app.config[app.CONFIG_INGESTER]
         assert ingester is not None
-        assert len(ingester.file_processors.keys()) == 5
+        assert len(ingester.file_processors.keys()) == 6
 
 
 @pytest.mark.asyncio
@@ -77,7 +77,7 @@ async def test_app_user_upload_processors_docint(monkeypatch, minimal_env):
     async with quart_app.test_app():
         ingester = quart_app.config[app.CONFIG_INGESTER]
         assert ingester is not None
-        assert len(ingester.file_processors.keys()) == 14
+        assert len(ingester.file_processors.keys()) == 15
 
 
 @pytest.mark.asyncio
@@ -92,7 +92,7 @@ async def test_app_user_upload_processors_docint_localpdf(monkeypatch, minimal_e
     async with quart_app.test_app():
         ingester = quart_app.config[app.CONFIG_INGESTER]
         assert ingester is not None
-        assert len(ingester.file_processors.keys()) == 14
+        assert len(ingester.file_processors.keys()) == 15
         assert ingester.file_processors[".pdf"] is not ingester.file_processors[".pptx"]
 
 
@@ -108,7 +108,7 @@ async def test_app_user_upload_processors_docint_localhtml(monkeypatch, minimal_
     async with quart_app.test_app():
         ingester = quart_app.config[app.CONFIG_INGESTER]
         assert ingester is not None
-        assert len(ingester.file_processors.keys()) == 14
+        assert len(ingester.file_processors.keys()) == 15
         assert ingester.file_processors[".html"] is not ingester.file_processors[".pptx"]
 
 
diff --git a/tests/test_csvparser.py b/tests/test_csvparser.py
new file mode 100644
index 0000000000..3db9dc13f8
--- /dev/null
+++ b/tests/test_csvparser.py
@@ -0,0 +1,57 @@
+import io
+
+import pytest
+
+from prepdocslib.csvparser import CsvParser
+
+
+@pytest.mark.asyncio
+async def test_csvparser_single_row():
+    # Mock CSV content with a single row in binary format
+    file = io.BytesIO(b"col1,col2,col3\nvalue1,value2,value3")
+    file.name = "test.csv"
+    csvparser = CsvParser()
+
+    # Parse the file
+    pages = [page async for page in csvparser.parse(file)]
+
+    # Assertions
+    assert len(pages) == 1
+    assert pages[0].page_num == 0
+    assert pages[0].offset == 0
+    assert pages[0].text == "value1,value2,value3"
+
+
+@pytest.mark.asyncio
+async def test_csvparser_multiple_rows():
+    # Mock CSV content with multiple rows in binary format
+    file = io.BytesIO(b"col1,col2,col3\nvalue1,value2,value3\nvalue4,value5,value6")
+    file.name = "test.csv"
+    csvparser = CsvParser()
+
+    # Parse the file
+    pages = [page async for page in csvparser.parse(file)]
+
+    # Assertions
+    assert len(pages) == 2  # Expect only data rows, skipping the header
+    assert pages[0].page_num == 0
+    assert pages[0].offset == 0
+    assert pages[0].text == "value1,value2,value3"
+
+    assert pages[1].page_num == 1
+    assert pages[1].offset == len(pages[0].text) + 1  # Length of the first row plus a newline
+    assert pages[1].text == "value4,value5,value6"
+
+
+@pytest.mark.asyncio
+async def test_csvparser_empty_file():
+    # Mock empty CSV content in binary format
+    file = io.BytesIO(b"")
+    file.name = "test.csv"
+    csvparser = CsvParser()
+
+    # Parse the file
+    pages = [page async for page in csvparser.parse(file)]
+
+    # Assertions
+    assert len(pages) == 0  # No rows should be parsed from an empty file