import csv
import io
from typing import IO, AsyncGenerator

from .page import Page
from .parser import Parser


class CsvParser(Parser):
    """
    Concrete parser that can parse CSV into Page objects. Each row becomes a Page object.

    The accompanying change to prepdocs.py registers this parser for the ".csv"
    extension as ``FileProcessor(CsvParser(), sentence_text_splitter)``.
    """

    async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
        """
        Read the whole stream, parse it as CSV and yield one Page per row.

        Args:
            content: A readable stream. Its ``read()`` may return bytes
                (binary mode, decoded here as UTF-8) or str (text mode).

        Yields:
            ``Page(row_index, offset, page_text)`` where ``page_text`` is the
            row's fields joined with "," and ``offset`` is the running total
            of the previous page texts' lengths plus one separator each.

        Raises:
            UnicodeDecodeError: If binary content is not valid UTF-8.
        """
        raw = content.read()
        if isinstance(raw, (bytes, bytearray)):
            # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM, so the
            # BOM written by many spreadsheet exports does not end up in page 0.
            text = raw.decode("utf-8-sig")
        else:
            # Stream was opened in text mode: read() already returned str.
            text = raw[1:] if raw.startswith("\ufeff") else raw

        # Feed the reader from a newline-preserving stream (newline="") instead of
        # str.splitlines(): this keeps newlines embedded in quoted fields and does
        # not split rows on characters such as form feed or U+2028.
        reader = csv.reader(io.StringIO(text, newline=""))

        offset = 0
        for page_num, row in enumerate(reader):
            # NOTE: fields are re-joined with a bare comma; the original CSV quoting
            # is not reproduced, so a field that itself contains "," is not escaped.
            page_text = ",".join(row)
            yield Page(page_num, offset, page_text)
            # Offsets index into the joined page texts (one separator character
            # between consecutive rows), not into the bytes of the source file.
            offset += len(page_text) + 1