From bc1b5e30552071be1b99174fe5a82afae3e72e5b Mon Sep 17 00:00:00 2001
From: Saravana
Date: Wed, 25 Sep 2024 01:57:39 +0530
Subject: [PATCH 1/2] Create csvparser.py

I am adding the CSV parser Python code. It works with basic CSV files.
---
 app/backend/prepdocslib/csvparser.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 app/backend/prepdocslib/csvparser.py

diff --git a/app/backend/prepdocslib/csvparser.py b/app/backend/prepdocslib/csvparser.py
new file mode 100644
index 0000000000..05f0c92d56
--- /dev/null
+++ b/app/backend/prepdocslib/csvparser.py
@@ -0,0 +1,32 @@
+import csv
+import io
+from typing import IO, AsyncGenerator
+
+from .page import Page
+from .parser import Parser
+
+
+class CsvParser(Parser):
+    """
+    Concrete parser that can parse CSV into Page objects. Each row becomes a Page object.
+    """
+
+    async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
+        """
+        Yield one Page per CSV record read from `content`.
+
+        :param content: stream opened in binary or text mode; bytes are decoded as UTF-8
+        :return: async generator of Page(row index, running offset, comma-joined row text)
+        """
+        raw = content.read()
+        # Streams opened in text mode already return str; only bytes need decoding.
+        # "utf-8-sig" drops a leading byte order mark so it cannot leak into the first field.
+        text_content = raw.decode("utf-8-sig") if isinstance(raw, (bytes, bytearray)) else raw
+        # newline="" lets the csv module handle line endings itself, so quoted fields
+        # containing line breaks stay in one record (str.splitlines() would cut them apart).
+        reader = csv.reader(io.StringIO(text_content, newline=""))
+        offset = 0
+        for i, row in enumerate(reader):
+            page_text = ",".join(row)  # Combine CSV row elements back to a string
+            yield Page(i, offset, page_text)
+            offset += len(page_text) + 1  # Add 1 for the separator that followed the row

From 15307b8532ff018f378697224545e13a18fd942d Mon Sep 17 00:00:00 2001
From: Saravana
Date: Wed, 25 Sep 2024 01:59:40 +0530
Subject: [PATCH 2/2] Update prepdocs.py

Import the CsvParser class and register it as the file processor for .csv files.
---
 app/backend/prepdocs.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py
index deea428139..f649d5c8b5 100644
--- a/app/backend/prepdocs.py
+++ b/app/backend/prepdocs.py
@@ -20,6 +20,7 @@
     IntegratedVectorizerStrategy,
 )
 from prepdocslib.jsonparser import JsonParser
+from prepdocslib.csvparser import CsvParser
 from prepdocslib.listfilestrategy import (
     ADLSGen2ListFileStrategy,
     ListFileStrategy,
@@ -183,6 +184,7 @@ def setup_file_processors(
         ".docx": FileProcessor(doc_int_parser, sentence_text_splitter),
         ".pptx": FileProcessor(doc_int_parser, sentence_text_splitter),
         ".xlsx": FileProcessor(doc_int_parser, sentence_text_splitter),
+        ".csv": FileProcessor(CsvParser(), sentence_text_splitter),
         ".png": FileProcessor(doc_int_parser, sentence_text_splitter),
         ".jpg": FileProcessor(doc_int_parser, sentence_text_splitter),
         ".jpeg": FileProcessor(doc_int_parser, sentence_text_splitter),