Azure-Samples · saravana87 · Sep 24, 2024 · Sep 24, 2024
diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py
@@ -20,6 +20,7 @@
     IntegratedVectorizerStrategy,
 )
 from prepdocslib.jsonparser import JsonParser
+from prepdocslib.csvparser import CsvParser
 from prepdocslib.listfilestrategy import (
     ADLSGen2ListFileStrategy,
     ListFileStrategy,
@@ -183,6 +184,7 @@ def setup_file_processors(
         ".docx": FileProcessor(doc_int_parser, sentence_text_splitter),
         ".pptx": FileProcessor(doc_int_parser, sentence_text_splitter),
         ".xlsx": FileProcessor(doc_int_parser, sentence_text_splitter),
+        ".csv": FileProcessor(CsvParser(), sentence_text_splitter),
         ".png": FileProcessor(doc_int_parser, sentence_text_splitter),
         ".jpg": FileProcessor(doc_int_parser, sentence_text_splitter),
         ".jpeg": FileProcessor(doc_int_parser, sentence_text_splitter),

diff --git a/app/backend/prepdocslib/csvparser.py b/app/backend/prepdocslib/csvparser.py
@@ -0,0 +1,44 @@
+import csv
+import io
+from typing import IO, AsyncGenerator
+
+from .page import Page
+from .parser import Parser
+
+
+class CsvParser(Parser):
+    """
+    Concrete parser that turns CSV content into Page objects, one Page per row.
+    """
+
+    async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
+        """
+        Yield one Page per CSV row read from ``content``.
+
+        Args:
+            content: A readable stream opened in either binary or text mode.
+                Bytes are decoded as UTF-8 and a leading byte-order mark is dropped.
+
+        Yields:
+            Page objects built from the zero-based row index, a running offset,
+            and the row's fields joined with commas. The offset grows by the
+            length of each emitted text plus one.
+
+        Raises:
+            UnicodeDecodeError: If binary content is not valid UTF-8.
+        """
+        raw = content.read()
+        # Only bytes need decoding; a text-mode stream already returns str.
+        text = raw.decode("utf-8-sig") if isinstance(raw, (bytes, bytearray)) else raw
+        # Feed the reader a stream created with newline="" (not str.splitlines())
+        # so that quoted fields containing line breaks stay within a single row.
+        reader = csv.reader(io.StringIO(text, newline=""))
+        offset = 0
+        for row_index, row in enumerate(reader):
+            # Quoting is not restored: a field that itself contains a comma is
+            # emitted verbatim between the separators.
+            page_text = ",".join(row)
+            yield Page(row_index, offset, page_text)
+            # The extra 1 stands for the row's line terminator, which is not part
+            # of page_text. NOTE(review): confirm consumers of the offset expect this.
+            offset += len(page_text) + 1