Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions app/backend/prepdocs.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
IntegratedVectorizerStrategy,
)
from prepdocslib.jsonparser import JsonParser
from prepdocslib.csvparser import CsvParser
from prepdocslib.listfilestrategy import (
ADLSGen2ListFileStrategy,
ListFileStrategy,
Expand Down Expand Up @@ -183,6 +184,7 @@ def setup_file_processors(
".docx": FileProcessor(doc_int_parser, sentence_text_splitter),
".pptx": FileProcessor(doc_int_parser, sentence_text_splitter),
".xlsx": FileProcessor(doc_int_parser, sentence_text_splitter),
".csv": FileProcessor(CsvParser(), sentence_text_splitter),
".png": FileProcessor(doc_int_parser, sentence_text_splitter),
".jpg": FileProcessor(doc_int_parser, sentence_text_splitter),
".jpeg": FileProcessor(doc_int_parser, sentence_text_splitter),
Expand Down
20 changes: 20 additions & 0 deletions app/backend/prepdocslib/csvparser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import csv
import io
from typing import IO, AsyncGenerator

from .page import Page
from .parser import Parser


class CsvParser(Parser):
    """
    Concrete parser that can parse CSV into Page objects. Each row becomes a Page object.
    """

    async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
        """
        Yield one Page per CSV row read from ``content``.

        Args:
            content: An open stream holding the CSV data. Both binary streams
                (``read()`` returns bytes) and text streams (``read()``
                returns str) are accepted.

        Yields:
            A Page built from the zero-based row index, the running character
            offset, and the row's fields re-joined with commas.

        Raises:
            UnicodeDecodeError: If a binary stream is not valid UTF-8.
            csv.Error: If the csv module rejects the input as malformed.
        """
        raw = content.read()
        if isinstance(raw, (bytes, bytearray)):
            # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading
            # byte-order mark so it does not end up in the first page's text.
            text_content = raw.decode("utf-8-sig")
        else:
            # Text-mode streams already hand back str; there is nothing to decode.
            text_content = raw

        # newline="" passes line endings to the csv module untouched, so quoted
        # fields that span several lines keep their embedded newlines and rows
        # are only split on real CSV record terminators.
        reader = csv.reader(io.StringIO(text_content, newline=""))

        offset = 0
        for i, row in enumerate(reader):
            # Quoting is not restored: fields are simply re-joined with commas.
            page_text = ",".join(row)
            yield Page(i, offset, page_text)
            # Count one separator character after each row's text.
            offset += len(page_text) + 1
Loading