Skip to content

Commit c7e7e4d

Browse files
authored
Merge pull request #1 from saravana87/dev01
Dev01
2 parents 0225f75 + 15307b8 commit c7e7e4d

File tree

2 files changed

+22
-0
lines changed

2 files changed

+22
-0
lines changed

app/backend/prepdocs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
IntegratedVectorizerStrategy,
2121
)
2222
from prepdocslib.jsonparser import JsonParser
23+
from prepdocslib.csvparser import CsvParser
2324
from prepdocslib.listfilestrategy import (
2425
ADLSGen2ListFileStrategy,
2526
ListFileStrategy,
@@ -183,6 +184,7 @@ def setup_file_processors(
183184
".docx": FileProcessor(doc_int_parser, sentence_text_splitter),
184185
".pptx": FileProcessor(doc_int_parser, sentence_text_splitter),
185186
".xlsx": FileProcessor(doc_int_parser, sentence_text_splitter),
187+
".csv": FileProcessor(CsvParser(), sentence_text_splitter),
186188
".png": FileProcessor(doc_int_parser, sentence_text_splitter),
187189
".jpg": FileProcessor(doc_int_parser, sentence_text_splitter),
188190
".jpeg": FileProcessor(doc_int_parser, sentence_text_splitter),
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import csv
2+
from typing import IO, AsyncGenerator
3+
from .page import Page
4+
from .parser import Parser
5+
6+
7+
class CsvParser(Parser):
    """
    Concrete parser that can parse CSV into Page objects. Each row becomes a Page object.
    """

    async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
        """
        Read the whole stream, parse it as CSV and yield one Page per row.

        Args:
            content: A readable stream. It may be opened in binary mode
                (``read()`` returns bytes, decoded here as UTF-8) or in text
                mode (``read()`` already returns str).

        Yields:
            Page(i, offset, page_text) where ``i`` is the zero-based row index,
            ``page_text`` is the row's fields re-joined with commas, and
            ``offset`` is the running character position of that row in the
            re-joined text (each earlier row counts as its length plus one
            separator character).
        """
        # Function-scope import so this block does not depend on a change to
        # the module-level import list.
        import io

        raw = content.read()
        # Binary streams hand back bytes; text streams hand back str, which
        # has no .decode() and must be used as-is.
        if isinstance(raw, (bytes, bytearray)):
            text_content = raw.decode("utf-8")
        else:
            text_content = raw

        # Drop a leading byte-order mark (common in spreadsheet exports) so it
        # does not end up inside the first field of the first page.
        if text_content.startswith("\ufeff"):
            text_content = text_content[1:]

        # newline="" keeps line endings untranslated, which csv.reader needs to
        # preserve newlines embedded in quoted fields; str.splitlines() would
        # strip them and also split on characters such as form feed.
        reader = csv.reader(io.StringIO(text_content, newline=""))

        offset = 0
        for i, row in enumerate(reader):
            page_text = ",".join(row)  # Combine CSV row elements back to a string
            yield Page(i, offset, page_text)
            offset += len(page_text) + 1  # Add 1 for the separator between rows

0 commit comments

Comments
 (0)