File tree Expand file tree Collapse file tree 2 files changed +22
-0
lines changed Expand file tree Collapse file tree 2 files changed +22
-0
lines changed Original file line number Diff line number Diff line change 20
20
IntegratedVectorizerStrategy ,
21
21
)
22
22
from prepdocslib .jsonparser import JsonParser
23
+ from prepdocslib .csvparser import CsvParser
23
24
from prepdocslib .listfilestrategy import (
24
25
ADLSGen2ListFileStrategy ,
25
26
ListFileStrategy ,
@@ -183,6 +184,7 @@ def setup_file_processors(
183
184
".docx" : FileProcessor (doc_int_parser , sentence_text_splitter ),
184
185
".pptx" : FileProcessor (doc_int_parser , sentence_text_splitter ),
185
186
".xlsx" : FileProcessor (doc_int_parser , sentence_text_splitter ),
187
+ ".csv" : FileProcessor (CsvParser (), sentence_text_splitter ),
186
188
".png" : FileProcessor (doc_int_parser , sentence_text_splitter ),
187
189
".jpg" : FileProcessor (doc_int_parser , sentence_text_splitter ),
188
190
".jpeg" : FileProcessor (doc_int_parser , sentence_text_splitter ),
Original file line number Diff line number Diff line change
1
+ import csv
2
+ from typing import IO , AsyncGenerator
3
+ from .page import Page
4
+ from .parser import Parser
5
+
6
+
7
class CsvParser(Parser):
    """
    Concrete parser that can parse CSV into Page objects. Each row becomes a Page object.
    """

    async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
        """Yield one Page per CSV row read from *content*.

        Args:
            content: A readable stream. Both binary streams (whose ``read()``
                returns bytes) and text streams (whose ``read()`` returns str)
                are accepted.

        Yields:
            ``Page(i, offset, page_text)`` where ``i`` is the zero-based row
            index, ``page_text`` is the row's fields re-joined with commas, and
            ``offset`` is the running total of ``len(page_text) + 1`` over the
            preceding rows.

        Raises:
            UnicodeDecodeError: If a binary stream is not valid UTF-8.
        """
        raw = content.read()
        if isinstance(raw, (bytes, bytearray)):
            # "utf-8-sig" is plain UTF-8 that also drops a leading byte-order
            # mark, so a BOM does not end up inside the first field.
            text_content = raw.decode("utf-8-sig")
        else:
            # Text-mode streams already return str; calling .decode() on it
            # would raise AttributeError.
            text_content = raw

        # Keep the line endings: csv.reader needs them to preserve newlines
        # that appear inside quoted fields. Without them the two halves of such
        # a field are silently concatenated.
        reader = csv.reader(text_content.splitlines(keepends=True))

        offset = 0
        for i, row in enumerate(reader):
            page_text = ",".join(row)  # Combine CSV row elements back to a string
            yield Page(i, offset, page_text)
            # The offset is measured in re-joined text, not in the original
            # file: +1 stands in for the separator between consecutive rows.
            offset += len(page_text) + 1
You can’t perform that action at this time.
0 commit comments