11import base64
22import json
33import io
4+ import csv
45
56from wasabi import msg
67
2728 msg .warn ("python-docx not installed, DOCX functionality will be limited." )
2829 docx = None
2930
31+ try :
32+ import pandas as pd
33+ except ImportError :
34+ msg .warn ("pandas not installed, Excel functionality will be limited." )
35+ pd = None
36+
37+ try :
38+ import openpyxl
39+ except ImportError :
40+ msg .warn ("openpyxl not installed, Excel functionality will be limited." )
41+ openpyxl = None
42+
43+ try :
44+ import xlrd
45+ except ImportError :
46+ msg .warn ("xlrd not installed, .xls file functionality will be limited." )
47+ xlrd = None
48+
3049
3150class BasicReader (Reader ):
3251 """
33- The BasicReader reads text, code, PDF, and DOCX files.
52+ The BasicReader reads text, code, PDF, DOCX, CSV, and Excel files.
3453 """
3554
3655 def __init__ (self ):
3756 super ().__init__ ()
3857 self .name = "Default"
39- self .description = "Ingests text, code, PDF, and DOCX files"
40- self .requires_library = ["pypdf" , "docx" , "spacy" ]
58+ self .description = "Ingests text, code, PDF, DOCX, CSV, and Excel files"
59+ self .requires_library = ["pypdf" , "docx" , "spacy" , "pandas" , "openpyxl" ]
4160 self .extension = [
4261 ".txt" ,
4362 ".py" ,
@@ -51,6 +70,7 @@ def __init__(self):
5170 ".docx" ,
5271 ".pptx" ,
5372 ".xlsx" ,
73+ ".xls" ,
5474 ".csv" ,
5575 ".ts" ,
5676 ".tsx" ,
@@ -93,6 +113,12 @@ async def load(self, config: dict, fileConfig: FileConfig) -> list[Document]:
93113 file_content = await self .load_pdf_file (decoded_bytes )
94114 elif fileConfig .extension .lower () == "docx" :
95115 file_content = await self .load_docx_file (decoded_bytes )
116+ elif fileConfig .extension .lower () == "csv" :
117+ file_content = await self .load_csv_file (decoded_bytes )
118+ elif fileConfig .extension .lower () in ["xlsx" , "xls" ]:
119+ file_content = await self .load_excel_file (
120+ decoded_bytes , fileConfig .extension .lower ()
121+ )
96122 elif fileConfig .extension .lower () in [
97123 ext .lstrip ("." ) for ext in self .extension
98124 ]:
@@ -150,3 +176,150 @@ async def load_docx_file(self, decoded_bytes: bytes) -> str:
150176 docx_bytes = io .BytesIO (decoded_bytes )
151177 reader = docx .Document (docx_bytes )
152178 return "\n " .join (paragraph .text for paragraph in reader .paragraphs )
179+
async def load_csv_file(self, decoded_bytes: bytes) -> str:
    """Convert raw CSV bytes into a line-oriented text rendering.

    The first parsed row is treated as the header row. Each later row is
    rendered as ``header : value`` pairs when its column count matches the
    header row, and as its raw values joined by `` | `` otherwise.

    Args:
        decoded_bytes: Raw bytes of the CSV file.

    Returns:
        The rendered text, or ``"Empty CSV file"`` when no rows are parsed.

    Raises:
        ValueError: If any step of decoding or parsing fails; the original
            exception is attached as ``__cause__``.
    """
    try:
        # "utf-8-sig" decodes ordinary UTF-8 and additionally strips a
        # leading byte-order mark, which plain "utf-8" would leave glued to
        # the first header name as "\ufeff".
        try:
            text_content = decoded_bytes.decode("utf-8-sig")
        except UnicodeDecodeError:
            # latin-1 maps every possible byte value, so this cannot fail.
            text_content = decoded_bytes.decode("latin-1")

        rows = list(csv.reader(io.StringIO(text_content)))
        if not rows:
            return "Empty CSV file"

        headers = rows[0]
        result = []

        # NOTE(review): the spacing inside the separator and label literals
        # below is kept exactly as found in the reviewed text - confirm it
        # is intended.
        # The header row can be an empty list when the file starts with a
        # blank line; in that case no header line is emitted.
        if headers:
            result.append("Headers: " + " | ".join(headers))
            result.append(" \n \n ")

        for i, row in enumerate(rows[1:], 1):
            if len(row) == len(headers):
                cells = [
                    f"{header} : {value} " for header, value in zip(headers, row)
                ]
                result.append(f"Row {i} : {' | '.join(cells)} ")
            else:
                # Ragged row: labels cannot be matched up, emit raw values.
                result.append(f"Row {i} : {' | '.join(row)} ")
            result.append(" \n \n ")
        return "\n ".join(result)

    except Exception as e:
        raise ValueError(f"Error reading CSV file: {e} ") from e
219+
async def load_excel_file(self, decoded_bytes: bytes, extension: str) -> str:
    """Convert raw Excel workbook bytes into a line-oriented text rendering.

    pandas is used when it is available; otherwise openpyxl is used, which
    only handles ``xlsx``. Every sheet is rendered as a ``Sheet:`` line, a
    ``Headers:`` line and one ``Row N`` line per data row.

    Args:
        decoded_bytes: Raw bytes of the workbook.
        extension: Lower-case extension without the dot; ``"xlsx"`` selects
            the openpyxl engine, any other value is treated as ``xls``.

    Returns:
        The rendered text for all sheets.

    Raises:
        ImportError: If neither pandas nor openpyxl is available.
        ValueError: If anything fails while reading or rendering. This
            includes the ImportError raised below for unreadable ``.xls``
            input, which is wrapped by the outer handler; the original
            exception is attached as ``__cause__``.
    """
    if pd is None and openpyxl is None:
        raise ImportError("pandas or openpyxl is required to process Excel files.")

    try:
        excel_bytes = io.BytesIO(decoded_bytes)
        result = []

        # NOTE(review): the spacing inside the separator and label literals
        # below is kept exactly as found in the reviewed text - confirm it
        # is intended.
        if pd:
            # sheet_name=None makes read_excel return {sheet name: DataFrame}.
            if extension == "xlsx":
                sheets_dict = pd.read_excel(
                    excel_bytes, sheet_name=None, engine="openpyxl"
                )
            else:  # xls
                try:
                    sheets_dict = pd.read_excel(
                        excel_bytes, sheet_name=None, engine="xlrd"
                    )
                except Exception as e:
                    # The failed attempt may have consumed the stream, so
                    # rewind before retrying with engine auto-detection.
                    excel_bytes.seek(0)
                    try:
                        sheets_dict = pd.read_excel(
                            excel_bytes, sheet_name=None, engine=None
                        )
                    except Exception:
                        raise ImportError(
                            f"Cannot read .xls file. Please install 'xlrd' for .xls support: pip install xlrd. "
                            f"Original error: {e} "
                        ) from e

            for sheet_name, df in sheets_dict.items():
                result.append(f"\n Sheet: {sheet_name} ")

                if df.empty:
                    result.append("(Empty sheet)")
                    continue

                result.append(" \n \n ")

                headers = df.columns.tolist()
                result.append("Headers: " + " | ".join(str(h) for h in headers))
                result.append(" \n \n ")

                # itertuples keeps each column's own dtype. iterrows would
                # coerce a whole row to one dtype, so an integer cell next
                # to a float column would be rendered as "1.0".
                for row_number, values in enumerate(
                    df.itertuples(index=False, name=None), 1
                ):
                    cells = []
                    for header, value in zip(headers, values):
                        # Missing cells (NaN/NaT/None) render as empty text.
                        display_value = str(value) if pd.notna(value) else ""
                        cells.append(f"{header} : {display_value} ")
                    result.append(f"Row {row_number} : {' | '.join(cells)} ")
                    result.append(" \n \n ")

            return "\n ".join(result)

        # pandas is unavailable: fall back to openpyxl for basic reading.
        if extension != "xlsx":
            raise ImportError(
                "openpyxl only supports .xlsx files. Please install pandas for .xls support."
            )

        from openpyxl import load_workbook

        # data_only=True yields cached cell values instead of formula text.
        workbook = load_workbook(excel_bytes, data_only=True)

        for sheet_name in workbook.sheetnames:
            sheet = workbook[sheet_name]
            result.append(f"\n Sheet: {sheet_name} ")
            result.append(" \n \n ")

            # Keep only rows with at least one non-empty cell, as text.
            rows_data = [
                [str(cell) if cell is not None else "" for cell in row]
                for row in sheet.iter_rows(values_only=True)
                if any(cell is not None for cell in row)
            ]

            if not rows_data:
                result.append("(Empty sheet)")
                continue

            # The first non-empty row is treated as the header row.
            headers = rows_data[0]
            result.append("Headers: " + " | ".join(headers))
            result.append(" \n \n ")

            for i, row in enumerate(rows_data[1:], 1):
                if len(row) == len(headers):
                    cells = [f"{h} : {v} " for h, v in zip(headers, row)]
                    result.append(f"Row {i} : {' | '.join(cells)} ")
                else:
                    # Ragged row: emit raw values without labels.
                    result.append(f"Row {i} : {' | '.join(row)} ")
                result.append(" \n \n ")

        return "\n ".join(result)

    except Exception as e:
        raise ValueError(f"Error reading Excel file: {e} ") from e
0 commit comments