Skip to content

Commit 6fb7c98

Browse files
committed
update to 2.1.3
1 parent 5db085c commit 6fb7c98

File tree

55 files changed

+902
-54
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

55 files changed

+902
-54
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,13 @@
22

33
All notable changes to this project will be documented in this file.
44

5-
## [2.1.3] Making Verba stable again!
5+
## [2.1.3] More data types
66

77
## Added
88

99
- Added `OLLAMA_MODEL` and `OLLAMA_EMBED_MODEL` environment variables (https://github.com/weaviate/Verba/pull/372)
10+
- The `Getting Started` dialog is now hidden after it has been shown once
11+
- Added support for `csv`, `xlsx`, and `xls` files in the `DefaultReader`
1012

1113
## [2.1.2] Adding Novita!
1214

frontend/app/components/Login/GettingStarted.tsx

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"use client";
22

3-
import React, { useEffect, useRef } from "react";
3+
import React, { useEffect, useRef, useState } from "react";
44
import VerbaButton from "../Navigation/VerbaButton";
55
import { FaGithub } from "react-icons/fa";
66
import { FaYoutube } from "react-icons/fa";
@@ -18,13 +18,33 @@ const GettingStartedComponent: React.FC<GettingStartedComponentProps> = ({
1818
addStatusMessage,
1919
}) => {
2020
const dialogRef = useRef<HTMLDialogElement>(null);
21+
const [shouldShow, setShouldShow] = useState(false);
2122

2223
// On mount, decide whether the Getting Started dialog should be rendered.
useEffect(() => {
  // Check if getting_started variable exists in localStorage
  const gettingStartedSeen = localStorage.getItem("getting_started");

  // Show modal if getting_started doesn't exist or is set to false
  if (!gettingStartedSeen || gettingStartedSeen === "false") {
    setShouldShow(true);
  }
}, []);

// Open the dialog only once it is actually mounted. While `shouldShow` is
// false the component renders nothing, so `dialogRef.current` is still null
// during the mount effect above; calling showModal() there would be a no-op
// and the dialog would stay closed forever.
useEffect(() => {
  const dialog = dialogRef.current;
  if (shouldShow && dialog && !dialog.open) {
    dialog.showModal();
  }
}, [shouldShow]);
2735

36+
// If we shouldn't show the component, return null
37+
if (!shouldShow) {
38+
return null;
39+
}
40+
41+
// Click handler for the "Let's get started" button: remembers that the intro
// was seen, hides the dialog, and posts the welcome status message.
function handleGetStarted(): void {
  // Persist the flag so the dialog is not shown again on the next visit
  localStorage.setItem("getting_started", "true");
  setShouldShow(false);
  addStatusMessage("Achievement unlocked: Welcome to Verba!", "SUCCESS");
}
47+
2848
return (
2949
<dialog id={"Getting-Started-Modal"} className="modal" ref={dialogRef}>
3050
<div className="modal-box w-11/12 max-w-5xl">
@@ -97,12 +117,7 @@ const GettingStartedComponent: React.FC<GettingStartedComponentProps> = ({
97117
title="Let's get started"
98118
type="submit"
99119
selected={true}
100-
onClick={() => {
101-
addStatusMessage(
102-
"Achievement unlocked: Welcome to Verba!",
103-
"SUCCESS"
104-
);
105-
}}
120+
onClick={handleGetStarted}
106121
selected_color="bg-primary-verba"
107122
Icon={FaHeart}
108123
/>

goldenverba/components/managers.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
import asyncio
1313
import json
1414
import re
15-
from urllib.parse import urlparse
1615
from datetime import datetime
1716

1817
from sklearn.decomposition import PCA

goldenverba/components/reader/BasicReader.py

Lines changed: 176 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import base64
22
import json
33
import io
4+
import csv
45

56
from wasabi import msg
67

@@ -27,17 +28,35 @@
2728
msg.warn("python-docx not installed, DOCX functionality will be limited.")
2829
docx = None
2930

31+
try:
32+
import pandas as pd
33+
except ImportError:
34+
msg.warn("pandas not installed, Excel functionality will be limited.")
35+
pd = None
36+
37+
try:
38+
import openpyxl
39+
except ImportError:
40+
msg.warn("openpyxl not installed, Excel functionality will be limited.")
41+
openpyxl = None
42+
43+
try:
44+
import xlrd
45+
except ImportError:
46+
msg.warn("xlrd not installed, .xls file functionality will be limited.")
47+
xlrd = None
48+
3049

3150
class BasicReader(Reader):
3251
"""
33-
The BasicReader reads text, code, PDF, and DOCX files.
52+
The BasicReader reads text, code, PDF, DOCX, CSV, and Excel files.
3453
"""
3554

3655
def __init__(self):
3756
super().__init__()
3857
self.name = "Default"
39-
self.description = "Ingests text, code, PDF, and DOCX files"
40-
self.requires_library = ["pypdf", "docx", "spacy"]
58+
self.description = "Ingests text, code, PDF, DOCX, CSV, and Excel files"
59+
self.requires_library = ["pypdf", "docx", "spacy", "pandas", "openpyxl"]
4160
self.extension = [
4261
".txt",
4362
".py",
@@ -51,6 +70,7 @@ def __init__(self):
5170
".docx",
5271
".pptx",
5372
".xlsx",
73+
".xls",
5474
".csv",
5575
".ts",
5676
".tsx",
@@ -93,6 +113,12 @@ async def load(self, config: dict, fileConfig: FileConfig) -> list[Document]:
93113
file_content = await self.load_pdf_file(decoded_bytes)
94114
elif fileConfig.extension.lower() == "docx":
95115
file_content = await self.load_docx_file(decoded_bytes)
116+
elif fileConfig.extension.lower() == "csv":
117+
file_content = await self.load_csv_file(decoded_bytes)
118+
elif fileConfig.extension.lower() in ["xlsx", "xls"]:
119+
file_content = await self.load_excel_file(
120+
decoded_bytes, fileConfig.extension.lower()
121+
)
96122
elif fileConfig.extension.lower() in [
97123
ext.lstrip(".") for ext in self.extension
98124
]:
@@ -150,3 +176,150 @@ async def load_docx_file(self, decoded_bytes: bytes) -> str:
150176
docx_bytes = io.BytesIO(decoded_bytes)
151177
reader = docx.Document(docx_bytes)
152178
return "\n".join(paragraph.text for paragraph in reader.paragraphs)
179+
180+
async def load_csv_file(self, decoded_bytes: bytes) -> str:
    """Load a CSV file and convert it to a readable text format.

    The first row is treated as the header row. Every following row is
    rendered as ``Row N: header: value | ...``; rows whose column count does
    not match the header row are rendered as their raw values instead.

    Args:
        decoded_bytes: Raw bytes of the CSV file.

    Returns:
        The formatted text, or ``"Empty CSV file"`` when there are no rows.

    Raises:
        ValueError: If the content cannot be parsed as CSV. The original
            exception is attached as ``__cause__``.
    """
    try:
        # Try UTF-8 first, fallback to latin-1. "utf-8-sig" decodes plain
        # UTF-8 as usual but also drops a leading byte-order mark (written by
        # Excel and other tools) so it does not end up in the first header.
        try:
            text_content = decoded_bytes.decode("utf-8-sig")
        except UnicodeDecodeError:
            text_content = decoded_bytes.decode("latin-1")

        rows = list(csv.reader(io.StringIO(text_content)))

        if not rows:
            return "Empty CSV file"

        # Format as a readable table
        result = []
        headers = rows[0]

        # Add headers
        if headers:
            result.append("Headers: " + " | ".join(headers))
            result.append(" \n\n")

        # Add data rows
        for i, row in enumerate(rows[1:], 1):
            if len(row) == len(headers):
                cells = [f"{header}: {value}" for header, value in zip(headers, row)]
            else:
                # Column count differs from the header row: emit raw values
                cells = row
            result.append(f"Row {i}: {' | '.join(cells)}")
            result.append(" \n\n")
        return "\n".join(result)

    except Exception as e:
        # Chain the cause so the underlying parser error stays debuggable
        raise ValueError(f"Error reading CSV file: {str(e)}") from e
219+
220+
async def load_excel_file(self, decoded_bytes: bytes, extension: str) -> str:
    """Load an Excel workbook and convert it to a readable text format.

    Every sheet is rendered as a ``Sheet: <name>`` section containing a
    ``Headers: ...`` line followed by one ``Row N: header: value | ...`` line
    per data row. pandas is used when available; otherwise the method falls
    back to openpyxl, which can only read ``.xlsx`` files.

    Args:
        decoded_bytes: Raw bytes of the workbook.
        extension: Lower-case file extension without the dot
            (``"xlsx"`` or ``"xls"``).

    Returns:
        The formatted text for all sheets of the workbook.

    Raises:
        ImportError: If neither pandas nor openpyxl is installed.
        ValueError: If the workbook cannot be read for any other reason
            (including a missing engine for the given extension). The
            original exception is attached as ``__cause__``.
    """
    if pd is None and openpyxl is None:
        raise ImportError("pandas or openpyxl is required to process Excel files.")

    try:
        excel_bytes = io.BytesIO(decoded_bytes)
        result = []

        # Use pandas if available for better support
        if pd is not None:
            # Read all sheets (sheet_name=None yields a {name: DataFrame} dict)
            if extension == "xlsx":
                sheets_dict = pd.read_excel(
                    excel_bytes, sheet_name=None, engine="openpyxl"
                )
            else:  # xls
                try:
                    sheets_dict = pd.read_excel(
                        excel_bytes, sheet_name=None, engine="xlrd"
                    )
                except Exception as xlrd_error:
                    # The failed attempt may have consumed part of the
                    # stream; rewind before retrying.
                    excel_bytes.seek(0)
                    # Try auto engine detection as fallback
                    try:
                        sheets_dict = pd.read_excel(
                            excel_bytes, sheet_name=None, engine=None
                        )
                    except Exception:
                        raise ImportError(
                            "Cannot read .xls file. Please install 'xlrd' for .xls support: pip install xlrd. "
                            f"Original error: {str(xlrd_error)}"
                        ) from xlrd_error

            for sheet_name, df in sheets_dict.items():
                result.append(f"\nSheet: {sheet_name}")

                # Only a sheet without any columns is truly empty. A sheet
                # that has a header row but no data rows still reports its
                # headers, matching the openpyxl fallback below.
                if df.columns.empty:
                    result.append("(Empty sheet)")
                    continue

                result.append(" \n\n")

                # Add column headers
                headers = df.columns.tolist()
                result.append("Headers: " + " | ".join(str(h) for h in headers))
                result.append(" \n\n")

                for idx, (_, row) in enumerate(df.iterrows()):
                    row_data = []
                    for header, value in zip(headers, row):
                        # Handle NaN values
                        display_value = str(value) if pd.notna(value) else ""
                        row_data.append(f"{header}: {display_value}")
                    result.append(f"Row {idx + 1}: {' | '.join(row_data)}")
                    result.append(" \n\n")

            return "\n".join(result)

        # Fallback to openpyxl for basic reading
        if extension != "xlsx":
            raise ImportError(
                "openpyxl only supports .xlsx files. Please install pandas for .xls support."
            )

        from openpyxl import load_workbook

        # data_only=True returns cached cell values instead of formulas
        workbook = load_workbook(excel_bytes, data_only=True)

        for sheet_name in workbook.sheetnames:
            sheet = workbook[sheet_name]
            result.append(f"\nSheet: {sheet_name}")
            result.append(" \n\n")

            # Collect non-empty rows as lists of strings
            rows_data = [
                [str(cell) if cell is not None else "" for cell in row]
                for row in sheet.iter_rows(values_only=True)
                if any(cell is not None for cell in row)
            ]

            if not rows_data:
                result.append("(Empty sheet)")
                continue

            # Add headers and data
            headers = rows_data[0]
            result.append("Headers: " + " | ".join(headers))
            result.append(" \n\n")

            for i, row in enumerate(rows_data[1:], 1):
                if len(row) == len(headers):
                    cells = [f"{h}: {v}" for h, v in zip(headers, row)]
                else:
                    # Column count differs from the header row: raw values
                    cells = row
                result.append(f"Row {i}: {' | '.join(cells)}")
                result.append(" \n\n")

        return "\n".join(result)

    except Exception as e:
        # Chain the cause so the underlying reader error stays debuggable
        raise ValueError(f"Error reading Excel file: {str(e)}") from e

0 commit comments

Comments
 (0)