| 1 | +import os |
| 2 | +import re |
| 3 | +import pandas as pd |
| 4 | +import duckdb |
1 | 5 | from dotenv import load_dotenv |
2 | 6 | from pathlib import Path |
3 | 7 |
4 | 8 | from langchain_core.vectorstores import InMemoryVectorStore |
5 | 9 | from langchain_google_genai import GoogleGenerativeAIEmbeddings |
6 | 10 | from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader |
7 | 11 | from langchain_text_splitters import RecursiveCharacterTextSplitter |
| 12 | +from langchain_core.documents import Document |
8 | 13 |
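|  | +# NOTE: a sketch of assumed dependencies (not pinned by this module): |
|  | +# langchain-core, langchain-community, langchain-google-genai, |
|  | +# langchain-text-splitters, unstructured, duckdb, pandas, python-dotenv. |
|  | +# The Google embeddings client reads its API key from the environment |
|  | +# (typically GOOGLE_API_KEY), loaded from .env via load_dotenv() below. |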
9 | 14 | load_dotenv() |
10 | 15 |
11 | 16 | BASE = Path(__file__).parent.parent.parent |
12 | 17 | DATA = BASE / "data" |
13 | 18 |
14 | 19 |
15 | | -def index_text_docs( |
16 | | - data_pth: Path = DATA, |
17 | | -): |
18 | | - embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001") |
19 | | - vector_store = InMemoryVectorStore(embeddings) |
| 20 | +def load_and_split_text_docs(data_dir: Path) -> list[Document]: |
| 21 | + text_chunks = [] |
| 22 | + globs = [ |
| 23 | + "**/*.pdf", |
| 24 | + "**/*.docx", |
| 25 | + "**/*.pptx", |
| 26 | + "**/*.md", |
| 27 | + "**/*.html", |
| 28 | + "**/*.txt", |
| 29 | + ] |
| 30 | +    # guardrail if no files matched |
| 31 | + if not any(next(data_dir.rglob(p), None) for p in globs): |
| 32 | + print(f"No text files found under {data_dir}; skipping.") |
| 33 | + return text_chunks |
20 | 34 |
21 | | - # Load the text documents |
| 35 | + print(f"Detected text files under {data_dir}") |
22 | 36 | loader = DirectoryLoader( |
23 | | - str(data_pth), |
24 | | - glob=[ |
25 | | - "**/*.pdf", |
26 | | - "**/*.docx", |
27 | | - "**/*.pptx", |
28 | | - "**/*.md", |
29 | | - "**/*.html", |
30 | | - "**/*.txt", |
31 | | - "**/*.png", |
32 | | - "**/*.jpg", |
33 | | - "**/*.jpeg", |
34 | | - "**/*.tiff", |
35 | | - ], |
| 37 | + str(data_dir), |
| 38 | + glob=globs, |
36 | 39 | loader_cls=UnstructuredFileLoader, |
37 | 40 | ) |
38 | | - print(f"Loading files from {data_pth}") |
| 41 | + print(f"Loading files from {data_dir}") |
39 | 42 | docs = loader.load() |
40 | | - print(f"Loaded {len(docs)} files") |
41 | | - |
42 | | - # Split the texts |
| 43 | + print(f"Loaded {len(docs)} text files") |
| 44 | +    # split into overlapping chunks |
43 | 45 | text_splitter = RecursiveCharacterTextSplitter( |
44 | 46 | chunk_size=1000, |
45 | 47 | chunk_overlap=200, |
46 | 48 | add_start_index=True, |
47 | 49 | separators=["\n\n", "\n", " ", ""], |
48 | 50 | ) |
49 | | - all_splits = text_splitter.split_documents(docs) |
50 | | - print(len(all_splits)) |
| 51 | + text_chunks = text_splitter.split_documents(docs) |
| 52 | + print(f"Split text chunks: {len(text_chunks)}") |
| 53 | +    # tag chunks so retrieval can distinguish text sources |
| 54 | + for chunk in text_chunks: |
| 55 | + chunk.metadata["source_type"] = "text" |
| 56 | + |
| 57 | + return text_chunks |
| 58 | + |
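|  | +# Usage sketch (illustrative; assumes DATA holds at least one supported file): |
|  | +#   chunks = load_and_split_text_docs(DATA) |
|  | +#   # -> list of ~1000-char Documents, each tagged source_type == "text" |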
| 59 | + |
| 60 | +def load_image_docs_as_text(data_dir: Path) -> list[Document]: |
| 61 | + image_text_docs = [] |
| 62 | + globs = [ |
| 63 | + "**/*.png", |
| 64 | + "**/*.jpg", |
| 65 | + "**/*.jpeg", |
| 66 | + "**/*.tiff", |
| 67 | + ] |
| 68 | +    # guardrail if no files matched |
| 69 | + if not any(next(data_dir.rglob(p), None) for p in globs): |
| 70 | + print(f"No images found under {data_dir}; skipping.") |
| 71 | + return image_text_docs |
| 72 | + |
| 73 | + print(f"Detected images under {data_dir}") |
| 74 | + loader = DirectoryLoader( |
| 75 | + str(data_dir), |
| 76 | + glob=globs, |
| 77 | + loader_cls=UnstructuredFileLoader, |
| 78 | + ) |
| 79 | + print(f"Loading images from {data_dir}") |
| 80 | + image_text_docs = loader.load() |
| 81 | + print(f"Loaded {len(image_text_docs)} image files") |
| 82 | +    # tag docs so retrieval can distinguish OCR-derived text |
| 83 | + for img in image_text_docs: |
| 84 | + img.metadata["source_type"] = "image_text" |
| 85 | + |
| 86 | + return image_text_docs |
| 87 | + |
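|  | +# NOTE: for images, UnstructuredFileLoader extracts text via OCR, which |
|  | +# typically requires a local Tesseract install; that is an environment |
|  | +# assumption this module does not verify. Usage sketch: |
|  | +#   image_docs = load_image_docs_as_text(DATA) |
|  | +#   # -> one Document per image, tagged source_type == "image_text" |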
| 88 | + |
| 89 | +def _tbl(name: str) -> str: |
| 90 | +    """Sanitize a string into a safe SQL (DuckDB) table name.""" |
| 91 | + name = re.sub(r"[^0-9a-zA-Z_]+", "_", name).strip("_") |
| 92 | + if not name or name[0].isdigit(): |
| 93 | + name = f"t_{name}" |
| 94 | + return name.lower() |
| 95 | + |
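|  | +# Illustrative behaviour of _tbl, derived from the regex above: |
|  | +#   _tbl("Sales Report.v2")  -> "sales_report_v2" |
|  | +#   _tbl("2024 results")     -> "t_2024_results" |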
| 96 | + |
| 97 | +def build_duckdb_and_summary_cards( |
| 98 | + data_dir: Path, |
| 99 | + db_path: Path, |
| 100 | +) -> list[Document]: |
| 101 | + summary_cards = [] |
| 102 | + # skip if there are no .csv/.xlsx/.xls files |
| 103 | + patterns = ("*.csv", "*.xlsx", "*.xls") |
| 104 | + if not any(next(data_dir.rglob(p), None) for p in patterns): |
| 105 | + print(f"No CSV or Excel files found under {data_dir}; skipping.") |
| 106 | + return summary_cards |
| 107 | + print(f"Detected CSV or Excel files under {data_dir}") |
| 108 | + # ensure the DB folder exists |
| 109 | + os.makedirs(db_path.parent, exist_ok=True) |
| 110 | +    # delete any existing DB file so each run rebuilds from scratch |
| 111 | +    if db_path.exists(): |
| 112 | +        db_path.unlink() |
| 113 | +    # connect, creating a fresh, empty DB |
| 114 | + with duckdb.connect(str(db_path)) as con: |
| 115 | + # ingest .csv files into DuckDB (overwrite on rerun) |
| 116 | + for fp in data_dir.rglob("*.csv"): |
| 117 | + table = _tbl(fp.stem) |
| 118 | + fp_sql = fp.as_posix().replace("'", "''") # escape single quotes |
| 119 | + con.execute( |
| 120 | + f""" |
| 121 | + CREATE OR REPLACE TABLE {table} AS |
| 122 | + SELECT * FROM read_csv_auto('{fp_sql}', header=true) |
| 123 | + """ |
| 124 | + ) |
| 125 | + |
| 126 | + # XLSX ingestion via pandas |
| 127 | + for fp in data_dir.rglob("*.xlsx"): |
| 128 | + try: |
| 129 | + xls = pd.ExcelFile(fp) # lists sheet names |
| 130 | + except Exception as e: |
| 131 | + print(f"Skip {fp.name}: {e}") |
| 132 | + continue |
| 133 | + |
| 134 | + # One table per sheet |
| 135 | + for sheet in xls.sheet_names: |
| 136 | + try: |
| 137 | + df = pd.read_excel(fp, sheet_name=sheet) |
| 138 | + except Exception as e: |
| 139 | + print(f"Skip {fp.name}:{sheet}: {e}") |
| 140 | + continue |
| 141 | + |
| 142 | + tmp_name = f"_tmp_{_tbl(fp.stem)}_{_tbl(sheet)}" |
| 143 | + con.register(tmp_name, df) |
51 | 144 |
52 | | - # index the docs |
53 | | - ids = vector_store.add_documents(documents=all_splits) |
54 | | - print(len(ids)) |
| 145 | + table = _tbl(f"{fp.stem}__{sheet}") |
| 146 | + con.execute( |
| 147 | + f""" |
| 148 | + CREATE OR REPLACE TABLE {table} AS |
| 149 | + SELECT * FROM {tmp_name}""" |
| 150 | + ) |
| 151 | + con.unregister(tmp_name) |
55 | 152 |
| 153 | + for fp in data_dir.rglob("*.xls"): |
| 154 | +        # legacy .xls isn't ingested (pandas would need the optional xlrd engine) |
| 155 | +        print(f"Skip {fp.name}: legacy .xls not handled; convert to .xlsx.") |
| 156 | + |
| 157 | + # build summary cards from DuckDB |
| 158 | + tables = [r[0] for r in con.execute("SHOW TABLES").fetchall()] |
| 159 | + for tbl in tables: |
| 160 | +            # DESCRIBE to get column names & types |
| 161 | + schema_rows = con.execute(f"DESCRIBE {tbl}").fetchall() |
| 162 | + col_names = [r[0] for r in schema_rows] |
| 163 | + col_types = [r[1] for r in schema_rows] |
| 164 | + nrows = con.execute(f"SELECT COUNT(*) FROM {tbl}").fetchone()[0] |
| 165 | + preview_df = con.execute(f"SELECT * FROM {tbl} LIMIT 5").df() |
| 166 | + |
| 167 | + col_str = ", ".join(f"{n}:{t}" for n, t in zip(col_names, col_types)) |
| 168 | + preview_txt = preview_df.to_string(index=False) |
| 169 | + |
| 170 | + text = ( |
| 171 | + f"TABLE CARD — {tbl}\n" |
| 172 | +                f"Columns (count={len(col_names)}; format 'column_name:data_type'): {col_str}\n" |
| 173 | + f"Rows: {nrows}\n\n" |
| 174 | + f"Sample rows (up to 5):\n{preview_txt}\n" |
| 175 | + ) |
| 176 | + |
| 177 | + summary_cards.append( |
| 178 | + Document( |
| 179 | + page_content=text, |
| 180 | + metadata={ |
| 181 | + "source_type": "table_summary", |
| 182 | + "table": tbl, |
| 183 | + "db_path": str(db_path), |
| 184 | + }, |
| 185 | + ) |
| 186 | + ) |
| 187 | + |
| 188 | + return summary_cards |
| 189 | + |
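|  | +# Sketch of downstream use: a retrieved card's metadata carries the real |
|  | +# `table` and `db_path`, so a caller could run SQL against it ("sales" is |
|  | +# a hypothetical table name): |
|  | +#   with duckdb.connect(str(db_path)) as con: |
|  | +#       preview = con.execute("SELECT * FROM sales LIMIT 10").df() |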
| 190 | + |
| 191 | +def embed_and_index_all_docs( |
| 192 | + data_dir: Path = DATA, db_path: Path = DATA / "csv_excel_to_db" / "my_data.duckdb" |
| 193 | +): |
| 194 | +    # load embeddings model and in-memory vector store |
| 195 | + embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001") |
| 196 | + vector_store = InMemoryVectorStore(embeddings) |
| 197 | + |
| 198 | + # LOAD AND SPLIT TEXT DOCS |
| 199 | + text_chunks = load_and_split_text_docs(data_dir) |
| 200 | + # LOAD IMAGES (OCR converts image -> text) |
| 201 | + image_text_docs = load_image_docs_as_text(data_dir) |
| 202 | + # LOAD AND SPLIT CSV/EXCEL DOCS |
| 203 | + summary_cards = build_duckdb_and_summary_cards(data_dir, db_path) |
| 204 | + |
| 205 | + vector_store.add_documents(text_chunks + image_text_docs + summary_cards) |
56 | 206 | return embeddings, vector_store |
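|  | + |
|  | + |
|  | +# End-to-end usage sketch (illustrative; assumes the API key and data dir |
|  | +# above are in place): |
|  | +#   embeddings, vector_store = embed_and_index_all_docs() |
|  | +#   hits = vector_store.similarity_search("quarterly revenue", k=4) |
|  | +#   for doc in hits: |
|  | +#       print(doc.metadata.get("source_type"), doc.page_content[:80]) |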