Skip to content

Commit d155f72

Browse files
authored
update requirements, redis and postgres vector stores, and fix git source metadata (#10)
1 parent e7e860e commit d155f72

File tree

10 files changed

+576
-136
lines changed

10 files changed

+576
-136
lines changed

.env

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ EMBEDDING_LENGTH=768
2020
# === Redis ===
2121
REDIS_URL=redis://localhost:6379
2222
REDIS_INDEX=docs
23-
REDIS_SCHEMA=redis_schema.yaml
2423

2524
# === Elasticsearch ===
2625
ELASTIC_URL=http://localhost:9200
@@ -29,7 +28,7 @@ ELASTIC_USER=elastic
2928
ELASTIC_PASSWORD=changeme
3029

3130
# === PGVector ===
32-
PGVECTOR_URL=postgresql://user:pass@localhost:5432/mydb
31+
PGVECTOR_URL=postgresql+psycopg://user:pass@localhost:5432/mydb
3332
PGVECTOR_COLLECTION_NAME=documents
3433

3534
# === SQL Server ===

Containerfile

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ COPY vector_db ./vector_db
1818
COPY loaders ./loaders
1919
COPY embed_documents.py .
2020
COPY config.py .
21-
COPY redis_schema.yaml .
2221
COPY .env .
2322

2423
RUN chown -R 1001:0 .

README.md

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,24 @@
88

99
It supports Git repositories, web URLs, and file types like Markdown, PDFs, and HTML. Designed for local runs, containers, or OpenShift/Kubernetes jobs.
1010

11+
- [📚 vector-embedder](#-vector-embedder)
12+
- [⚙️ Features](#️-features)
13+
- [🚀 Quick Start](#-quick-start)
14+
- [1. Configuration](#1-configuration)
15+
- [2. Run Locally](#2-run-locally)
16+
- [3. Or Run in a Container](#3-or-run-in-a-container)
17+
- [🧪 Dry Run Mode](#-dry-run-mode)
18+
- [📦 Dependency Management \& Updates](#-dependency-management--updates)
19+
- [🔧 Installing `pip-tools`](#-installing-pip-tools)
20+
- [➕ Adding / Updating a Package](#-adding--updating-a-package)
21+
- [🗂️ Project Layout](#️-project-layout)
22+
- [🧪 Local DB Testing](#-local-db-testing)
23+
- [PGVector (PostgreSQL)](#pgvector-postgresql)
24+
- [Elasticsearch](#elasticsearch)
25+
- [Redis (RediSearch)](#redis-redisearch)
26+
- [Qdrant](#qdrant)
27+
- [🙌 Acknowledgments](#-acknowledgments)
28+
1129
---
1230

1331
## ⚙️ Features
@@ -101,6 +119,43 @@ Run it:
101119

102120
---
103121

122+
## 📦 Dependency Management & Updates
123+
124+
This project keeps *two* dependency files under version control:
125+
126+
| File | Purpose | Edited by |
127+
|------|---------|-----------|
128+
| **`requirements.in`** | Short, human-readable list of *top-level* libraries (no pins) | You |
129+
| **`requirements.txt`** | Fully-resolved, **pinned** lock file—including hashes—for exact, reproducible builds | `pip-compile` |
130+
131+
### 🔧 Installing `pip-tools`
132+
133+
```bash
134+
python -m pip install --upgrade pip-tools
135+
```
136+
137+
### ➕ Adding / Updating a Package
138+
139+
1. **Edit `requirements.in`**
140+
141+
```diff
142+
- sentence-transformers
143+
+ sentence-transformers>=4.1
144+
+ llama-index
145+
```
146+
2. **Re-lock** the environment
147+
148+
```bash
149+
pip-compile --upgrade --generate-hashes
150+
```
151+
3. **Synchronise** your virtual-env
152+
153+
```bash
154+
pip-sync
155+
```
156+
157+
---
158+
104159
## 🗂️ Project Layout
105160

106161
```

config.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,7 @@ def _init_db_provider(db_type: str) -> DBProvider:
114114
if db_type == "REDIS":
115115
url = get("REDIS_URL")
116116
index = os.getenv("REDIS_INDEX", "docs")
117-
schema = os.getenv("REDIS_SCHEMA", "redis_schema.yaml")
118-
return RedisProvider(embedding_model, url, index, schema)
117+
return RedisProvider(embedding_model, url, index)
119118

120119
elif db_type == "ELASTIC":
121120
url = get("ELASTIC_URL")
@@ -127,7 +126,7 @@ def _init_db_provider(db_type: str) -> DBProvider:
127126
elif db_type == "PGVECTOR":
128127
url = get("PGVECTOR_URL")
129128
collection = get("PGVECTOR_COLLECTION_NAME")
130-
return PGVectorProvider(embedding_model, url, collection)
129+
return PGVectorProvider(embedding_model, url, collection, embedding_length)
131130

132131
elif db_type == "MSSQL":
133132
connection_string = get("MSSQL_CONNECTION_STRING")

loaders/git.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,15 +81,27 @@ def load(self) -> List[Document]:
8181
pdf_files = [f for f in matched_files if f.suffix.lower() == ".pdf"]
8282
text_files = [f for f in matched_files if f.suffix.lower() != ".pdf"]
8383

84+
docs: List[Document] = []
8485
if pdf_files:
8586
logger.info("Loading %d PDF file(s) from %s", len(pdf_files), repo_url)
86-
all_chunks.extend(self.pdf_loader.load(pdf_files))
87+
docs.extend(self.pdf_loader.load(pdf_files))
8788

8889
if text_files:
8990
logger.info(
9091
"Loading %d text file(s) from %s", len(text_files), repo_url
9192
)
92-
all_chunks.extend(self.text_loader.load(text_files))
93+
docs.extend(self.text_loader.load(text_files))
94+
95+
for doc in docs:
96+
local_src = Path(doc.metadata.get("source", ""))
97+
try:
98+
rel_path = local_src.relative_to(repo_path)
99+
except ValueError:
100+
rel_path = local_src
101+
102+
doc.metadata.update({"source": f"{repo_url}@{rel_path.as_posix()}"})
103+
104+
all_chunks.extend(docs)
93105

94106
return all_chunks
95107

redis_schema.yaml

Lines changed: 0 additions & 53 deletions
This file was deleted.

requirements.in

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
beautifulsoup4
2+
hf_xet
3+
langchain
4+
langchain-community
5+
langchain-elasticsearch
6+
langchain-huggingface
7+
langchain-postgres
8+
langchain-qdrant
9+
langchain-redis
10+
langchain-sqlserver
11+
psycopg-binary
12+
pyodbc
13+
pypdf
14+
python-dotenv
15+
qdrant-client
16+
sentence-transformers
17+
unstructured[md]

0 commit comments

Comments
 (0)