-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathingest_pg.py
More file actions
63 lines (54 loc) · 1.82 KB
/
ingest_pg.py
File metadata and controls
63 lines (54 loc) · 1.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# ingest_pg.py
# Ingests plain-text knowledge-base files (kb/*.txt) into a Postgres table
# with pgvector embeddings produced by the OpenAI embeddings API.
import os, glob
from dotenv import load_dotenv
from openai import OpenAI
import psycopg2
# Load OPENAI_API_KEY / DB_URL from a local .env file before reading them.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Falls back to a local default Postgres instance when DB_URL is unset.
DB_URL = os.getenv("DB_URL", "postgresql://postgres:postgres@localhost:5432/hri_rag")
EMBED_MODEL = "text-embedding-3-small"
# 1536 is the output dimensionality of text-embedding-3-small; it must match
# the VECTOR(...) column width below.
EMBED_DIM = 1536
# Idempotent schema: enables pgvector, creates the documents table, and an
# IVFFlat cosine-distance index for approximate nearest-neighbor search.
SCHEMA_SQL = f"""
CREATE EXTENSION IF NOT EXISTS vector;
CREATE TABLE IF NOT EXISTS documents (
id SERIAL PRIMARY KEY,
chunk TEXT NOT NULL,
source TEXT,
chunk_idx INT,
metadata JSONB,
embedding VECTOR({EMBED_DIM})
);
CREATE INDEX IF NOT EXISTS idx_documents_embedding
ON documents USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
"""
def embed(text: str) -> list[float]:
    """Return the embedding vector for *text* using the configured model."""
    response = client.embeddings.create(input=[text], model=EMBED_MODEL)
    return response.data[0].embedding
def iter_chunks(kb_dir: str = "kb"):
    """Yield (filename, chunk_index, chunk_text) for every paragraph chunk.

    Scans ``kb_dir`` for ``*.txt`` files in sorted order, splits each file on
    blank lines (double newline), and yields one tuple per non-empty chunk.

    Args:
        kb_dir: Directory containing the knowledge-base text files.
                Defaults to ``"kb"`` (the original hard-coded location).

    Raises:
        SystemExit: If no ``*.txt`` files are found in ``kb_dir``.
    """
    files = sorted(glob.glob(os.path.join(kb_dir, "*.txt")))
    if not files:
        # Message text matches the original for the default directory.
        raise SystemExit(f"No hay archivos en {kb_dir}/*.txt")
    for fp in files:
        with open(fp, "r", encoding="utf-8") as f:
            raw = f.read().strip()
        # Paragraph-level chunking: split on blank lines, drop empty parts.
        parts = [p.strip() for p in raw.split("\n\n") if p.strip()]
        for i, p in enumerate(parts):
            yield os.path.basename(fp), i, p
def main():
    """Recreate the schema, wipe the documents table, and ingest all chunks.

    For every chunk yielded by ``iter_chunks`` this embeds the text via the
    OpenAI API and inserts one row. Autocommit is on, so each statement is
    committed immediately.

    Fix over the original: the cursor and connection are now closed even when
    an embedding call or INSERT raises, instead of leaking on error.
    """
    conn = psycopg2.connect(DB_URL)
    conn.autocommit = True
    rows = 0
    try:
        # psycopg2 cursors are context managers: the cursor is closed on exit.
        with conn.cursor() as cur:
            cur.execute(SCHEMA_SQL)
            # Full re-ingest: drop all rows and reset the id sequence.
            cur.execute("TRUNCATE documents RESTART IDENTITY")
            for src, idx, chunk in iter_chunks():
                vec = embed(chunk)
                cur.execute(
                    """
INSERT INTO documents(chunk, source, chunk_idx, metadata, embedding)
VALUES (%s, %s, %s, %s, %s::vector)
""",
                    # pgvector accepts a '[x, y, ...]' text literal cast to ::vector.
                    (chunk, src, idx, None, f"[{', '.join(map(str, vec))}]"),
                )
                rows += 1
    finally:
        # Always release the connection, even if ingestion fails partway.
        conn.close()
    print(f"Ingesta completa: {rows} chunks.")
if __name__ == "__main__":
    main()