Skip to content

Commit d38cac4

Browse files
author
dudcom
committed
adding utils
1 parent fe96ec4 commit d38cac4

File tree

2 files changed

+349
-0
lines changed

2 files changed

+349
-0
lines changed
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#!/usr/bin/env python3
2+
3+
import sqlite3
4+
from pathlib import Path
5+
from typing import Iterable, Tuple
6+
7+
8+
def create_bm25_index(
    documents: Iterable[dict],
    db_path: Path,
    text_key: str = "bm25_text",
    id_key: str = "doc_id",
    batch_size: int = 500,
) -> None:
    """Build an SQLite FTS5 (BM25) keyword index over *documents*.

    Any existing database at *db_path* is deleted and rebuilt from scratch.
    Documents missing an id or whose text is blank are skipped. If the
    running sqlite3 build lacks FTS5 support, a message is printed and no
    index is created.

    Args:
        documents: Iterable of dicts carrying the id and text fields.
        db_path: Destination SQLite file; parent directories are created.
        text_key: Dict key holding the searchable text.
        id_key: Dict key holding the document id.
        batch_size: Number of rows buffered per executemany() insert.
    """
    db_path = Path(db_path)
    db_path.parent.mkdir(parents=True, exist_ok=True)
    if db_path.exists():
        db_path.unlink()  # always rebuild from scratch

    conn = sqlite3.connect(str(db_path))
    try:
        # WAL + NORMAL trades a little durability for a faster bulk load.
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA synchronous=NORMAL")
        try:
            conn.execute("CREATE VIRTUAL TABLE docs USING fts5(doc_id UNINDEXED, content, tokenize='porter')")
        except sqlite3.OperationalError as e:
            # FTS5 is an optional sqlite3 compile-time feature.
            print(f"FTS5 unavailable, skipping BM25 index: {e}")
            return

        rows = []
        for doc in documents:
            doc_id = str(doc.get(id_key, ""))
            text = str(doc.get(text_key, ""))
            if not doc_id or not text.strip():
                continue  # nothing useful to index
            rows.append((doc_id, text))

            # Flush in batches so huge corpora don't buffer in memory.
            if len(rows) >= batch_size:
                conn.executemany("INSERT INTO docs(doc_id, content) VALUES (?, ?)", rows)
                rows = []

        if rows:
            conn.executemany("INSERT INTO docs(doc_id, content) VALUES (?, ?)", rows)
        conn.commit()
    finally:
        conn.close()
41+
42+
43+
def search_bm25(db_path: Path, query: str, top_k: int = 50) -> list[tuple[str, float]]:
    """Query the FTS5 index and return the best-matching documents.

    Args:
        db_path: SQLite file built by create_bm25_index().
        query: FTS5 MATCH expression; blank queries return no results.
        top_k: Maximum number of rows to return.

    Returns:
        (doc_id, score) pairs, best match first. SQLite's bm25() returns
        *negative* scores (lower is better), so ascending ORDER BY yields
        best-first ordering. Returns [] on a blank query, malformed MATCH
        syntax, or a missing/unsupported docs table.
    """
    if not query.strip():
        return []

    conn = sqlite3.connect(str(db_path))
    try:
        conn.row_factory = sqlite3.Row
        try:
            cur = conn.execute(
                "SELECT doc_id, bm25(docs) AS score FROM docs WHERE docs MATCH ? ORDER BY score LIMIT ?",
                (query, int(top_k)),
            )
        except sqlite3.OperationalError:
            # Covers bad MATCH syntax and an absent FTS5 table alike.
            return []
        return [(row["doc_id"], float(row["score"])) for row in cur.fetchall()]
    finally:
        conn.close()
Lines changed: 290 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,290 @@
1+
#!/usr/bin/env python3
2+
3+
import json
4+
import os
5+
import sys
6+
import hashlib
7+
from html import unescape
8+
from html.parser import HTMLParser
9+
from pathlib import Path
10+
from typing import Dict, Iterable, List, Tuple
11+
12+
try:
13+
import numpy as np
14+
from sentence_transformers import SentenceTransformer
15+
import faiss
16+
except ImportError as e:
17+
print(f"Error: {e}")
18+
print("Install with: pip3 install --user --break-system-packages numpy sentence-transformers faiss-cpu")
19+
sys.exit(1)
20+
21+
try:
22+
from utils.bm25_sqlite import create_bm25_index
23+
except ModuleNotFoundError:
24+
try:
25+
import sys as _sys
26+
_sys.path.insert(0, str(Path(__file__).resolve().parent))
27+
from bm25_sqlite import create_bm25_index
28+
except Exception as e:
29+
print(f"Error importing bm25_sqlite: {e}")
30+
create_bm25_index = None
31+
32+
33+
class _TextExtractor(HTMLParser):
34+
def __init__(self) -> None:
35+
super().__init__()
36+
self.text_parts: List[str] = []
37+
self.skip_tags = {"script", "style", "noscript", "meta", "link", "svg", "path"}
38+
self.skip = False
39+
40+
def handle_starttag(self, tag: str, attrs) -> None:
41+
self.skip = tag.lower() in self.skip_tags
42+
43+
def handle_endtag(self, tag: str) -> None:
44+
self.skip = False
45+
46+
def handle_data(self, data: str) -> None:
47+
if not self.skip and data.strip():
48+
self.text_parts.append(data.strip())
49+
50+
def get_text(self) -> str:
51+
text = " ".join(self.text_parts)
52+
text = unescape(text)
53+
return " ".join(text.split())
54+
55+
56+
def _html_to_text(html: str) -> str:
    """Return the whitespace-normalized visible text of *html*.

    Empty or falsy input yields an empty string.
    """
    if not html:
        return ""
    extractor = _TextExtractor()
    extractor.feed(html)
    return extractor.get_text()
62+
63+
64+
def _find_split_point(content: str, start: int, end: int) -> int:
65+
for sep in ("\n\n", "\n", " "):
66+
idx = content.rfind(sep, start, end)
67+
if idx != -1 and idx > start:
68+
return idx + len(sep)
69+
return end
70+
71+
72+
def chunk_document(content: str, chunk_size: int = 800, overlap: int = 150) -> List[Dict[str, int | str]]:
73+
chunks = []
74+
text_len = len(content)
75+
if text_len == 0:
76+
return chunks
77+
78+
start = 0
79+
while start < text_len:
80+
end = min(start + chunk_size, text_len)
81+
if end < text_len:
82+
end = _find_split_point(content, start, end)
83+
chunk_text = content[start:end]
84+
if chunk_text.strip():
85+
chunks.append({
86+
"text": chunk_text,
87+
"start_char": start,
88+
"end_char": end,
89+
})
90+
if end >= text_len:
91+
break
92+
start = max(0, end - overlap)
93+
94+
return chunks
95+
96+
97+
def _line_range(content: str, start_char: int, end_char: int) -> Tuple[int, int]:
98+
start_line = content.count("\n", 0, start_char) + 1
99+
end_line = content.count("\n", 0, end_char) + 1
100+
return start_line, end_line
101+
102+
103+
def _iter_issues(json_path: Path) -> Iterable[Dict]:
104+
with json_path.open("r", encoding="utf-8", errors="ignore") as f:
105+
first_char = f.read(1)
106+
f.seek(0)
107+
if first_char == "[":
108+
try:
109+
import ijson # type: ignore
110+
for item in ijson.items(f, "item"):
111+
yield item
112+
return
113+
except Exception:
114+
pass
115+
data = json.load(f)
116+
for item in data:
117+
yield item
118+
else:
119+
for line in f:
120+
line = line.strip()
121+
if not line:
122+
continue
123+
yield json.loads(line)
124+
125+
126+
def _extract_issue_text(issue: Dict) -> Tuple[str, str, str]:
    """Pull (issue_id, url, combined plain text) out of a raw issue record.

    Field names vary across exports, so several aliases are tried for the
    id, URL, title, description, and comment bodies. HTML fields are
    reduced to plain text; empty pieces are dropped.
    """
    issue_id = issue.get("issue_id") or issue.get("id") or issue.get("issueId") or ""
    url = issue.get("url") or issue.get("issue_url") or issue.get("issueUrl") or ""
    title = issue.get("title") or issue.get("summary") or ""

    parts: List[str] = []
    if title:
        parts.append(f"Title: {title}")

    # First truthy description-like field wins.
    description_html = ""
    for key in ("description_html", "description", "body_html", "body", "content"):
        if issue.get(key):
            description_html = issue[key]
            break
    description_text = _html_to_text(description_html)
    if description_text:
        parts.append(description_text)

    comments = issue.get("comments") or issue.get("comment") or []
    if isinstance(comments, list):
        for comment in comments:
            if not isinstance(comment, dict):
                continue
            raw_comment = ""
            for key in ("comment_html", "content_html", "comment", "content"):
                if comment.get(key):
                    raw_comment = comment[key]
                    break
            comment_text = _html_to_text(raw_comment)
            if comment_text:
                parts.append(comment_text)

    return str(issue_id), str(url), "\n".join(parts).strip()
164+
165+
166+
def _stable_id(value: str) -> str:
167+
return hashlib.sha1(value.encode("utf-8")).hexdigest()
168+
169+
170+
def collect_issues(json_path: Path) -> List[Dict[str, object]]:
    """Build chunk-level document records for every issue in *json_path*.

    Each issue's combined text is chunked; every chunk becomes one record
    carrying retrieval metadata (ids, URL, chunk/char/line positions) plus
    a human-readable context header used at embedding time. Issues with no
    usable text are skipped. A progress line is printed whenever the
    record count crosses a multiple of 200.
    """
    records: List[Dict[str, object]] = []

    for raw_issue in _iter_issues(json_path):
        issue_id, url, text = _extract_issue_text(raw_issue)
        if not text:
            continue

        pieces = chunk_document(text)
        if not pieces:
            continue

        n_pieces = len(pieces)
        parent_id = _stable_id(f"issue:{issue_id}")
        for idx, piece in enumerate(pieces):
            first_line, last_line = _line_range(text, piece["start_char"], piece["end_char"])
            record_id = _stable_id(f"{issue_id}:{idx}:{piece['start_char']}:{piece['end_char']}")
            header = (
                f"Topic: Chromium Issue\n"
                f"Issue: {issue_id}\n"
                f"URL: {url}\n"
                f"Chunk: {idx + 1}/{n_pieces}\n"
                f"Chars: {piece['start_char']}-{piece['end_char']}"
            )
            records.append({
                "doc_id": record_id,
                "issue_id": issue_id,
                "url": url,
                "topic": "Chromium Issue",
                "doc_type": "issue",
                "source": "chromium_issues",
                "parent_id": parent_id,
                "content": piece["text"],
                "context": header,
                "chunk_index": idx,
                "total_chunks": n_pieces,
                "start_char": piece["start_char"],
                "end_char": piece["end_char"],
                "start_line": first_line,
                "end_line": last_line,
                "char_range": f"{piece['start_char']}-{piece['end_char']}",
            })

        if len(records) % 200 == 0:
            print(f"Collected {len(records)} issue chunks...")

    return records
217+
218+
219+
def _embedding_text(doc: Dict[str, object]) -> str:
220+
context = str(doc.get("context", "")).strip()
221+
content = str(doc.get("content", "")).strip()
222+
if context:
223+
return f"{context}\n\n{content}".strip()
224+
return content
225+
226+
227+
def create_vector_db(documents: List[Dict[str, object]], output_dir: Path):
    """Embed issue chunks and persist the retrieval artifacts.

    Writes into *output_dir*: a FAISS exact-L2 index over all-MiniLM-L6-v2
    embeddings, the chunk metadata as JSON, a pickle of the model name,
    and (when the bm25_sqlite helper imported successfully) a BM25 SQLite
    index over the same embedding texts.

    Args:
        documents: Chunk records produced by collect_issues().
        output_dir: Directory for the artifacts; created if missing.
    """
    # Hoisted out of the file-writing `with` block; stdlib import kept
    # local to match the file's lazy-import style.
    import pickle

    print(f"\nCreating embeddings for {len(documents)} issue chunks...")

    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Embed context header + content so the metadata informs the vector.
    contents = [_embedding_text(doc) for doc in documents]
    embeddings = model.encode(contents, show_progress_bar=True, convert_to_numpy=True)

    print(f"\nCreated embeddings with shape: {embeddings.shape}")

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)  # exact (brute-force) L2 search
    index.add(embeddings.astype("float32"))  # faiss requires float32

    output_dir.mkdir(parents=True, exist_ok=True)

    faiss.write_index(index, str(output_dir / "chromium_issues_rag.index"))

    with open(output_dir / "chromium_issues_rag_metadata.json", "w") as f:
        json.dump(documents, f, indent=2)

    with open(output_dir / "chromium_issues_rag_model.pkl", "wb") as f:
        pickle.dump("all-MiniLM-L6-v2", f)

    # Optional keyword index; None when bm25_sqlite failed to import.
    if create_bm25_index is not None:
        bm25_path = output_dir / "chromium_issues_rag_bm25.sqlite"
        bm25_docs = [{"doc_id": doc.get("doc_id"), "bm25_text": _embedding_text(doc)} for doc in documents]
        create_bm25_index(bm25_docs, bm25_path)
        print(" - BM25: chromium_issues_rag_bm25.sqlite")

    print(f"\nVector database saved to {output_dir}")
    print(" - Index: chromium_issues_rag.index")
    print(" - Metadata: chromium_issues_rag_metadata.json")
    print(" - Model info: chromium_issues_rag_model.pkl")
    print(f"\nTotal issue chunks indexed: {len(documents)}")
263+
264+
265+
def main():
    """CLI entry point: resolve paths, collect issue chunks, build the DB."""
    # Input JSON: positional CLI argument wins over the env var, which
    # wins over the hard-coded default path.
    json_path = Path(os.getenv(
        "CHROMIUM_ISSUES_JSON",
        "/mnt/vdc/chromium_scraping/chromium_issues_from_tracker.json",
    )).expanduser()
    if len(sys.argv) > 1:
        json_path = Path(sys.argv[1]).expanduser()

    if not json_path.exists():
        print(f"Error: Chromium issues JSON not found: {json_path}")
        sys.exit(1)

    # Output directory: RAG_BASE_DIR env var, else ../rag_db next to this script.
    fallback_base = Path(__file__).resolve().parent.parent / "rag_db"
    base_dir = Path(os.getenv("RAG_BASE_DIR", str(fallback_base))).expanduser()
    target_dir = base_dir / "chromium_issues_rag"

    print(f"Loading issues from: {json_path.resolve()}")
    chunk_docs = collect_issues(json_path)
    if not chunk_docs:
        print("No issue content found to index!")
        return

    create_vector_db(chunk_docs, target_dir)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)