#!/usr/bin/env python3
"""
inspect_chroma.py — Peek inside a persisted Chroma DB (no RAG required).

Examples:
    # basic peek
    python inspect_chroma.py --db ./database/chroma_db1 --collection langchain --limit 5

    # include embeddings and show their dimensionality
    python inspect_chroma.py --db ./database/chroma_db1 --include-embeddings

    # filter by metadata (JSON)
    python inspect_chroma.py --db ./database/chroma_db1 --where '{"source": "docs/file1.pdf"}'

    # filter by document substring
    python inspect_chroma.py --db ./database/chroma_db1 --where-document "neural network"

    # fetch specific IDs
    python inspect_chroma.py --db ./database/chroma_db1 --ids id1 id2 id3

    # combine filters with paging
    python inspect_chroma.py --db ./database/chroma_db1 --where '{"page": 2}' --limit 10 --offset 10
"""
import argparse
import json
import sys
from typing import Any, Dict, List, Optional
import chromadb
from chromadb.config import Settings
def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the command-line options of the Chroma inspector.

    Args:
        argv: Argument strings to parse. The default ``None`` makes argparse
            read ``sys.argv[1:]``, which is what a plain ``parse_args()``
            call has always done; passing a list allows programmatic use.

    Returns:
        Namespace with the attributes ``db``, ``collection``, ``limit``,
        ``offset``, ``include_embeddings``, ``ids``, ``where`` and
        ``where_document``.

    Raises:
        SystemExit: Raised by argparse for ``--help`` or invalid arguments
            (for example a missing ``--db``).
    """
    p = argparse.ArgumentParser(description="Inspect a persisted Chroma DB collection.")
    p.add_argument("--db", required=True, help="Path to Chroma persist directory (e.g., ./database/chroma_db1)")
    p.add_argument("--collection", default=None, help="Collection name (default: try 'langchain', else first found)")
    p.add_argument("--limit", type=int, default=5, help="Max items to preview (default: 5)")
    p.add_argument("--offset", type=int, default=0, help="Offset for paging (default: 0)")
    p.add_argument("--include-embeddings", action="store_true", help="Also fetch embeddings (slower)")
    # nargs="*" means a bare "--ids" yields an empty list, while omitting the
    # flag yields None; callers treat both as "no IDs requested".
    p.add_argument("--ids", nargs="*", default=None, help="Specific IDs to fetch (space-separated)")
    p.add_argument(
        "--where",
        default=None,
        help="JSON metadata filter, e.g. '{\"source\": \"/path/file.pdf\", \"page\": 3}'",
    )
    p.add_argument(
        "--where-document",
        default=None,
        help="Substring filter on document text (Chroma will handle as contains).",
    )
    return p.parse_args(argv)
def parse_where(where_str: Optional[str]) -> Optional[Dict[str, Any]]:
    """Decode the ``--where`` JSON text into a metadata filter dict.

    Args:
        where_str: Raw option value. ``None`` or an empty string means that
            no metadata filter was requested.

    Returns:
        ``None`` when no filter was supplied, otherwise the decoded JSON
        object. An object with more than one top-level key is rewritten as
        ``{"$and": [{k1: v1}, {k2: v2}, ...]}``.

    Raises:
        SystemExit: With status 2, after printing a message to stderr, when
            the text is not valid JSON or does not decode to a JSON object.
    """
    if not where_str:
        return None
    try:
        data = json.loads(where_str)
        if not isinstance(data, dict):
            raise ValueError("where must be a JSON object")
    except (TypeError, ValueError, RecursionError) as e:
        # json.JSONDecodeError is a ValueError; TypeError covers a non-string
        # argument and RecursionError covers pathologically nested input.
        print(f"[ERROR] Failed to parse --where JSON: {e}", file=sys.stderr)
        sys.exit(2)
    if len(data) > 1:
        # NOTE(review): Chroma's filter validation is understood to require
        # exactly one top-level key/operator, so the multi-key form shown in
        # the --where help text would be rejected as-is -- confirm against
        # the installed chromadb version. An explicit $and has the same
        # meaning as an implicit AND of the keys, so this is safe either way.
        return {"$and": [{key: value} for key, value in data.items()]}
    return data
def safe_len_embedding(vec: Any) -> str:
    """Return the length of an embedding vector as display text.

    Args:
        vec: The embedding to measure, or ``None`` when there is none.

    Returns:
        ``"n/a"`` if ``vec`` is ``None``, ``str(len(vec))`` when ``len()``
        works on it, and ``"unknown"`` when ``len()`` raises.
    """
    if vec is None:
        return "n/a"
    try:
        return str(len(vec))
    except Exception:
        # Deliberately broad: this is a best-effort display helper and must
        # never abort the listing because of an oddly-typed vector.
        return "unknown"
def _collection_name(entry: Any) -> str:
    """Return the name of one ``list_collections()`` entry."""
    # NOTE(review): depending on the chromadb release the entries are
    # understood to be either Collection objects (with ``.name``) or plain
    # name strings; accepting both keeps the listing working -- confirm.
    return str(getattr(entry, "name", entry))


def main() -> None:
    """Print a preview of one collection in a persisted Chroma database.

    Lists the collections found under ``--db``, picks the requested one (or
    ``langchain``, or the first one found), prints its item count and then a
    preview of the items returned by ``Collection.get`` for the given
    filters, IDs and paging options.

    The tool is read-only: a missing database directory or an unknown
    collection name is reported as an error instead of being created.

    Raises:
        SystemExit: Status 2 when the database directory or the collection
            cannot be found (or ``--where`` is invalid), status 3 when the
            ``get`` call itself fails.
    """
    import os  # local import: only this function needs it

    args = parse_args()

    # PersistentClient would create the directory if it does not exist,
    # which is surprising for an inspection tool, so check first.
    if not os.path.isdir(args.db):
        print(f"[ERROR] Chroma directory not found: {args.db}", file=sys.stderr)
        sys.exit(2)

    client = chromadb.PersistentClient(
        path=args.db,
        settings=Settings(anonymized_telemetry=False),
    )

    # List collections
    names = [_collection_name(entry) for entry in client.list_collections()]
    print("Collections:")
    for name in names:
        print(" -", name)

    # Resolve collection
    target_name = args.collection
    if not target_name:
        if "langchain" in names:
            target_name = "langchain"
        elif names:
            target_name = names[0]
        else:
            print("\nNo collections found in this DB.", file=sys.stderr)
            sys.exit(2)

    # get_collection (not get_or_create_collection): a mistyped name must
    # not silently add an empty collection to the database being inspected.
    try:
        col = client.get_collection(target_name)
    except Exception as e:
        print(f"[ERROR] Could not open collection {target_name!r}: {e}", file=sys.stderr)
        sys.exit(2)

    count = col.count()
    print(f"\nCollection: {target_name}")
    print("Count:", count)
    if count == 0:
        print("Collection is empty.")
        return

    include = ["metadatas", "documents"]
    if args.include_embeddings:
        include.append("embeddings")

    where = parse_where(args.where)

    # Prepare query kwargs.
    # NOTE(review): the original author states that Chroma ignores fields
    # that are None -- confirm against the installed chromadb version.
    get_kwargs: Dict[str, Any] = dict(
        where=where,
        where_document=args.where_document,
        limit=args.limit,
        offset=args.offset,
        include=include,
    )

    # NOTE(review): the original author states that when IDs are supplied,
    # Chroma ignores the paging args and returns those IDs specifically, and
    # that where/where_document may still be combined with them -- confirm.
    if args.ids:
        get_kwargs["ids"] = args.ids

    try:
        batch = col.get(**get_kwargs)
    except Exception as e:
        print(f"[ERROR] col.get failed: {e}", file=sys.stderr)
        sys.exit(3)

    # Normalize keys
    ids: List[str] = batch.get("ids") or []
    docs: List[Optional[str]] = batch.get("documents") or []
    metas: List[Optional[Dict[str, Any]]] = batch.get("metadatas") or []
    # Compared against None below rather than tested for truthiness, because
    # an array-like result may not support a plain boolean test.
    embs = batch.get("embeddings", None)

    # Render results
    print(f"\nReturned {len(ids)} item(s)")
    for i, _id in enumerate(ids):
        meta = metas[i] if i < len(metas) else None
        doc = docs[i] if i < len(docs) else None
        print("\nID:", _id)
        print("Meta:", meta)
        preview = (doc or "")[:200].replace("\n", " ")
        print("Text (first 200):", preview)
        if args.include_embeddings:
            dim = "n/a"
            if embs is not None:
                try:
                    vec = embs[i]
                except (IndexError, TypeError):
                    vec = None
                dim = safe_len_embedding(vec)
            print("Embedding dim:", dim)
        else:
            print("Embedding dim: (not loaded; use --include-embeddings)")

    # Helpful footer
    print("\nTip: Use --offset to page through results; combine --where and --where-document to narrow down chunks.")
# Run the inspector only when executed as a script, not when imported.
if __name__ == "__main__":
    main()