rag_play/inspect_chroma.py at main · mattcurf/rag_play · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#!/usr/bin/env python3
"""
inspect_chroma.py — Peek inside a persisted Chroma DB (no RAG required).

Examples:
  # basic peek
  python inspect_chroma.py --db ./database/chroma_db1 --collection langchain --limit 5

  # include embeddings and show their dimensionality
  python inspect_chroma.py --db ./database/chroma_db1 --include-embeddings

  # filter by metadata (JSON)
  python inspect_chroma.py --db ./database/chroma_db1 --where '{"source": "docs/file1.pdf"}'

  # filter by document substring
  python inspect_chroma.py --db ./database/chroma_db1 --where-document "neural network"

  # fetch specific IDs
  python inspect_chroma.py --db ./database/chroma_db1 --ids id1 id2 id3

  # combine filters with paging
  python inspect_chroma.py --db ./database/chroma_db1 --where '{"page": 2}' --limit 10 --offset 10
"""

import argparse
import json
import os
import sys
from typing import Any, Dict, List, Optional

import chromadb
from chromadb.config import Settings


def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the command-line options of the inspector.

    Args:
        argv: Argument list to parse, without the program name. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is what the
            script entry point relies on. Passing an explicit list makes the
            parser usable from other code and from tests.

    Returns:
        A namespace with the attributes ``db``, ``collection``, ``limit``,
        ``offset``, ``include_embeddings``, ``ids``, ``where`` and
        ``where_document``.

    Raises:
        SystemExit: raised by argparse on invalid arguments or ``--help``.
    """
    p = argparse.ArgumentParser(description="Inspect a persisted Chroma DB collection.")
    p.add_argument("--db", required=True, help="Path to Chroma persist directory (e.g., ./database/chroma_db1)")
    p.add_argument("--collection", default=None, help="Collection name (default: try 'langchain', else first found)")
    p.add_argument("--limit", type=int, default=5, help="Max items to preview (default: 5)")
    p.add_argument("--offset", type=int, default=0, help="Offset for paging (default: 0)")
    p.add_argument("--include-embeddings", action="store_true", help="Also fetch embeddings (slower)")
    # nargs="*" collects every following token up to the next option into a list.
    p.add_argument("--ids", nargs="*", default=None, help="Specific IDs to fetch (space-separated)")
    p.add_argument(
        "--where",
        default=None,
        help="JSON metadata filter, e.g. '{\"source\": \"/path/file.pdf\", \"page\": 3}'",
    )
    p.add_argument(
        "--where-document",
        default=None,
        help="Substring filter on document text (Chroma will handle as contains).",
    )
    return p.parse_args(argv)


def parse_where(where_str: Optional[str]) -> Optional[Dict[str, Any]]:
    """Turn the ``--where`` command-line text into a metadata filter dict.

    Args:
        where_str: Raw JSON text from the command line, or ``None``/empty
            when the option was not given.

    Returns:
        ``None`` when there is nothing to filter on (no text, or an empty
        JSON object). An object with a single top-level key is returned
        unchanged. An object with several top-level keys is rewritten to
        ``{"$and": [{k1: v1}, {k2: v2}, ...]}``, because Chroma's ``where``
        filter accepts exactly one top-level key and would otherwise reject
        the natural ``{"source": ..., "page": ...}`` form.

    Exits:
        Prints an error to stderr and calls ``sys.exit(2)`` when the text is
        not valid JSON or does not decode to a JSON object.
    """
    if not where_str:
        return None
    try:
        data = json.loads(where_str)
        if not isinstance(data, dict):
            raise ValueError("where must be a JSON object")
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        print(f"[ERROR] Failed to parse --where JSON: {e}", file=sys.stderr)
        sys.exit(2)

    if not data:
        # An empty object carries no condition; treat it as "no filter".
        return None
    if len(data) > 1:
        # Implicit AND across fields, spelled the way Chroma expects it.
        return {"$and": [{key: value} for key, value in data.items()]}
    return data


def safe_len_embedding(vec: Any) -> str:
    """Describe the dimensionality of one embedding vector as text.

    Returns ``"n/a"`` when no vector is present, the decimal length when the
    object supports ``len()``, and ``"unknown"`` when taking its length fails.
    """
    if vec is None:
        return "n/a"
    try:
        dimension = len(vec)
    except Exception:
        return "unknown"
    return str(dimension)


def main() -> None:
    """Print a preview of the records stored in a persisted Chroma collection.

    Steps:
      1. Open the persist directory given by ``--db`` and list its collections.
      2. Pick the collection: ``--collection`` if given, else ``langchain`` if
         present, else the first one listed.
      3. Fetch records with ``Collection.get`` using ``--ids``, ``--where``,
         ``--where-document``, ``--limit`` and ``--offset``.
      4. Print each record's ID, metadata, a 200-character text preview and,
         with ``--include-embeddings``, the embedding dimensionality.

    The tool is read-only: it refuses to run against a missing directory and
    never creates a collection.

    Exit codes (via ``sys.exit``):
      2 -- the DB directory is missing, the DB has no collections, the chosen
           collection cannot be opened, or ``--where`` is not a JSON object.
      3 -- ``Collection.get`` raised.
    """
    args = parse_args()

    # PersistentClient initialises a fresh store when pointed at a missing
    # path; a typo in --db must not leave a new empty database behind.
    if not os.path.isdir(args.db):
        print(f"[ERROR] Chroma DB directory not found: {args.db}", file=sys.stderr)
        sys.exit(2)

    client = chromadb.PersistentClient(
        path=args.db,
        settings=Settings(anonymized_telemetry=False),
    )

    # List collections. Some Chroma releases (0.6.x) return name strings here,
    # others return Collection objects; normalise to plain names.
    cols = client.list_collections()
    names: List[str] = [str(c) if isinstance(c, str) else c.name for c in cols]
    print("Collections:")
    for name in names:
        print(" -", name)

    # Resolve collection
    target_name = args.collection
    if not target_name:
        if "langchain" in names:
            target_name = "langchain"
        elif names:
            target_name = names[0]
        else:
            print("\nNo collections found in this DB.", file=sys.stderr)
            sys.exit(2)

    # get_collection (not get_or_create_collection): a misspelled name must
    # be reported, not silently turned into a new empty collection.
    try:
        col = client.get_collection(target_name)
    except Exception as e:
        print(f"[ERROR] Could not open collection '{target_name}': {e}", file=sys.stderr)
        sys.exit(2)

    count = col.count()
    print(f"\nCollection: {target_name}")
    print("Count:", count)
    if count == 0:
        print("Collection is empty.")
        return

    include = ["metadatas", "documents"]
    if args.include_embeddings:
        include.append("embeddings")

    where = parse_where(args.where)

    # Chroma expects where_document as an operator dict, not a bare string;
    # "$contains" is the substring match the --where-document help promises.
    where_document: Optional[Dict[str, Any]] = None
    if args.where_document:
        where_document = {"$contains": args.where_document}

    # Filters left as None mean "no filter" for Collection.get.
    get_kwargs: Dict[str, Any] = dict(
        where=where,
        where_document=where_document,
        limit=args.limit,
        offset=args.offset,
        include=include,
    )

    # Explicit IDs are combined with any where/where_document filter; the
    # paging arguments are still passed along unchanged.
    if args.ids:
        get_kwargs["ids"] = args.ids

    try:
        batch = col.get(**get_kwargs)
    except Exception as e:
        print(f"[ERROR] col.get failed: {e}", file=sys.stderr)
        sys.exit(3)

    # Normalize keys: fields that were not requested may be absent or None.
    ids: List[str] = batch.get("ids") or []
    docs: List[Optional[str]] = batch.get("documents") or []
    metas: List[Optional[Dict[str, Any]]] = batch.get("metadatas") or []
    embs = batch.get("embeddings", None)  # can be None or list/array of vectors

    # Render results
    print(f"\nReturned {len(ids)} item(s)")
    for i, _id in enumerate(ids):
        # Guard the parallel lists: they may be shorter than ids (e.g. empty).
        meta = metas[i] if i < len(metas) else None
        doc = docs[i] if i < len(docs) else None

        print("\nID:", _id)
        print("Meta:", meta)
        preview = (doc or "")[:200].replace("\n", " ")
        print("Text (first 200):", preview)

        if args.include_embeddings:
            dim = "n/a"
            if embs is not None:
                try:
                    vec = embs[i]
                except (IndexError, TypeError):
                    vec = None
                dim = safe_len_embedding(vec)
            print("Embedding dim:", dim)
        else:
            print("Embedding dim: (not loaded; use --include-embeddings)")

    # Helpful footer
    print("\nTip: Use --offset to page through results; combine --where and --where-document to narrow down chunks.")


# Run the inspection only when this file is executed as a script, so that
# importing the module does not trigger argument parsing or any DB access.
if __name__ == "__main__":
    main()