11from dotenv import load_dotenv
2-
2+ from psycopg_pool import ConnectionPool
33import cocoindex
44import os
55
66
@cocoindex.transform_flow()
def text_to_embedding(
    text: cocoindex.DataSlice[str],
) -> cocoindex.DataSlice[list[float]]:
    """Embed *text* with a SentenceTransformer model.

    Shared between the indexing flow and the query path so that both
    sides use exactly the same embedding logic.
    """
    embedder = cocoindex.functions.SentenceTransformerEmbed(
        model="sentence-transformers/all-MiniLM-L6-v2"
    )
    return text.transform(embedder)
20+
21+
722@cocoindex .flow_def (name = "AmazonS3TextEmbedding" )
823def amazon_s3_text_embedding_flow (
924 flow_builder : cocoindex .FlowBuilder , data_scope : cocoindex .DataScope
@@ -19,7 +34,7 @@ def amazon_s3_text_embedding_flow(
1934 cocoindex .sources .AmazonS3 (
2035 bucket_name = bucket_name ,
2136 prefix = prefix ,
22- included_patterns = ["*.md" , "*.txt" , "*.docx" ],
37+ included_patterns = ["*.md" , "*.mdx" , "*.txt" , "*.docx" ],
2338 binary = False ,
2439 sqs_queue_url = sqs_queue_url ,
2540 )
@@ -36,11 +51,7 @@ def amazon_s3_text_embedding_flow(
3651 )
3752
3853 with doc ["chunks" ].row () as chunk :
39- chunk ["embedding" ] = chunk ["text" ].transform (
40- cocoindex .functions .SentenceTransformerEmbed (
41- model = "sentence-transformers/all-MiniLM-L6-v2"
42- )
43- )
54+ chunk ["embedding" ] = text_to_embedding (chunk ["text" ])
4455 doc_embeddings .collect (
4556 filename = doc ["filename" ],
4657 location = chunk ["location" ],
@@ -61,34 +72,45 @@ def amazon_s3_text_embedding_flow(
6172 )
6273
6374
64- query_handler = cocoindex .query .SimpleSemanticsQueryHandler (
65- name = "SemanticsSearch" ,
66- flow = amazon_s3_text_embedding_flow ,
67- target_name = "doc_embeddings" ,
68- query_transform_flow = lambda text : text .transform (
69- cocoindex .functions .SentenceTransformerEmbed (
70- model = "sentence-transformers/all-MiniLM-L6-v2"
71- )
72- ),
73- default_similarity_metric = cocoindex .VectorSimilarityMetric .COSINE_SIMILARITY ,
74- )
def search(pool: ConnectionPool, query: str, top_k: int = 5):
    """Semantic search over the exported doc_embeddings table.

    Embeds *query* with the shared transform flow, runs a pgvector
    cosine-distance query, and returns the *top_k* closest chunks as
    dicts with "filename", "text" and "score" (1 - distance) keys.
    """
    # The export target's table name is derived from the flow definition above.
    table_name = cocoindex.utils.get_target_storage_default_name(
        amazon_s3_text_embedding_flow, "doc_embeddings"
    )
    # Reuse the indexing-side embedding logic for the query text.
    query_vector = text_to_embedding.eval(query)
    with pool.connection() as conn, conn.cursor() as cur:
        cur.execute(
            f"""
            SELECT filename, text, embedding <=> %s::vector AS distance
            FROM {table_name} ORDER BY distance LIMIT %s
            """,
            (query_vector, top_k),
        )
        rows = cur.fetchall()
    return [
        {"filename": filename, "text": text, "score": 1.0 - distance}
        for filename, text, distance in rows
    ]
7596
7697
def _main():
    """Interactive demo loop: read search queries from stdin and print matches.

    An empty query exits the loop.
    """
    # Fail fast with a clear message if the connection string is missing,
    # instead of handing None to ConnectionPool and getting an obscure error.
    database_url = os.environ.get("COCOINDEX_DATABASE_URL")
    if not database_url:
        raise RuntimeError(
            "COCOINDEX_DATABASE_URL is not set; cannot connect to the database."
        )
    # Use the pool as a context manager so connections are closed cleanly
    # when the loop exits (the original leaked the pool).
    with ConnectionPool(database_url) as pool:
        while True:
            query = input("Enter search query (or Enter to quit): ")
            if query == "":
                break
            # Run the query function with the database connection pool and the query.
            results = search(pool, query)
            print("\nSearch results:")
            for result in results:
                print(f"[{result['score']:.3f}] {result['filename']}")
                print(f"    {result['text']}")
                print("---")
            print()
92114
93115
94116if __name__ == "__main__" :
0 commit comments