add compute embeddings script (#75)

rderbier · web-flow · commit f8463d5c8100 · 2025-11-04T14:23:47.000-08:00
diff --git a/embeddings/computeEmbeddings.py b/embeddings/computeEmbeddings.py
@@ -0,0 +1,271 @@
+# !pip install pydgraph pybars3 sentence_transformers mistralai openai
+import sys
+import json
+import os
+import re
+import pydgraph
+from openai import OpenAI
+from mistralai.client import MistralClient
+
+from pybars import Compiler
+from sentence_transformers import SentenceTransformer
+
+
+# Example of embeddings.json
+# {
+# "embeddings" : [
+#   {
+#     "entityType":"Product",
+#     "attribute":"product_embedding",
+#     "index":"hnsw(metric: \"euclidean\")",
+#     "provider": "huggingface",
+#     "model":"sentence-transformers/all-MiniLM-L6-v2",
+#     "config" : {
+#         "dqlQuery" : "{ title:Product.title }",
+#         "template": "{{title}} "
+#     },
+#     "disabled": false
+#   }
+# ]
+# }
+#
+#  provider : huggingface or openai or mistral
+#  model : model name from the provider doing embeddings
+
+# TODO
+# check code for template loops.
+
+# pybars template compiler shared by all embedding definitions; buildEmbeddings()
+# uses it to compile each definition's "template" string.
+compiler = Compiler()
+global client  # Dgraph client is a global variable; it is assigned by setClient()


+# DGRAPH_CONNECTION must be defined in env variables
+# E.g dgraph://young-wind.us-east-1.aws.cloud.dgraph.io:443?sslmode=verify-ca&apikey=...
+# The value is handed unchanged to pydgraph.open() in setClient().
+assert "DGRAPH_CONNECTION" in os.environ, "DGRAPH_CONNECTION must be defined as a connection string"
+dgraph_cnx = os.environ["DGRAPH_CONNECTION"]
+
+
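+# NOTE(review): the four notes below appear to predate the connection-string setup;
+# nothing in this script reads TRANSFORMER_API_KEY, ADMIN_KEY or DGRAPH_GRPC.
+# Provider keys actually read here: OPENAI_API_KEY (openai) and MISTRAL_API_KEY (mistral).
+# TRANSFORMER_API_KEY must be defined in env variables
+# client stub for on-prem requires grpc host:port without protocol
+# client stub for cloud requires the grpc endpoint of graphql endpoint or base url of the cluster
+# to run on a self-hosted env, unset ADMIN_KEY and set DGRAPH_GRPC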
+
+
+def setClient():
+    global client
+    client = pydgraph.open(dgraph_cnx)
+
+
+
+def clearIndex(predicate):
+    print(f"remove index for {predicate}")
+    schema = f"{predicate}: float32vector ."
+    op = pydgraph.Operation(schema=schema)
+    alter = client.alter(op)
+    print(alter)
+
+
+def computeIndex(predicate, index):
+    print(f"create index for {predicate} {index}")
+    schema = f"{predicate}: float32vector @index({index}) ."
+    op = pydgraph.Operation(schema=schema)
+    alter = client.alter(op)
+    print(alter)
+
+
+def huggingfaceEmbeddings(model, sentences):
+    embeddings = model.encode(sentences)
+    return embeddings.tolist()
+
+
+def computeEmbedding(
+    predicate, data, template, provider, modelName, model, llm, dimensions
+):
+    # data is an array of objects contaiing uid and other predicates
+    # create an array of text
+    # get the embeddings
+    # produce a RDF text
+    # data is a list of object having uid and other predicates used in the template
+
+    nquad_list = []
+    sentences = [template(e) for e in data]
+
+    if "huggingface" == provider:
+        embeddings = huggingfaceEmbeddings(model, sentences)
+    elif "openai" == provider:
+        if dimensions is not None:
+            openaidata = llm.embeddings.create(
+                input=sentences,
+                model=modelName,
+                encoding_format="float",
+                dimensions=dimensions,
+            )
+        else:
+            openaidata = llm.embeddings.create(
+                input=sentences, model=modelName, encoding_format="float"
+            )
+        embeddings = [e.embedding for e in openaidata.data]
+    elif "mistral" == provider:
+        mistraldata = llm.embeddings(model=modelName, input=sentences)
+        embeddings = [e.embedding for e in mistraldata.data]
+
+    # embeddings is a list of vectors in the same order as the input data
+    try:
+        for i in range(0, len(data)):
+            uid = data[i]["uid"]
+            nquad_list.append(f'<{uid}> <{predicate}> "{embeddings[i]}" .')
+    # (prompt="{body[uid]}")
+    except Exception:
+        print(embeddings)
+    return nquad_list
+
+
+def mutate_rdf(nquads, client):
+    ret = {}
+    body = "\n".join(nquads)
+    if len(nquads) > 0:
+        txn = client.txn()
+        try:
+            res = txn.mutate(set_nquads=body)
+            txn.commit()
+            ret["nquads"] = (len(nquads),)
+            ret["total_ns"] = res.latency.total_ns
+        except Exception as inst:
+            print(inst)
+        finally:
+            txn.discard()
+    return ret
+
+
+def buildEmbeddings(embedding_def, only_missing=True, filehandle=sys.stdout):
+    global client
+    entity = embedding_def["entityType"]
+    config = embedding_def["config"]
+    provider = embedding_def["provider"]
+    modelName = embedding_def["model"]
+    dimensions = embedding_def["dimensions"] if "dimensions" in embedding_def else None
+    index = embedding_def["index"]
+
+    if "huggingface" == provider:
+        model = SentenceTransformer(modelName)
+        llmclient = None
+    else:
+        model = None
+        if "openai" == provider:
+            llmclient = OpenAI(
+                # This is the default and can be omitted
+                api_key=os.environ.get("OPENAI_API_KEY"),
+            )
+        elif "mistral" == provider:
+            assert "MISTRAL_API_KEY" in os.environ, "MISTRAL_API_KEY must be defined"
+            llmclient = MistralClient(api_key=os.environ.get("MISTRAL_API_KEY"))
+
+    predicate = f"{embedding_def['entityType']}.{embedding_def['attribute']}"
+
+    total = 0
+
+    template = compiler.compile(config["template"])
+    # inject uid in the query
+    # querypart = re.sub(r'([a-zA-Z_]+)',rf"\1:{entity}.\1",config['query'])
+    querypart = config["dqlQuery"]
+    querypart = querypart.replace("{", "{ uid ", 1)
+    print(querypart)
+    # remove index by updating DQL schema
+    clearIndex(predicate)
+    print(
+        f"compute embeddings for {predicate} using  model {modelName} from {provider}"
+    )
+    if only_missing:
+        filter = f"@filter( NOT has({predicate}))"
+    else:
+        filter = ""
+    # Run query.
+    after = ""
+    while True:
+        print(".")
+        txn = client.txn(read_only=True)
+        query = (
+            f"{{list(func: type({entity}),first:100 {after}) {filter}  {querypart} }}"
+        )
+        try:
+            res = txn.query(query)
+            data = json.loads(res.json)
+        except Exception as inst:
+            print(type(inst))  # the exception type
+            print(inst.args)  # arguments stored in .args
+            print(inst)
+            break
+        finally:
+            txn.discard()
+
+        if len(data["list"]) > 0:
+            last_uid = data["list"][-1]["uid"]
+            after = f",after:{last_uid}"
+        else:
+            break
+
+        nquads = computeEmbedding(
+            predicate,
+            data["list"],
+            template,
+            provider,
+            modelName,
+            model,
+            llmclient,
+            dimensions,
+        )
+        if filehandle is None:
+            mutate_rdf(nquads, client)
+        else:
+            filehandle.write("\n".join(nquads))
+        total += len(data["list"])
+
+    computeIndex(predicate, index)
+    return total
+
+
+def replace_env(matchobj):
+    key = matchobj.group(1)
+    assert key in os.environ, (
+        "config file is using a key not defined as environment variable: " + key
+    )
+    return os.environ.get(key)
+
+
+
+print("using connection string ")
+print(dgraph_cnx)
+if len(sys.argv) == 2:
+    outputfile = sys.argv[1]
+    print(f"Produce RDF file in {outputfile}")
+else:
+    outputfile = None
+    print("Mutate embeddings in cluster.")
+
+confirm = input("Continue (y/n)?")
+
+if confirm == "y":
+    q = input("Generate only missing embedding (y/n)?")
+    only_missing = q == "y"
+
+    re_env = re.compile(r"{{env.(\w*)}}")
+    setClient()
+
+    with open("./embeddings.json") as f:
+        data = f.read()
+        raw = re_env.sub(replace_env, data)
+        embeddings = json.loads(raw)
+
+        definitions = embeddings["embeddings"]
+
+        if outputfile is not None:
+            out = open(outputfile, "w")
+        else:
+            out = None
+        for embedding_def in definitions:
+            total = buildEmbeddings(embedding_def, only_missing, out)
+            print(
+                f"{total} embeddings for {embedding_def['entityType']}.{embedding_def['attribute']}"
+            )
+        if out is not None:
+            out.close()
diff --git a/embeddings/embeddings.json b/embeddings/embeddings.json
@@ -0,0 +1,28 @@
+{
+    "embeddings" : [
+        {
+            "entityType":"Product",
+            "attribute":"embedding",
+            "index":"hnsw(metric: \"euclidean\")",
+            "provider":"huggingface",
+            "model":"sentence-transformers/all-MiniLM-L6-v2",
+            "config" : {
+                "dqlQuery" : "{ title:Product.title }",
+                "template": "{{title}} "
+            },
+            "disabled": false
+        },
+        {
+            "entityType":"Service",
+            "attribute":"embedding",
+            "index":"hnsw(metric: \"euclidean\")",
+            "provider":"huggingface",
+            "model":"sentence-transformers/all-MiniLM-L6-v2",
+            "config" : {
+                "dqlQuery" : "{ title:Service.title }",
+                "template": "{{title}} "
+            },
+            "disabled": false
+        }
+    ]
+}
diff --git a/embeddings/reset-index.py b/embeddings/reset-index.py
@@ -0,0 +1,73 @@
+# !pip install  pydgraph python_graphql_client 
+import sys
+import json
+import os
+import re
+import pydgraph
+
+
+# Reset the index of all embedding predicates, or of one provided predicate name.
+# Should be replaced by deploying the GraphQL schema without indexes and then deploying it with the indexes.
+#
+global client # Dgraph client is a global variable; it is assigned by setClient()
+
+# DGRAPH_GRPC is always required; DGRAPH_ADMIN_KEY is required only when the
+# endpoint contains "cloud.dgraph". APIAdminKey stays None otherwise, which
+# makes setClient() use the plain client stub.
+assert "DGRAPH_GRPC" in os.environ, "DGRAPH_GRPC must be defined"
+dgraph_grpc = os.environ["DGRAPH_GRPC"]
+if "cloud.dgraph" in dgraph_grpc:
+    assert "DGRAPH_ADMIN_KEY" in os.environ, "DGRAPH_ADMIN_KEY must be defined"
+    APIAdminKey = os.environ["DGRAPH_ADMIN_KEY"]
+else:
+    APIAdminKey = None
+
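+# NOTE(review): TRANSFORMER_API_KEY is not read anywhere in this script - confirm whether the next line is stale.
+# TRANSFORMER_API_KEY must be defined in env variables
+# client stub for on-prem requires grpc host:port without protocol
+# client stub for cloud requires the grpc endpoint of graphql endpoint or base url of the cluster
+# to run on a self-hosted env, set DGRAPH_GRPC to an endpoint that does not contain "cloud.dgraph";
+# DGRAPH_ADMIN_KEY is then not needed (it is only required for "cloud.dgraph" endpoints)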
+
+def setClient():
+    global client
+    if APIAdminKey is None:
+      client_stub = pydgraph.DgraphClientStub(dgraph_grpc)
+    else:
+        client_stub = pydgraph.DgraphClientStub.from_cloud(dgraph_grpc,APIAdminKey )     
+    client = pydgraph.DgraphClient(client_stub)
+
+def clearIndex(predicate):
+    print(f"remove index for {predicate}")
+    schema = f"{predicate}: float32vector ."
+    op = pydgraph.Operation(schema=schema)
+    alter = client.alter(op)
+    print(alter)
+def computeIndex(predicate,index):
+    print(f"create index for {predicate} {index}")
+    schema = f"{predicate}: float32vector @index({index}) ."
+    op = pydgraph.Operation(schema=schema)
+    alter = client.alter(op)
+    print(alter) 
+
+
+
+if len(sys.argv) == 3:
+    requested_predicate = sys.argv[2]
+    print(f"Reindexing {requested_predicate} in {dgraph_grpc}")
+else:
+    requested_predicate = None    
+    print(f"Reindexing all embeddings predicates  in {dgraph_grpc}")
+    
+
+confirm = input("Continue (y/n)?")
+
+
+
+if confirm == "y":
+    setClient()
+    
+    with open("./embeddings.json") as f:
+        data = f.read()
+        hm_config = json.loads(data)
+        for embedding_def in hm_config['embeddings']:
+            predicate = f"{embedding_def['entityType']}.{embedding_def['attribute']}"
+            if requested_predicate == None or requested_predicate == predicate:
+                index = embedding_def['index']
+                clearIndex(predicate)
+                computeIndex(predicate,index)