Skip to content
This repository was archived by the owner on Dec 28, 2025. It is now read-only.
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
65 commits
Select commit Hold shift + click to select a range
74cf3c5
doc: update for inner repo (GraphPlatform-4190)
Dec 9, 2024
e9941ad
Merge branch 'master' of ssh://icode.baidu.com:8235/baidu/starhugegra…
imbajin Jan 3, 2025
d2e4e2e
Merge branch 'main' into master-icode
imbajin Jan 10, 2025
c5ec3ea
Merge branch 'main' into master-icode
HJ-Young Jan 16, 2025
12221c9
docs(client): update README.md
imbajin Jan 20, 2025
8343d3c
Merge remote-tracking branch 'github/main'
Feb 7, 2025
0036b2c
Merge branch 'main'
HJ-Young Feb 25, 2025
173b9b2
Merge branch 'main' into master-icode
imbajin Mar 3, 2025
c9ef9ed
fix(llm): enable concurrency config in rag answer
imbajin Mar 3, 2025
c933c58
Merge branch 'main' into master-icode
imbajin Mar 3, 2025
263758b
Merge branch 'main' into master-icode
imbajin Mar 6, 2025
8a0e6cd
Merge branch 'main' into master-icode
imbajin Mar 11, 2025
a60887d
GraphPlatform-4765 [Tech Task] vermeer-client框架开发
Mar 10, 2025
43cdcae
update llm settings
Apr 27, 2025
b4f1a51
set num_gremlin_generate_example 0
Apr 28, 2025
142a44e
disable text2gql by default
Apr 28, 2025
9f245a0
remove log info
Apr 28, 2025
87cb1cf
temperature 0.01
Apr 29, 2025
961f582
merge community
Apr 29, 2025
4cbb47a
modify prompt to only output gql
Apr 29, 2025
36116e6
test_api_connection
May 7, 2025
5db4e0c
merge main to master
May 12, 2025
bf19a84
empty chunk
May 12, 2025
026c65c
build, get and remove property embeddings
May 19, 2025
f2ee374
limit the number of props to be updated
May 19, 2025
0e98879
disable pylint and modify limit logic
May 20, 2025
a3b864f
save after removing props
May 20, 2025
ac7e137
change key:value to value
May 20, 2025
8ace5a8
pv-embedding + set<(pk, pv)>
May 22, 2025
a9f92c4
vector/embedding api
May 22, 2025
18744d2
Merge branch 'main' into property_embedding
imbajin May 22, 2025
89b2d47
match keywords and props
May 22, 2025
f14087a
fix ollama batch embeddings
May 22, 2025
ef37855
fix ollama single embedding
May 23, 2025
c24d210
pylint
May 23, 2025
f1fdbdb
Merge branch 'main' into property_embedding
MrJs133 May 23, 2025
2953dbc
fix ollama
May 23, 2025
cf279f5
split run()
May 23, 2025
182ecba
using get_texts_embeddings instead of get_text_embeddings
May 23, 2025
b7e7425
match properties and change the structure of fuzzy_matched_props
May 23, 2025
52c40cb
property subgraph_query
May 23, 2025
10e76cd
pylint
May 26, 2025
923502a
pylint
May 26, 2025
739479a
limit 2 times one day
May 26, 2025
9acaa96
pylint
May 26, 2025
86e6098
inner
May 26, 2025
b27d925
Merge branch 'main' into master-icode
imbajin May 26, 2025
765e93f
Merge branch 'master-icode' into property_embedding
imbajin May 26, 2025
b5f31ff
format
May 26, 2025
50ec338
fix lint & comment
imbajin May 26, 2025
7d9d67c
text2gremlin api
May 26, 2025
c918e73
change params
May 26, 2025
7b7260a
change params
May 26, 2025
3754211
text to json
May 27, 2025
7a2cf2b
detail
May 27, 2025
7b3e5e2
add graph_space in text2gremlin api
May 27, 2025
cb31b35
add graph_space in text2gremlin api
May 27, 2025
9778c37
change default in text2gremlin api
May 27, 2025
cf6d2e4
split build_semantic_index.run()
May 28, 2025
eb5e9f1
Merge branch 'property_embedding' of https://icode.baidu.com/baidu/st…
May 28, 2025
e75f68f
conflict
May 28, 2025
4aa1a4d
change daily limit
May 29, 2025
7b7c6d2
create pyhugegraph client by token
May 29, 2025
31bf971
change param
May 29, 2025
a0e460c
name -> graph
May 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions hugegraph-llm/src/hugegraph_llm/api/vector_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


from fastapi import status, APIRouter


# pylint: disable=too-many-statements
def vector_http_api(
    router: APIRouter,
    update_embedding_func,
):
    """Attach the vector-embedding HTTP endpoints to *router*.

    :param router: FastAPI router the endpoint is registered on.
    :param update_embedding_func: zero-arg callable that rebuilds the
        graph embeddings; its return value is passed straight through
        as the HTTP response body.
    """

    @router.post("/vector/embedding", status_code=status.HTTP_200_OK)
    def update_embedding_api():
        # Delegate directly to the injected callable; no request body is read.
        return update_embedding_func()
4 changes: 3 additions & 1 deletion hugegraph-llm/src/hugegraph_llm/demo/rag_demo/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

from hugegraph_llm.api.admin_api import admin_http_api
from hugegraph_llm.api.rag_api import rag_http_api
from hugegraph_llm.api.vector_api import vector_http_api
from hugegraph_llm.config import admin_settings, huge_settings, prompt
from hugegraph_llm.demo.rag_demo.admin_block import create_admin_block, log_stream
from hugegraph_llm.demo.rag_demo.configs_block import (
Expand All @@ -32,6 +33,7 @@
apply_reranker_config,
apply_graph_config,
)
from hugegraph_llm.utils.graph_index_utils import update_vid_embedding
from hugegraph_llm.demo.rag_demo.other_block import create_other_block
from hugegraph_llm.demo.rag_demo.other_block import lifespan
from hugegraph_llm.demo.rag_demo.rag_block import create_rag_block, rag_answer
Expand Down Expand Up @@ -173,7 +175,7 @@ def create_app():
apply_reranker_config,
)
admin_http_api(api_auth, log_stream)

vector_http_api(api_auth, update_vid_embedding)
app.include_router(api_auth)
# Mount Gradio inside FastAPI
# TODO: support multi-user login when need
Expand Down
53 changes: 49 additions & 4 deletions hugegraph-llm/src/hugegraph_llm/models/embeddings/ollama.py
Original file line number Diff line number Diff line change
def get_text_embedding(
    self,
    text: str
) -> List[float]:
    """Embed a single text string via the configured Ollama client.

    Handles both client generations: modern clients expose ``embed``
    (batch-shaped response), legacy clients expose only ``embeddings``
    (single-prompt response).
    """
    if hasattr(self.client, "embed"):
        resp = self.client.embed(model=self.model, input=text)
        # Prefer a single "embedding" key; fall back to the batch-shaped
        # "embeddings" list holding one vector.
        try:
            return list(resp["embedding"])
        except KeyError:
            try:
                return list(resp["embeddings"][0])
            except (KeyError, IndexError) as e:
                raise RuntimeError(
                    "Failed to extract embedding from Ollama client 'embed' response. "
                    f"Response: {resp}. Error: {e}"
                ) from e
    if hasattr(self.client, "embeddings"):
        resp = self.client.embeddings(model=self.model, prompt=text)
        try:
            return list(resp["embedding"])
        except KeyError as e:
            raise RuntimeError(
                "Failed to extract embedding from Ollama client 'embeddings' response. "
                f"Response: {resp}. Error: {e}"
            ) from e
    raise AttributeError(
        "Ollama client object has neither 'embed' nor 'embeddings' method. "
        "Please check your ollama library version."
    )

def get_texts_embeddings(
self,
Expand All @@ -63,8 +96,20 @@ def get_texts_embeddings(
A list of embedding vectors, where each vector is a list of floats.
The order of embeddings matches the order of input texts.
"""
response = self.client.embed(model=self.model, input=texts)["embeddings"]
return [list(inner_sequence) for inner_sequence in response]
if hasattr(self.client, "embed"): # pylint: disable=no-else-return
response = self.client.embed(model=self.model, input=texts)["embeddings"]
return [list(inner_sequence) for inner_sequence in response]
elif hasattr(self.client, "embeddings"):
embeddings_list = []
for text_item in texts:
response_item = self.client.embeddings(model=self.model, prompt=text_item)
embeddings_list.append(list(response_item["embedding"]))
return embeddings_list
else:
raise AttributeError(
"Ollama client object has neither 'embed' nor 'embeddings' method. "
"Please check your ollama library version."
)

async def async_get_text_embedding(
self,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class FetchGraphData:

def __init__(self, graph: PyHugeClient):
    """Keep the client and cache a schema handle for later queries."""
    self.graph = graph
    # Cache the schema proxy once so run() can read index labels
    # without recreating it on every call.
    self.schema = graph.schema()

def run(self, graph_summary: Optional[Dict[str, Any]]) -> Dict[str, Any]:
if graph_summary is None:
Expand All @@ -49,4 +50,19 @@ def res = [:];

if isinstance(result, list) and len(result) > 0:
graph_summary.update({key: result[i].get(key) for i, key in enumerate(keys)})

index_labels = self.schema.getIndexLabels()
if index_labels:
graph_summary["index_labels"] = [
{
"id": label.id,
"base_type": label.baseType,
"base_value": label.baseValue,
"name": label.name,
"fields": label.fields,
"index_type": label.indexType
} for label in index_labels
]
else:
graph_summary["index_labels"] = []
return graph_summary
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,40 @@
import asyncio
import os
from typing import Any, Dict

from collections import defaultdict
from tqdm import tqdm

from hugegraph_llm.config import resource_path, huge_settings
from hugegraph_llm.indices.vector_index import VectorIndex
from hugegraph_llm.models.embeddings.base import BaseEmbedding
from hugegraph_llm.utils.log import log
from hugegraph_llm.operators.hugegraph_op.schema_manager import SchemaManager
from pyhugegraph.client import PyHugeClient

INDEX_PROPERTY_GREMLIN = """
g.V().hasLabel('{label}')
.limit(100000)
.project('vid', 'properties')
.by(id())
.by(valueMap({fields}))
.toList()
"""

class BuildSemanticIndex:
def __init__(self, embedding: BaseEmbedding):
    """Load the vid/property vector indexes and open a HugeGraph client.

    :param embedding: embedding model used to vectorize vids and
        property values.
    """
    graph_resource_dir = os.path.join(resource_path, huge_settings.graph_name)
    self.index_dir = str(os.path.join(graph_resource_dir, "graph_vids"))
    self.index_dir_prop = str(os.path.join(graph_resource_dir, "graph_props"))
    self.vid_index = VectorIndex.from_index_file(self.index_dir)
    self.prop_index = VectorIndex.from_index_file(self.index_dir_prop)
    self.embedding = embedding
    self.sm = SchemaManager(huge_settings.graph_name)
    # Connection settings all come from the global huge_settings; the
    # client is reused by run() for gremlin queries.
    self.client = PyHugeClient(
        url=huge_settings.graph_url,
        graph=huge_settings.graph_name,
        user=huge_settings.graph_user,
        pwd=huge_settings.graph_pwd,
        graphspace=huge_settings.graph_space,
    )

def _extract_names(self, vertices: list[str]) -> list[str]:
    """Strip the label prefix from PRIMARY_KEY-style vids ("label:name")."""
    names = []
    for vid in vertices:
        # NOTE(review): assumes every vid contains ':'; raises IndexError
        # otherwise — presumably guaranteed by the PRIMARY_KEY id strategy.
        names.append(vid.split(":")[1])
    return names
Expand Down Expand Up @@ -63,7 +82,26 @@ async def get_embeddings_with_semaphore(vid_list: list[str]) -> Any:
pbar.update(1)
return embeddings

def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
def diff_property_sets(
    self,
    present_prop_value_to_propset: dict,
    past_prop_value_to_propset: dict
):
    """Diff the current property-value map against the indexed one.

    Both arguments map a property value (str) to the frozenset of
    (property_key, property_value) pairs that value appears in.

    :return: tuple of four lists:
        - to_add: (value, propset) pairs that are new,
        - to_update: (value, propset) pairs whose propset changed,
        - to_remove: propsets whose value disappeared entirely,
        - to_update_remove: the stale propsets replaced by to_update.
    """
    to_add = []
    to_update = []
    to_update_remove = []
    # Values that vanished from the graph: drop their old propsets.
    to_remove = [
        propset
        for value, propset in past_prop_value_to_propset.items()
        if value not in present_prop_value_to_propset
    ]
    for value, current_set in present_prop_value_to_propset.items():
        previous_set = past_prop_value_to_propset.get(value)
        if previous_set is None:
            to_add.append((value, current_set))
        elif current_set != previous_set:
            # Changed value: re-embed the new set, discard the old one.
            to_update.append((value, current_set))
            to_update_remove.append((value, previous_set))
    return to_add, to_update, to_remove, to_update_remove

def run(self, context: Dict[str, Any]) -> Dict[str, Any]: # pylint: disable=too-many-statements, too-many-branches
vertexlabels = self.sm.schema.getSchema()["vertexlabels"]
all_pk_flag = all(data.get('id_strategy') == 'PRIMARY_KEY' for data in vertexlabels)

Expand All @@ -72,6 +110,8 @@ def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
present_vids = context["vertices"] # Warning: data truncated by fetch_graph_data.py
removed_vids = set(past_vids) - set(present_vids)
removed_num = self.vid_index.remove(removed_vids)
if removed_num:
self.vid_index.to_index_file(self.index_dir)
added_vids = list(set(present_vids) - set(past_vids))

if added_vids:
Expand All @@ -86,4 +126,88 @@ def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
"removed_vid_vector_num": removed_num,
"added_vid_vector_num": len(added_vids)
})

if context["index_labels"]:
# 1. get present prop_value_to_propset
results = []
for item in context["index_labels"]:
label = item["base_value"]
fields = item["fields"]
fields_str_list = [f"'{field}'" for field in fields]
fields_for_query = ", ".join(fields_str_list)
gremlin_query = INDEX_PROPERTY_GREMLIN.format(
label=label,
fields=fields_for_query
)
log.debug("gremlin_query: %s", gremlin_query)
result = self.client.gremlin().exec(gremlin=gremlin_query)["data"]
results.extend(result)
orig_present_prop_value_to_propset = defaultdict(set)
for item in results:
properties = item["properties"]
for prop_key, values in properties.items():
if not values:
continue
prop_value = str(values[0])
orig_present_prop_value_to_propset[prop_value].add((prop_key, prop_value))
present_prop_value_to_propset = {
k: frozenset(v)
for k, v in orig_present_prop_value_to_propset.items()
}
log.debug("present_prop_value_to_propset: %s", present_prop_value_to_propset)
# 2. get past prop_value_to_propset
orig_past_prop_value_to_propset = defaultdict(set)
for propset in self.prop_index.properties:
for prop_key, prop_value in propset:
orig_past_prop_value_to_propset[prop_value].update(propset)
past_prop_value_to_propset = {
k: frozenset(v)
for k, v in orig_past_prop_value_to_propset.items()
}
# 3. to add(add pk), to update(change pv), to_remove(delete pk), to_update_remove(change pv)
to_add, to_update, to_remove, to_update_remove = self.diff_property_sets(
present_prop_value_to_propset,
past_prop_value_to_propset
)
log.debug("to_add: %s", to_add)
log.debug("to_update: %s", to_update)
log.debug("to_remove: %s", to_remove)
log.debug("to_update_remove: %s", to_update_remove)
log.debug("prop properties.pkl: %s", self.prop_index.properties)
# 4. remove
log.info("Removing %s outdated property value", len(to_remove))
removed_props_num = self.prop_index.remove(to_remove)
if removed_props_num:
self.prop_index.to_index_file(self.index_dir_prop)
# 5. embedding
all_to_add = to_add + to_update
add_propsets = []
add_prop_values = []
for prop_value, propset in all_to_add:
add_propsets.append(propset)
add_prop_values.append(prop_value)
if add_prop_values:
if len(add_prop_values) > 100000:
log.warning("The number of props > 100000, please select which properties to vectorize.")
context.update({
"removed_props_num": removed_props_num,
"added_props_vector_num": "0 (because of exceeding limit)"
})
return context
if to_update_remove:
update_remove_prop_values = [prop_set for _, prop_set in to_update_remove]
removed_num = self.prop_index.remove(update_remove_prop_values)
self.prop_index.to_index_file(self.index_dir_prop)
log.info("In to_update: Removed %s outdated property set", removed_num)
added_props_embeddings = asyncio.run(self._get_embeddings_parallel(add_prop_values))
Copy link

Copilot AI May 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using asyncio.run in a synchronous method for embeddings retrieval may block the processing thread under high load. Consider refactoring to an asynchronous API endpoint or using background tasks to better handle concurrency.

Suggested change
added_props_embeddings = asyncio.run(self._get_embeddings_parallel(add_prop_values))
added_props_embeddings = await self._get_embeddings_parallel(add_prop_values)

Copilot uses AI. Check for mistakes.
self.prop_index.add(added_props_embeddings, add_propsets)
log.info("Added %s new or updated property embeddings", len(added_props_embeddings))
self.prop_index.to_index_file(self.index_dir_prop)
else:
log.debug("No update props to build vector index.")
context.update({
"removed_props_num": removed_props_num,
"added_props_vector_num": len(to_add)
})

return context
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@ def __init__(
vector_dis_threshold: float = huge_settings.vector_dis_threshold,
):
self.index_dir = str(os.path.join(resource_path, huge_settings.graph_name, "graph_vids"))
self.index_dir_prop = str(os.path.join(resource_path, huge_settings.graph_name, "graph_props"))
self.vector_index = VectorIndex.from_index_file(self.index_dir)
self.prop_index = VectorIndex.from_index_file(self.index_dir_prop)
self.embedding = embedding
self.by = by
self.topk_per_query = topk_per_query
Expand All @@ -51,6 +53,7 @@ def __init__(
pwd=huge_settings.graph_pwd,
graphspace=huge_settings.graph_space,
)
self.schema = self._client.schema()

def _exact_match_vids(self, keywords: List[str]) -> Tuple[List[str], List[str]]:
assert keywords, "keywords can't be empty, please check the logic"
Expand Down Expand Up @@ -82,6 +85,33 @@ def _fuzzy_match_vids(self, keywords: List[str]) -> List[str]:
fuzzy_match_result.extend(results[:self.topk_per_keyword])
return fuzzy_match_result

def _exact_match_properties(self, keywords: List[str]) -> Tuple[List[Tuple[Any, str]], List[str]]:
    """Find keywords that exactly match an indexed property value.

    For every schema property key, probe the graph with a `has(key, keyword)`
    Gremlin query; a keyword is "matched" as soon as any key accepts it.

    :param keywords: candidate keywords extracted from the user query.
    :return: (matched, unmatched) where matched is a list of
        (property_key, keyword) pairs and unmatched the remaining keywords.
        The original annotation claimed List[str] for the first element,
        but the code stores (key, keyword) tuples.
    """
    property_keys = self.schema.getPropertyKeys()
    log.debug("property_keys: %s", property_keys)
    matched_properties = set()
    unmatched_keywords = set(keywords)

    for key in property_keys:
        if not unmatched_keywords:
            # Every keyword already matched — skip the remaining queries.
            break
        for keyword in list(unmatched_keywords):
            # NOTE(review): keyword is interpolated into the Gremlin string
            # unescaped; a keyword containing a quote breaks/injects the
            # query — consider escaping or parameter binding.
            gremlin_query = f"g.V().has('{key.name}', '{keyword}').limit(1)"
            log.debug("prop Gremlin query: %s", gremlin_query)
            resp = self._client.gremlin().exec(gremlin_query)
            if resp.get("data"):
                matched_properties.add((key, keyword))
                unmatched_keywords.remove(keyword)

    return list(matched_properties), list(unmatched_keywords)

def _fuzzy_match_props(self, keywords: List[str]) -> List[str]:
    """Semantically match keywords against the property-value vector index.

    Each keyword is embedded and searched in the property index; up to
    ``topk_per_keyword`` hits per keyword (within the distance threshold)
    are collected.
    """
    matches = []
    for keyword in keywords:
        query_vector = self.embedding.get_text_embedding(keyword)
        hits = self.prop_index.search(query_vector, top_k=self.topk_per_keyword,
                                      dis_threshold=float(self.vector_dis_threshold))
        # search() already caps at top_k; the slice is a harmless safeguard.
        matches.extend(hits[:self.topk_per_keyword])
    return matches

def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
graph_query_list = set()
if self.by == "query":
Expand All @@ -101,5 +131,16 @@ def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
fuzzy_match_vids = self._fuzzy_match_vids(unmatched_vids)
log.debug("Fuzzy match vids: %s", fuzzy_match_vids)
graph_query_list.update(fuzzy_match_vids)
index_labels = self.schema.getIndexLabels()
if index_labels:
props_list = set()
exact_match_props, unmatched_props = self._exact_match_properties(keywords)
log.debug("Exact match props: %s", exact_match_props)
props_list.update(exact_match_props)
fuzzy_match_props = self._fuzzy_match_props(unmatched_props)
log.debug("Fuzzy match props: %s", fuzzy_match_props)
props_list.update(fuzzy_match_props)
context["match_props"] = list(props_list)
log.debug("Match props: %s", context["match_props"])
context["match_vids"] = list(graph_query_list)
return context
Loading
Loading