Skip to content
This repository was archived by the owner on Dec 28, 2025. It is now read-only.
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
65 commits
Select commit Hold shift + click to select a range
74cf3c5
doc: update for inner repo (GraphPlatform-4190)
Dec 9, 2024
e9941ad
Merge branch 'master' of ssh://icode.baidu.com:8235/baidu/starhugegra…
imbajin Jan 3, 2025
d2e4e2e
Merge branch 'main' into master-icode
imbajin Jan 10, 2025
c5ec3ea
Merge branch 'main' into master-icode
HJ-Young Jan 16, 2025
12221c9
docs(client): update README.md
imbajin Jan 20, 2025
8343d3c
Merge remote-tracking branch 'github/main'
Feb 7, 2025
0036b2c
Merge branch 'main'
HJ-Young Feb 25, 2025
173b9b2
Merge branch 'main' into master-icode
imbajin Mar 3, 2025
c9ef9ed
fix(llm): enable concurrency config in rag answer
imbajin Mar 3, 2025
c933c58
Merge branch 'main' into master-icode
imbajin Mar 3, 2025
263758b
Merge branch 'main' into master-icode
imbajin Mar 6, 2025
8a0e6cd
Merge branch 'main' into master-icode
imbajin Mar 11, 2025
a60887d
GraphPlatform-4765 [Tech Task] vermeer-client框架开发
Mar 10, 2025
43cdcae
update llm settings
Apr 27, 2025
b4f1a51
set num_gremlin_generate_example 0
Apr 28, 2025
142a44e
disable text2gql by default
Apr 28, 2025
9f245a0
remove log info
Apr 28, 2025
87cb1cf
temperature 0.01
Apr 29, 2025
961f582
merge community
Apr 29, 2025
4cbb47a
modify prompt to only output gql
Apr 29, 2025
36116e6
test_api_connection
May 7, 2025
5db4e0c
merge main to master
May 12, 2025
bf19a84
empty chunk
May 12, 2025
026c65c
build, get and remove property embeddings
May 19, 2025
f2ee374
limit the number of props to be updated
May 19, 2025
0e98879
disable pylint and modify limit logic
May 20, 2025
a3b864f
save after removing props
May 20, 2025
ac7e137
change key:value to value
May 20, 2025
8ace5a8
pv-embedding + set<(pk, pv)>
May 22, 2025
a9f92c4
vector/embedding api
May 22, 2025
18744d2
Merge branch 'main' into property_embedding
imbajin May 22, 2025
89b2d47
match keywords and props
May 22, 2025
f14087a
fix ollama batch embeddings
May 22, 2025
ef37855
fix ollama single embedding
May 23, 2025
c24d210
pylint
May 23, 2025
f1fdbdb
Merge branch 'main' into property_embedding
MrJs133 May 23, 2025
2953dbc
fix ollama
May 23, 2025
cf279f5
split run()
May 23, 2025
182ecba
using get_texts_embeddings instead of get_text_embeddings
May 23, 2025
b7e7425
match properties and change the structure of fuzzy_matched_props
May 23, 2025
52c40cb
property subgraph_query
May 23, 2025
10e76cd
pylint
May 26, 2025
923502a
pylint
May 26, 2025
739479a
limit 2 times one day
May 26, 2025
9acaa96
pylint
May 26, 2025
86e6098
inner
May 26, 2025
b27d925
Merge branch 'main' into master-icode
imbajin May 26, 2025
765e93f
Merge branch 'master-icode' into property_embedding
imbajin May 26, 2025
b5f31ff
format
May 26, 2025
50ec338
fix lint & comment
imbajin May 26, 2025
7d9d67c
text2gremlin api
May 26, 2025
c918e73
change params
May 26, 2025
7b7260a
change params
May 26, 2025
3754211
text to json
May 27, 2025
7a2cf2b
detail
May 27, 2025
7b3e5e2
add graph_space in text2gremlin api
May 27, 2025
cb31b35
add graph_space in text2gremlin api
May 27, 2025
9778c37
change default in text2gremlin api
May 27, 2025
cf6d2e4
split build_semantic_index.run()
May 28, 2025
eb5e9f1
Merge branch 'property_embedding' of https://icode.baidu.com/baidu/st…
May 28, 2025
e75f68f
conflict
May 28, 2025
4aa1a4d
change daily limit
May 29, 2025
7b7c6d2
create pyhugegraph client by token
May 29, 2025
31bf971
change param
May 29, 2025
a0e460c
name -> graph
May 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions hugegraph-llm/src/hugegraph_llm/api/vector_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


from fastapi import status, APIRouter


# pylint: disable=too-many-statements
def vector_http_api(
    router: APIRouter,
    update_embedding_func,
):
    """Attach the vector-embedding HTTP endpoints to *router*.

    :param router: FastAPI router the endpoint is registered on.
    :param update_embedding_func: zero-arg callable that rebuilds the
        graph embeddings; its return value is passed straight through
        as the HTTP response body.
    """

    @router.post("/vector/embedding", status_code=status.HTTP_200_OK)
    def update_embedding_api():
        # Delegate directly to the injected callable; no request body is read.
        return update_embedding_func()
4 changes: 3 additions & 1 deletion hugegraph-llm/src/hugegraph_llm/demo/rag_demo/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

from hugegraph_llm.api.admin_api import admin_http_api
from hugegraph_llm.api.rag_api import rag_http_api
from hugegraph_llm.api.vector_api import vector_http_api
from hugegraph_llm.config import admin_settings, huge_settings, prompt
from hugegraph_llm.demo.rag_demo.admin_block import create_admin_block, log_stream
from hugegraph_llm.demo.rag_demo.configs_block import (
Expand All @@ -32,6 +33,7 @@
apply_reranker_config,
apply_graph_config,
)
from hugegraph_llm.utils.graph_index_utils import update_vid_embedding
from hugegraph_llm.demo.rag_demo.other_block import create_other_block
from hugegraph_llm.demo.rag_demo.other_block import lifespan
from hugegraph_llm.demo.rag_demo.rag_block import create_rag_block, rag_answer
Expand Down Expand Up @@ -173,7 +175,7 @@ def create_app():
apply_reranker_config,
)
admin_http_api(api_auth, log_stream)

vector_http_api(api_auth, update_vid_embedding)
app.include_router(api_auth)
# Mount Gradio inside FastAPI
# TODO: support multi-user login when need
Expand Down
53 changes: 49 additions & 4 deletions hugegraph-llm/src/hugegraph_llm/models/embeddings/ollama.py
Original file line number Diff line number Diff line change
def get_text_embedding(
    self,
    text: str
) -> List[float]:
    """Embed a single text string via the configured Ollama client.

    Handles both client generations: modern clients expose ``embed``
    (batch-shaped response), legacy clients expose only ``embeddings``
    (single-prompt response).
    """
    if hasattr(self.client, "embed"):
        resp = self.client.embed(model=self.model, input=text)
        # Prefer a single "embedding" key; fall back to the batch-shaped
        # "embeddings" list holding one vector.
        try:
            return list(resp["embedding"])
        except KeyError:
            try:
                return list(resp["embeddings"][0])
            except (KeyError, IndexError) as e:
                raise RuntimeError(
                    "Failed to extract embedding from Ollama client 'embed' response. "
                    f"Response: {resp}. Error: {e}"
                ) from e
    if hasattr(self.client, "embeddings"):
        resp = self.client.embeddings(model=self.model, prompt=text)
        try:
            return list(resp["embedding"])
        except KeyError as e:
            raise RuntimeError(
                "Failed to extract embedding from Ollama client 'embeddings' response. "
                f"Response: {resp}. Error: {e}"
            ) from e
    raise AttributeError(
        "Ollama client object has neither 'embed' nor 'embeddings' method. "
        "Please check your ollama library version."
    )

def get_texts_embeddings(
self,
Expand All @@ -63,8 +96,20 @@ def get_texts_embeddings(
A list of embedding vectors, where each vector is a list of floats.
The order of embeddings matches the order of input texts.
"""
response = self.client.embed(model=self.model, input=texts)["embeddings"]
return [list(inner_sequence) for inner_sequence in response]
if hasattr(self.client, "embed"): # pylint: disable=no-else-return
response = self.client.embed(model=self.model, input=texts)["embeddings"]
return [list(inner_sequence) for inner_sequence in response]
elif hasattr(self.client, "embeddings"):
embeddings_list = []
for text_item in texts:
response_item = self.client.embeddings(model=self.model, prompt=text_item)
embeddings_list.append(list(response_item["embedding"]))
return embeddings_list
else:
raise AttributeError(
"Ollama client object has neither 'embed' nor 'embeddings' method. "
"Please check your ollama library version."
)

async def async_get_text_embedding(
self,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class FetchGraphData:

def __init__(self, graph: PyHugeClient):
    """Keep the client and cache a schema handle for later queries."""
    self.graph = graph
    # Cache the schema proxy once so run() can read index labels
    # without recreating it on every call.
    self.schema = graph.schema()

def run(self, graph_summary: Optional[Dict[str, Any]]) -> Dict[str, Any]:
if graph_summary is None:
Expand All @@ -49,4 +50,19 @@ def res = [:];

if isinstance(result, list) and len(result) > 0:
graph_summary.update({key: result[i].get(key) for i, key in enumerate(keys)})

index_labels = self.schema.getIndexLabels()
if index_labels:
graph_summary["index_labels"] = [
{
"id": label.id,
"base_type": label.baseType,
"base_value": label.baseValue,
"name": label.name,
"fields": label.fields,
"index_type": label.indexType
} for label in index_labels
]
else:
graph_summary["index_labels"] = []
return graph_summary
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,40 @@
import asyncio
import os
from typing import Any, Dict

from collections import defaultdict
from tqdm import tqdm

from hugegraph_llm.config import resource_path, huge_settings
from hugegraph_llm.indices.vector_index import VectorIndex
from hugegraph_llm.models.embeddings.base import BaseEmbedding
from hugegraph_llm.utils.log import log
from hugegraph_llm.operators.hugegraph_op.schema_manager import SchemaManager
from pyhugegraph.client import PyHugeClient

INDEX_PROPERTY_GREMLIN = """
g.V().hasLabel('{label}')
.limit(100000)
.project('vid', 'properties')
.by(id())
.by(valueMap({fields}))
.toList()
"""

class BuildSemanticIndex:
def __init__(self, embedding: BaseEmbedding):
    """Load the vid/property vector indexes and open a HugeGraph client.

    :param embedding: embedding model used to vectorize vids and
        property values.
    """
    graph_resource_dir = os.path.join(resource_path, huge_settings.graph_name)
    self.index_dir = str(os.path.join(graph_resource_dir, "graph_vids"))
    self.index_dir_prop = str(os.path.join(graph_resource_dir, "graph_props"))
    self.vid_index = VectorIndex.from_index_file(self.index_dir)
    self.prop_index = VectorIndex.from_index_file(self.index_dir_prop)
    self.embedding = embedding
    self.sm = SchemaManager(huge_settings.graph_name)
    # Connection settings all come from the global huge_settings; the
    # client is reused by run() for gremlin queries.
    self.client = PyHugeClient(
        url=huge_settings.graph_url,
        graph=huge_settings.graph_name,
        user=huge_settings.graph_user,
        pwd=huge_settings.graph_pwd,
        graphspace=huge_settings.graph_space,
    )

def _extract_names(self, vertices: list[str]) -> list[str]:
    """Strip the label prefix from PRIMARY_KEY-style vids ("label:name")."""
    names = []
    for vid in vertices:
        # NOTE(review): assumes every vid contains ':'; raises IndexError
        # otherwise — presumably guaranteed by the PRIMARY_KEY id strategy.
        names.append(vid.split(":")[1])
    return names
Expand Down Expand Up @@ -63,7 +82,26 @@ async def get_embeddings_with_semaphore(vid_list: list[str]) -> Any:
pbar.update(1)
return embeddings

def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
def diff_property_sets(
    self,
    present_prop_value_to_propset: dict,
    past_prop_value_to_propset: dict
):
    """Diff the current property-value map against the indexed one.

    Both arguments map a property value (str) to the frozenset of
    (property_key, property_value) pairs that value appears in.

    :return: tuple of four lists:
        - to_add: (value, propset) pairs that are new,
        - to_update: (value, propset) pairs whose propset changed,
        - to_remove: propsets whose value disappeared entirely,
        - to_update_remove: the stale propsets replaced by to_update.
    """
    to_add = []
    to_update = []
    to_update_remove = []
    # Values that vanished from the graph: drop their old propsets.
    to_remove = [
        propset
        for value, propset in past_prop_value_to_propset.items()
        if value not in present_prop_value_to_propset
    ]
    for value, current_set in present_prop_value_to_propset.items():
        previous_set = past_prop_value_to_propset.get(value)
        if previous_set is None:
            to_add.append((value, current_set))
        elif current_set != previous_set:
            # Changed value: re-embed the new set, discard the old one.
            to_update.append((value, current_set))
            to_update_remove.append((value, previous_set))
    return to_add, to_update, to_remove, to_update_remove

def run(self, context: Dict[str, Any]) -> Dict[str, Any]: # pylint: disable=too-many-statements, too-many-branches
vertexlabels = self.sm.schema.getSchema()["vertexlabels"]
all_pk_flag = all(data.get('id_strategy') == 'PRIMARY_KEY' for data in vertexlabels)

Expand All @@ -72,6 +110,8 @@ def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
present_vids = context["vertices"] # Warning: data truncated by fetch_graph_data.py
removed_vids = set(past_vids) - set(present_vids)
removed_num = self.vid_index.remove(removed_vids)
if removed_num:
self.vid_index.to_index_file(self.index_dir)
added_vids = list(set(present_vids) - set(past_vids))

if added_vids:
Expand All @@ -86,4 +126,88 @@ def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
"removed_vid_vector_num": removed_num,
"added_vid_vector_num": len(added_vids)
})

if context["index_labels"]:
# 1. get present prop_value_to_propset
results = []
for item in context["index_labels"]:
label = item["base_value"]
fields = item["fields"]
fields_str_list = [f"'{field}'" for field in fields]
fields_for_query = ", ".join(fields_str_list)
gremlin_query = INDEX_PROPERTY_GREMLIN.format(
label=label,
fields=fields_for_query
)
log.debug("gremlin_query: %s", gremlin_query)
result = self.client.gremlin().exec(gremlin=gremlin_query)["data"]
results.extend(result)
orig_present_prop_value_to_propset = defaultdict(set)
for item in results:
properties = item["properties"]
for prop_key, values in properties.items():
if not values:
continue
prop_value = str(values[0])
orig_present_prop_value_to_propset[prop_value].add((prop_key, prop_value))
present_prop_value_to_propset = {
k: frozenset(v)
for k, v in orig_present_prop_value_to_propset.items()
}
log.debug("present_prop_value_to_propset: %s", present_prop_value_to_propset)
# 2. get past prop_value_to_propset
orig_past_prop_value_to_propset = defaultdict(set)
for propset in self.prop_index.properties:
for prop_key, prop_value in propset:
orig_past_prop_value_to_propset[prop_value].update(propset)
past_prop_value_to_propset = {
k: frozenset(v)
for k, v in orig_past_prop_value_to_propset.items()
}
# 3. to add(add pk), to update(change pv), to_remove(delete pk), to_update_remove(change pv)
to_add, to_update, to_remove, to_update_remove = self.diff_property_sets(
present_prop_value_to_propset,
past_prop_value_to_propset
)
log.debug("to_add: %s", to_add)
log.debug("to_update: %s", to_update)
log.debug("to_remove: %s", to_remove)
log.debug("to_update_remove: %s", to_update_remove)
log.debug("prop properties.pkl: %s", self.prop_index.properties)
# 4. remove
log.info("Removing %s outdated property value", len(to_remove))
removed_props_num = self.prop_index.remove(to_remove)
if removed_props_num:
self.prop_index.to_index_file(self.index_dir_prop)
# 5. embedding
all_to_add = to_add + to_update
add_propsets = []
add_prop_values = []
for prop_value, propset in all_to_add:
add_propsets.append(propset)
add_prop_values.append(prop_value)
if add_prop_values:
if len(add_prop_values) > 100000:
log.warning("The number of props > 100000, please select which properties to vectorize.")
context.update({
"removed_props_num": removed_props_num,
"added_props_vector_num": "0 (because of exceeding limit)"
})
return context
if to_update_remove:
update_remove_prop_values = [prop_set for _, prop_set in to_update_remove]
removed_num = self.prop_index.remove(update_remove_prop_values)
self.prop_index.to_index_file(self.index_dir_prop)
log.info("In to_update: Removed %s outdated property set", removed_num)
added_props_embeddings = asyncio.run(self._get_embeddings_parallel(add_prop_values))
Copy link

Copilot AI May 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using asyncio.run in a synchronous method for embeddings retrieval may block the processing thread under high load. Consider refactoring to an asynchronous API endpoint or using background tasks to better handle concurrency.

Suggested change
added_props_embeddings = asyncio.run(self._get_embeddings_parallel(add_prop_values))
added_props_embeddings = await self._get_embeddings_parallel(add_prop_values)

Copilot uses AI. Check for mistakes.
self.prop_index.add(added_props_embeddings, add_propsets)
log.info("Added %s new or updated property embeddings", len(added_props_embeddings))
self.prop_index.to_index_file(self.index_dir_prop)
else:
log.debug("No update props to build vector index.")
context.update({
"removed_props_num": removed_props_num,
"added_props_vector_num": len(to_add)
})

return context
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@ def __init__(
vector_dis_threshold: float = huge_settings.vector_dis_threshold,
):
self.index_dir = str(os.path.join(resource_path, huge_settings.graph_name, "graph_vids"))
self.index_dir_prop = str(os.path.join(resource_path, huge_settings.graph_name, "graph_props"))
self.vector_index = VectorIndex.from_index_file(self.index_dir)
self.prop_index = VectorIndex.from_index_file(self.index_dir_prop)
self.embedding = embedding
self.by = by
self.topk_per_query = topk_per_query
Expand All @@ -51,6 +53,7 @@ def __init__(
pwd=huge_settings.graph_pwd,
graphspace=huge_settings.graph_space,
)
self.schema = self._client.schema()

def _exact_match_vids(self, keywords: List[str]) -> Tuple[List[str], List[str]]:
assert keywords, "keywords can't be empty, please check the logic"
Expand Down Expand Up @@ -82,6 +85,33 @@ def _fuzzy_match_vids(self, keywords: List[str]) -> List[str]:
fuzzy_match_result.extend(results[:self.topk_per_keyword])
return fuzzy_match_result

def _exact_match_properties(self, keywords: List[str]) -> Tuple[List[Tuple[Any, str]], List[str]]:
    """Find keywords that exactly match an indexed property value.

    For every schema property key, probe the graph with a `has(key, keyword)`
    Gremlin query; a keyword is "matched" as soon as any key accepts it.

    :param keywords: candidate keywords extracted from the user query.
    :return: (matched, unmatched) where matched is a list of
        (property_key, keyword) pairs and unmatched the remaining keywords.
        The original annotation claimed List[str] for the first element,
        but the code stores (key, keyword) tuples.
    """
    property_keys = self.schema.getPropertyKeys()
    log.debug("property_keys: %s", property_keys)
    matched_properties = set()
    unmatched_keywords = set(keywords)

    for key in property_keys:
        if not unmatched_keywords:
            # Every keyword already matched — skip the remaining queries.
            break
        for keyword in list(unmatched_keywords):
            # NOTE(review): keyword is interpolated into the Gremlin string
            # unescaped; a keyword containing a quote breaks/injects the
            # query — consider escaping or parameter binding.
            gremlin_query = f"g.V().has('{key.name}', '{keyword}').limit(1)"
            log.debug("prop Gremlin query: %s", gremlin_query)
            resp = self._client.gremlin().exec(gremlin_query)
            if resp.get("data"):
                matched_properties.add((key, keyword))
                unmatched_keywords.remove(keyword)

    return list(matched_properties), list(unmatched_keywords)

def _fuzzy_match_props(self, keywords: List[str]) -> List[str]:
    """Semantically match keywords against the property-value vector index.

    Each keyword is embedded and searched in the property index; up to
    ``topk_per_keyword`` hits per keyword (within the distance threshold)
    are collected.
    """
    matches = []
    for keyword in keywords:
        query_vector = self.embedding.get_text_embedding(keyword)
        hits = self.prop_index.search(query_vector, top_k=self.topk_per_keyword,
                                      dis_threshold=float(self.vector_dis_threshold))
        # search() already caps at top_k; the slice is a harmless safeguard.
        matches.extend(hits[:self.topk_per_keyword])
    return matches

def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
graph_query_list = set()
if self.by == "query":
Expand All @@ -101,5 +131,16 @@ def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
fuzzy_match_vids = self._fuzzy_match_vids(unmatched_vids)
log.debug("Fuzzy match vids: %s", fuzzy_match_vids)
graph_query_list.update(fuzzy_match_vids)
index_labels = self.schema.getIndexLabels()
if index_labels:
props_list = set()
exact_match_props, unmatched_props = self._exact_match_properties(keywords)
log.debug("Exact match props: %s", exact_match_props)
props_list.update(exact_match_props)
fuzzy_match_props = self._fuzzy_match_props(unmatched_props)
log.debug("Fuzzy match props: %s", fuzzy_match_props)
props_list.update(fuzzy_match_props)
context["match_props"] = list(props_list)
log.debug("Match props: %s", context["match_props"])
context["match_vids"] = list(graph_query_list)
return context
Loading
Loading