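fix: embedding search bug in the nebular backend (L2-normalize embeddings, switch the vector metric from L2 to inner product)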

CaralHsi · CaralHsi · commit ce4610477fc6 · 2025-07-26T16:09:47.000+08:00
diff --git a/examples/basic_modules/nebular_example.py b/examples/basic_modules/nebular_example.py
@@ -22,9 +22,8 @@ def show(nebular_data):
     from memos.graph_dbs.neo4j import Neo4jGraphDB
 
     tree_config = Neo4jGraphDBConfig.from_json_file("../../examples/data/config/neo4j_config.json")
-    tree_config.use_multi_db = False
+    tree_config.use_multi_db = True
     tree_config.db_name = "nebular-show"
-    tree_config.user_name = "nebular-show"
 
     neo4j_db = Neo4jGraphDB(tree_config)
     neo4j_db.clear()
@@ -108,7 +107,7 @@ def example_shared_db(db_name: str = "shared-traval-group"):
     Multiple users' data in the same Neo4j DB with user_name as a tag.
     """
     # users
-    user_list = ["root"]
+    user_list = ["travel_member_alice", "travel_member_bob"]
 
     for user_name in user_list:
         # Step 1: Build factory config
@@ -198,15 +197,19 @@ def example_shared_db(db_name: str = "shared-traval-group"):
     all_graph_data = graph.export_graph()
     print(str(all_graph_data)[:1000])
 
+    all_nodes = graph.export_graph()
+    show(all_nodes)
+
     # Step 6: Search for alice's data only
     print("\n=== Search for travel_member_alice ===")
     config_alice = GraphDBConfigFactory(
         backend="nebular",
         config={
-            "hosts": json.loads(os.getenv("NEBULAR_HOSTS", "localhost")),
-            "user_name": os.getenv("NEBULAR_USER", "root"),
+            "uri": json.loads(os.getenv("NEBULAR_HOSTS", "localhost")),
+            "user": os.getenv("NEBULAR_USER", "root"),
             "password": os.getenv("NEBULAR_PASSWORD", "xxxxxx"),
             "space": db_name,
+            "user_name": user_list[0],
             "auto_create": True,
             "embedding_dimension": 3072,
             "use_multi_db": False,
@@ -339,7 +342,7 @@ def run_user_session(
     graph.update_node(
         concept_items[0].id, {"confidence": 99.0, "created_at": "2025-07-24T20:11:56.375687"}
     )
-    graph.remove_oldest_memory("LongTermMemory", keep_latest=3)
+    graph.remove_oldest_memory("WorkingMemory", keep_latest=1)
     graph.delete_edge(topic.id, concept_items[0].id, type="PARENT")
     graph.delete_node(concept_items[1].id)
 
diff --git a/src/memos/graph_dbs/nebular.py b/src/memos/graph_dbs/nebular.py
@@ -1,6 +1,8 @@
 from datetime import datetime
 from typing import Any, Literal
 
+import numpy as np
+
 from nebulagraph_python.py_data_types import NVector
 from nebulagraph_python.value_wrapper import ValueWrapper
 
@@ -13,6 +15,12 @@
 logger = get_logger(__name__)
 
 
+def _normalize(vec: list[float]) -> list[float]:  # scale vec to unit L2 length; returns a plain list
+    v = np.asarray(vec, dtype=np.float32)  # float32 array view/copy of the input values
+    norm = np.linalg.norm(v)  # default norm of a 1-D array is the Euclidean (L2) norm
+    return (v / (norm if norm else 1.0)).tolist()  # zero norm: divide by 1.0 to avoid division by zero
+
+
 def _compose_node(item: dict[str, Any]) -> tuple[str, str, dict[str, Any]]:
     node_id = item["id"]
     memory = item["memory"]
@@ -36,7 +44,7 @@ def _prepare_node_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
     # Normalize embedding type
     embedding = metadata.get("embedding")
     if embedding and isinstance(embedding, list):
-        metadata["embedding"] = [float(x) for x in embedding]
+        metadata["embedding"] = _normalize([float(x) for x in embedding])
 
     return metadata
 
@@ -175,6 +183,9 @@ def add_node(self, id: str, memory: str, metadata: dict[str, Any]) -> None:
         metadata["id"] = id
         metadata["memory"] = memory
 
+        if "embedding" in metadata and isinstance(metadata["embedding"], list):
+            metadata["embedding"] = _normalize(metadata["embedding"])
+
         properties = ", ".join(f"{k}: {_format_value(v, k)}" for k, v in metadata.items())
         gql = f"INSERT OR IGNORE (n@Memory {{{properties}}})"
 
@@ -616,6 +627,7 @@ def search_by_embedding(
             - Typical use case: restrict to 'status = activated' to avoid
             matching archived or merged nodes.
         """
+        vector = _normalize(vector)
         dim = len(vector)
         vector_str = ",".join(f"{float(x)}" for x in vector)
         gql_vector = f"VECTOR<{dim}, FLOAT>([{vector_str}])"
@@ -634,11 +646,11 @@ def search_by_embedding(
                USE memory_graph
                MATCH (n@Memory)
                {where_clause}
-               ORDER BY euclidean(n.embedding, {gql_vector}) ASC
+               ORDER BY inner_product(n.embedding, {gql_vector}) DESC
                APPROXIMATE
                LIMIT {top_k}
-               OPTIONS {{ METRIC: L2, TYPE: IVF, NPROBE: 8 }}
-               RETURN n.id AS id, euclidean(n.embedding, {gql_vector}) AS score
+               OPTIONS {{ METRIC: IP, TYPE: IVF, NPROBE: 8 }}
+               RETURN n.id AS id, inner_product(n.embedding, {gql_vector}) AS score
            """
 
         try:
@@ -653,6 +665,7 @@ def search_by_embedding(
                 values = row.values()
                 id_val = values[0].as_string()
                 score_val = values[1].as_double()
+                score_val = (score_val + 1) / 2  # align to neo4j, Normalized Cosine Score
                 if threshold is None or score_val <= threshold:
                     output.append({"id": id_val, "score": score_val})
             return output
@@ -1076,7 +1089,7 @@ def _create_vector_index(
                         ON NODE Memory::{vector_property}
                         OPTIONS {{
                             DIM: {dimensions},
-                            METRIC: L2,
+                            METRIC: IP,
                             TYPE: IVF,
                             NLIST: 100,
                             TRAINSIZE: 1000