11from datetime import datetime
22from typing import Any , Literal
33
4+ import numpy as np
5+
46from nebulagraph_python .py_data_types import NVector
57from nebulagraph_python .value_wrapper import ValueWrapper
68
1315logger = get_logger (__name__ )
1416
1517
18+ def _normalize (vec : list [float ]) -> list [float ]:
19+ v = np .asarray (vec , dtype = np .float32 )
20+ norm = np .linalg .norm (v )
21+ return (v / (norm if norm else 1.0 )).tolist ()
22+
23+
1624def _compose_node (item : dict [str , Any ]) -> tuple [str , str , dict [str , Any ]]:
1725 node_id = item ["id" ]
1826 memory = item ["memory" ]
@@ -36,7 +44,7 @@ def _prepare_node_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
3644 # Normalize embedding type
3745 embedding = metadata .get ("embedding" )
3846 if embedding and isinstance (embedding , list ):
39- metadata ["embedding" ] = [float (x ) for x in embedding ]
47+ metadata ["embedding" ] = _normalize ( [float (x ) for x in embedding ])
4048
4149 return metadata
4250
@@ -175,6 +183,9 @@ def add_node(self, id: str, memory: str, metadata: dict[str, Any]) -> None:
175183 metadata ["id" ] = id
176184 metadata ["memory" ] = memory
177185
186+ if "embedding" in metadata and isinstance (metadata ["embedding" ], list ):
187+ metadata ["embedding" ] = _normalize (metadata ["embedding" ])
188+
178189 properties = ", " .join (f"{ k } : { _format_value (v , k )} " for k , v in metadata .items ())
179190 gql = f"INSERT OR IGNORE (n@Memory {{{ properties } }})"
180191
@@ -616,6 +627,7 @@ def search_by_embedding(
616627 - Typical use case: restrict to 'status = activated' to avoid
617628 matching archived or merged nodes.
618629 """
630+ vector = _normalize (vector )
619631 dim = len (vector )
620632 vector_str = "," .join (f"{ float (x )} " for x in vector )
621633 gql_vector = f"VECTOR<{ dim } , FLOAT>([{ vector_str } ])"
@@ -634,11 +646,11 @@ def search_by_embedding(
634646 USE memory_graph
635647 MATCH (n@Memory)
636648 { where_clause }
637- ORDER BY euclidean (n.embedding, { gql_vector } ) ASC
649+ ORDER BY inner_product (n.embedding, { gql_vector } ) DESC
638650 APPROXIMATE
639651 LIMIT { top_k }
640- OPTIONS {{ METRIC: L2 , TYPE: IVF, NPROBE: 8 }}
641- RETURN n.id AS id, euclidean (n.embedding, { gql_vector } ) AS score
652+ OPTIONS {{ METRIC: IP , TYPE: IVF, NPROBE: 8 }}
653+ RETURN n.id AS id, inner_product (n.embedding, { gql_vector } ) AS score
642654 """
643655
644656 try :
@@ -653,6 +665,7 @@ def search_by_embedding(
653665 values = row .values ()
654666 id_val = values [0 ].as_string ()
655667 score_val = values [1 ].as_double ()
668+ score_val = (score_val + 1 ) / 2 # align to neo4j, Normalized Cosine Score
656669 if threshold is None or score_val <= threshold :
657670 output .append ({"id" : id_val , "score" : score_val })
658671 return output
@@ -1076,7 +1089,7 @@ def _create_vector_index(
10761089 ON NODE Memory::{ vector_property }
10771090 OPTIONS {{
10781091 DIM: { dimensions } ,
1079- METRIC: L2 ,
1092+ METRIC: IP ,
10801093 TYPE: IVF,
10811094 NLIST: 100,
10821095 TRAINSIZE: 1000
0 commit comments