feat: optimize semantic similarity calculation with cached base words

zcf0508 · zcf0508 · commit 7ad70ea7b969 · 2025-12-28T21:40:42.000+08:00
diff --git a/packages/core/src/suggest/community.ts b/packages/core/src/suggest/community.ts
@@ -115,8 +115,57 @@ interface BuildWeightedGraphOptions {
   similarityThreshold?: number
 }
 
+/**
+ * Scores how similar two labels are, from 0 to 1, using base-word lists supplied by the caller instead of extracting them here.
+ * Returns 1 for identical labels, a length ratio when one lowercased label contains the other, otherwise the Jaccard index of the two word sets.
+ */
+function calculateSemanticSimilarityCached(
+  labelA: string, // first label; compared exactly, then case-insensitively for containment
+  labelB: string, // second label, treated symmetrically with labelA
+  wordsA: string[], // base words for labelA, supplied by the caller; duplicates are collapsed
+  wordsB: string[], // base words for labelB, supplied by the caller; duplicates are collapsed
+): number {
+  if (labelA === labelB) {
+    return 1;
+  }
+
+  const lowerA = labelA.toLowerCase();
+  const lowerB = labelB.toLowerCase();
+
+  if (lowerA.includes(lowerB) || lowerB.includes(lowerA)) { // one lowercased label is a substring of the other
+    const shorter = lowerA.length < lowerB.length ? lowerA : lowerB;
+    const longer = lowerA.length < lowerB.length ? lowerB : lowerA;
+    return shorter.length / longer.length; // 1 when the labels differ only by case, 0 when exactly one label is empty
+  }
+
+  if (wordsA.length === 0 || wordsB.length === 0) {
+    return 0; // with no words on one side there is nothing to overlap
+  }
+
+  const setA = new Set(wordsA);
+  const setB = new Set(wordsB);
+
+  let sharedCount = 0; // size of the intersection of the two word sets
+  for (const word of setA) {
+    if (setB.has(word)) {
+      sharedCount++;
+    }
+  }
+
+  if (sharedCount === 0) {
+    return 0;
+  }
+
+  return sharedCount / (setA.size + setB.size - sharedCount); // intersection size / union size
+}
+
 /**
  * Build a weighted graph that combines structural connections with semantic similarity.
+ *
+ * Optimized algorithm:
+ * 1. Cache extractBaseWords results to avoid repeated computation
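+ * 2. Build word-to-nodes bucket map, only compare nodes within same bucket, so pairs that share no base word are never scored (even when one label contains the other)
+ *    This reduces O(N²) to roughly O(B × K²) comparisons, where B = number of buckets and K = avg nodes per bucket; a bucket that holds every node is still O(N²)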
  */
 function buildWeightedGraph(
   graph: Map<TypedNode, Set<{ node: TypedNode, type: RelationType }>>,
@@ -155,30 +204,65 @@ function buildWeightedGraph(
   }
 
   if (semanticWeight > 0) {
-    const nodeArray = Array.from(allNodes);
-    for (let i = 0; i < nodeArray.length; i++) {
-      for (let j = i + 1; j < nodeArray.length; j++) {
-        const nodeA = nodeArray[i];
-        const nodeB = nodeArray[j];
-
-        const pairKey = [nodeA.label, nodeB.label].sort().join('|');
-        const isConnected = connectedPairs.has(pairKey);
-
-        const similarity = calculateSemanticSimilarity(nodeA.label, nodeB.label);
-        if (similarity > similarityThreshold) {
-          const semanticEdgeWeight = similarity * semanticWeight;
-
-          const currentAB = weighted.get(nodeA)!.get(nodeB) || 0;
-          const newWeightAB = isConnected
-            ? Math.max(currentAB, semanticEdgeWeight)
-            : currentAB + semanticEdgeWeight;
-          weighted.get(nodeA)!.set(nodeB, Math.min(newWeightAB, 2.0));
-
-          const currentBA = weighted.get(nodeB)!.get(nodeA) || 0;
-          const newWeightBA = isConnected
-            ? Math.max(currentBA, semanticEdgeWeight)
-            : currentBA + semanticEdgeWeight;
-          weighted.get(nodeB)!.set(nodeA, Math.min(newWeightBA, 2.0));
+    const nodeWordsCache = new Map<TypedNode, string[]>();
+    const wordToBucket = new Map<string, Set<TypedNode>>();
+
+    for (const node of allNodes) {
+      const words = extractBaseWords(node.label);
+      nodeWordsCache.set(node, words);
+
+      for (const word of words) {
+        if (!wordToBucket.has(word)) {
+          wordToBucket.set(word, new Set());
+        }
+        wordToBucket.get(word)!.add(node);
+      }
+    }
+
+    const comparedPairs = new Set<string>();
+
+    for (const [_, bucket] of wordToBucket) {
+      if (bucket.size < 2) {
+        continue;
+      }
+
+      const bucketNodes = Array.from(bucket);
+      for (let i = 0; i < bucketNodes.length; i++) {
+        for (let j = i + 1; j < bucketNodes.length; j++) {
+          const nodeA = bucketNodes[i];
+          const nodeB = bucketNodes[j];
+
+          const comparedKey = [nodeA.label, nodeB.label].sort().join('|');
+          if (comparedPairs.has(comparedKey)) {
+            continue;
+          }
+          comparedPairs.add(comparedKey);
+
+          const wordsA = nodeWordsCache.get(nodeA)!;
+          const wordsB = nodeWordsCache.get(nodeB)!;
+          const similarity = calculateSemanticSimilarityCached(
+            nodeA.label,
+            nodeB.label,
+            wordsA,
+            wordsB,
+          );
+
+          if (similarity > similarityThreshold) {
+            const isConnected = connectedPairs.has(comparedKey);
+            const semanticEdgeWeight = similarity * semanticWeight;
+
+            const currentAB = weighted.get(nodeA)!.get(nodeB) || 0;
+            const newWeightAB = isConnected
+              ? Math.max(currentAB, semanticEdgeWeight)
+              : currentAB + semanticEdgeWeight;
+            weighted.get(nodeA)!.set(nodeB, Math.min(newWeightAB, 2.0));
+
+            const currentBA = weighted.get(nodeB)!.get(nodeA) || 0;
+            const newWeightBA = isConnected
+              ? Math.max(currentBA, semanticEdgeWeight)
+              : currentBA + semanticEdgeWeight;
+            weighted.get(nodeB)!.set(nodeA, Math.min(newWeightBA, 2.0));
+          }
         }
       }
     }