@@ -115,8 +115,57 @@ interface BuildWeightedGraphOptions {
115115 similarityThreshold ?: number
116116}
117117
/**
 * Score how similar two labels are, using word lists the caller has already
 * extracted for each label instead of re-deriving them here.
 *
 * Scoring rules, checked in order:
 *  - identical labels (exact, case-sensitive match) score 1;
 *  - if one lower-cased label contains the other, the score is the ratio of
 *    the shorter length to the longer length;
 *  - if either word list is empty, or the lists share no word, the score is 0;
 *  - otherwise the score is the Jaccard index of the two de-duplicated word
 *    lists: |A ∩ B| / |A ∪ B|.
 *
 * @param labelA first label
 * @param labelB second label
 * @param wordsA pre-computed words for labelA
 * @param wordsB pre-computed words for labelB
 * @returns a similarity score between 0 and 1
 */
function calculateSemanticSimilarityCached(
  labelA: string,
  labelB: string,
  wordsA: string[],
  wordsB: string[],
): number {
  // Exact match short-circuits everything else.
  if (labelA === labelB) return 1;

  const normalizedA = labelA.toLowerCase();
  const normalizedB = labelB.toLowerCase();

  // Case-insensitive containment: score by how much of the longer label
  // the shorter one covers.
  const oneContainsOther =
    normalizedA.includes(normalizedB) || normalizedB.includes(normalizedA);
  if (oneContainsOther) {
    const minLen = Math.min(normalizedA.length, normalizedB.length);
    const maxLen = Math.max(normalizedA.length, normalizedB.length);
    return minLen / maxLen;
  }

  // Without words on both sides there is nothing to overlap.
  if (!(wordsA.length > 0 && wordsB.length > 0)) return 0;

  const uniqueA = new Set(wordsA);
  const uniqueB = new Set(wordsB);

  // Size of the intersection of the two de-duplicated word sets.
  const overlap = Array.from(uniqueA).filter((w) => uniqueB.has(w)).length;
  if (overlap === 0) return 0;

  // Jaccard index: intersection over union.
  const unionSize = uniqueA.size + uniqueB.size - overlap;
  return overlap / unionSize;
}
161+
118162/**
119163 * Build a weighted graph that combines structural connections with semantic similarity.
164+ *
165+ * Optimized algorithm:
166+ * 1. Cache extractBaseWords results to avoid repeated computation
167+ * 2. Build word-to-nodes bucket map, only compare nodes within same bucket
168+ * This reduces O(N²) to O(B × K²) where B = number of buckets, K = avg nodes per bucket
120169 */
121170function buildWeightedGraph (
122171 graph : Map < TypedNode , Set < { node : TypedNode , type : RelationType } > > ,
@@ -155,30 +204,65 @@ function buildWeightedGraph(
155204 }
156205
157206 if ( semanticWeight > 0 ) {
158- const nodeArray = Array . from ( allNodes ) ;
159- for ( let i = 0 ; i < nodeArray . length ; i ++ ) {
160- for ( let j = i + 1 ; j < nodeArray . length ; j ++ ) {
161- const nodeA = nodeArray [ i ] ;
162- const nodeB = nodeArray [ j ] ;
163-
164- const pairKey = [ nodeA . label , nodeB . label ] . sort ( ) . join ( '|' ) ;
165- const isConnected = connectedPairs . has ( pairKey ) ;
166-
167- const similarity = calculateSemanticSimilarity ( nodeA . label , nodeB . label ) ;
168- if ( similarity > similarityThreshold ) {
169- const semanticEdgeWeight = similarity * semanticWeight ;
170-
171- const currentAB = weighted . get ( nodeA ) ! . get ( nodeB ) || 0 ;
172- const newWeightAB = isConnected
173- ? Math . max ( currentAB , semanticEdgeWeight )
174- : currentAB + semanticEdgeWeight ;
175- weighted . get ( nodeA ) ! . set ( nodeB , Math . min ( newWeightAB , 2.0 ) ) ;
176-
177- const currentBA = weighted . get ( nodeB ) ! . get ( nodeA ) || 0 ;
178- const newWeightBA = isConnected
179- ? Math . max ( currentBA , semanticEdgeWeight )
180- : currentBA + semanticEdgeWeight ;
181- weighted . get ( nodeB ) ! . set ( nodeA , Math . min ( newWeightBA , 2.0 ) ) ;
207+ const nodeWordsCache = new Map < TypedNode , string [ ] > ( ) ;
208+ const wordToBucket = new Map < string , Set < TypedNode > > ( ) ;
209+
210+ for ( const node of allNodes ) {
211+ const words = extractBaseWords ( node . label ) ;
212+ nodeWordsCache . set ( node , words ) ;
213+
214+ for ( const word of words ) {
215+ if ( ! wordToBucket . has ( word ) ) {
216+ wordToBucket . set ( word , new Set ( ) ) ;
217+ }
218+ wordToBucket . get ( word ) ! . add ( node ) ;
219+ }
220+ }
221+
222+ const comparedPairs = new Set < string > ( ) ;
223+
224+ for ( const [ _ , bucket ] of wordToBucket ) {
225+ if ( bucket . size < 2 ) {
226+ continue ;
227+ }
228+
229+ const bucketNodes = Array . from ( bucket ) ;
230+ for ( let i = 0 ; i < bucketNodes . length ; i ++ ) {
231+ for ( let j = i + 1 ; j < bucketNodes . length ; j ++ ) {
232+ const nodeA = bucketNodes [ i ] ;
233+ const nodeB = bucketNodes [ j ] ;
234+
235+ const comparedKey = [ nodeA . label , nodeB . label ] . sort ( ) . join ( '|' ) ;
236+ if ( comparedPairs . has ( comparedKey ) ) {
237+ continue ;
238+ }
239+ comparedPairs . add ( comparedKey ) ;
240+
241+ const wordsA = nodeWordsCache . get ( nodeA ) ! ;
242+ const wordsB = nodeWordsCache . get ( nodeB ) ! ;
243+ const similarity = calculateSemanticSimilarityCached (
244+ nodeA . label ,
245+ nodeB . label ,
246+ wordsA ,
247+ wordsB ,
248+ ) ;
249+
250+ if ( similarity > similarityThreshold ) {
251+ const isConnected = connectedPairs . has ( comparedKey ) ;
252+ const semanticEdgeWeight = similarity * semanticWeight ;
253+
254+ const currentAB = weighted . get ( nodeA ) ! . get ( nodeB ) || 0 ;
255+ const newWeightAB = isConnected
256+ ? Math . max ( currentAB , semanticEdgeWeight )
257+ : currentAB + semanticEdgeWeight ;
258+ weighted . get ( nodeA ) ! . set ( nodeB , Math . min ( newWeightAB , 2.0 ) ) ;
259+
260+ const currentBA = weighted . get ( nodeB ) ! . get ( nodeA ) || 0 ;
261+ const newWeightBA = isConnected
262+ ? Math . max ( currentBA , semanticEdgeWeight )
263+ : currentBA + semanticEdgeWeight ;
264+ weighted . get ( nodeB ) ! . set ( nodeA , Math . min ( newWeightBA , 2.0 ) ) ;
265+ }
182266 }
183267 }
184268 }
0 commit comments