@@ -314,20 +314,38 @@ def sim_volume(v1_str: str, v2_str: str) -> float:
314314 return 0.0
315315
316316
317+ def _norm_abstract (text : str ) -> str :
318+ text = "" if text is None else str (text )
319+
320+ # normalize common patterns
321+ text = re .sub (r"([a-z])\s+(\d)" , r"\1\2" , text )
322+ text = re .sub (r"([a-z])\s+([a-z])" , r"\1\2" , text )
323+
324+ # remove punctuation (keep letters/numbers/spaces)
325+ text = re .sub (r"[^a-z0-9\s]" , " " , text )
326+
327+ # collapse whitespace
328+ text = re .sub (r"\s+" , " " , text ).strip ()
329+ return text
330+
331+
def sim_abstract(abstract_1: str, abstract_2: str) -> float:
    """Score how similar two abstracts are.

    Both inputs are normalized with ``_norm_abstract`` first. If either
    normalized abstract is empty the score is ``0.0``. Otherwise the result
    is the highest of three ``fuzz`` scores, each divided by 100
    (presumably giving a value in ``[0.0, 1.0]`` -- confirm against the
    ``fuzz`` library in use).
    """
    left = _norm_abstract(abstract_1)
    right = _norm_abstract(abstract_2)

    if not (left and right):
        return 0.0

    scorers = (
        fuzz.ratio,            # whole-string similarity
        fuzz.partial_ratio,    # intended to catch a truncated abstract
        fuzz.token_set_ratio,  # intended to tolerate word order / noise
    )
    # Keep whichever scorer reports the strongest match.
    return max(scorer(left, right) / 100.0 for scorer in scorers)
331349
332350
333351def sim_container_title (container_1 : str , container_2 : str ) -> float :
0 commit comments