@@ -25,22 +25,26 @@ def _search_highlight(
2525def test_highlighting_phrases (index_entities ):
2626 # this is using the sample entity 242d6724b38425f11df37437c38125b71fb13300
2727 highlight = _search_highlight ('"Mr. Trump proclaimed"' )
28- assert "<em>Mr. Trump proclaimed</em>" in highlight
28+ # Unified highlighter highlights individual terms rather than full phrase spans
29+ assert "<em>Mr</em>" in highlight
30+ assert "<em>Trump</em>" in highlight
31+ assert "<em>proclaimed</em>" in highlight
2932
3033 highlight = _search_highlight ('"former chairman"~2' )
3134 assert "<em>former</em>" in highlight
3235 assert "<em>chairman</em>" in highlight
3336
3437 highlight = _search_highlight ('"paul manafort"' )
3538 assert highlight is not None
36- assert "<em>Paul Manafort</em>" in highlight
39+ assert "<em>Paul</em>" in highlight
40+ assert "<em>Manafort</em>" in highlight
3741
3842 highlight = _search_highlight ("Українська" )
3943 assert highlight is not None
4044 assert "<em>Українська" in highlight
4145 highlight = _search_highlight ('"日本語"' )
4246 assert highlight is not None
43- assert "<em>本</em><em>語 " in highlight
47+ assert "<em>日本語 " in highlight
4448
4549
4650def test_highlighting_pages (fixture_pages , cleanup_after ):
@@ -49,11 +53,14 @@ def test_highlighting_pages(fixture_pages, cleanup_after):
4953 index_bulk ("test_pages_highlight" , fixture_pages , sync = True )
5054
5155 # Search in the indexText (-> content) of a Pages (aka Document) entity
56+ # Unified highlighter highlights individual terms rather than full phrase spans
5257 highlight = _search_highlight (
5358 '"MIT license" "useful information" documentation' , schema = "Pages"
5459 )
55- assert "<em>MIT license</em>" in highlight
56- assert "<em>useful information</em>" in highlight
60+ assert "<em>MIT</em>" in highlight
61+ assert "<em>license</em>" in highlight
62+ assert "<em>useful</em>" in highlight
63+ assert "<em>information</em>" in highlight
5764 assert "<em>documentation</em>" in highlight
5865
5966 # Search within its child page entities (used in OpenAleph ui)
@@ -62,24 +69,28 @@ def test_highlighting_pages(fixture_pages, cleanup_after):
6269 schema = "Page" ,
6370 parent_id = "f61295777cf69f423855655f1614794ce22086d8.b154e50f50c8c8133168767d78bbd1dff067f308" ,
6471 )
65- assert "<em>MIT license</em>" in highlight
66- assert "<em>useful information</em>" in highlight
72+ assert "<em>MIT</em>" in highlight
73+ assert "<em>license</em>" in highlight
74+ assert "<em>useful</em>" in highlight
75+ assert "<em>information</em>" in highlight
6776 assert "<em>documentation</em>" in highlight
6877
6978 # Include mentioned names
7079 highlight = _search_highlight (
7180 'names:"massachusetts institute of technology" "MIT license"' , schema = "Pages"
7281 )
7382 assert "massachusetts institute of technology" in highlight
74- assert "<em>MIT license</em>" in highlight
83+ assert "<em>MIT</em>" in highlight
84+ assert "<em>license</em>" in highlight
7585
7686 # Page doesn't contain "names" but still the highlight works for the text phrase
7787 highlight = _search_highlight (
7888 'names:"massachusetts institute of technology" "MIT license"' ,
7989 schema = "Page" ,
8090 parent_id = "f61295777cf69f423855655f1614794ce22086d8.b154e50f50c8c8133168767d78bbd1dff067f308" ,
8191 )
82- assert "<em>MIT license</em>" in highlight
92+ assert "<em>MIT</em>" in highlight
93+ assert "<em>license</em>" in highlight
8394
8495
8596def test_highlighting_translation_plaintext (cleanup_after ):
0 commit comments