11use std:: cmp:: Ordering ;
2- use std:: collections:: HashSet ;
2+ use std:: collections:: { HashMap , HashSet } ;
33use std:: path:: PathBuf ;
44
55use serde:: { Deserialize , Serialize } ;
@@ -82,7 +82,7 @@ pub struct ArticleSearchResult {
8282 #[ serde( skip_serializing_if = "Option::is_none" ) ]
8383 pub score : Option < f64 > ,
8484 #[ serde( default ) ]
85- pub is_retracted : bool ,
85+ pub is_retracted : Option < bool > ,
8686}
8787
8888#[ derive( Debug , Clone , Copy , PartialEq , Eq , Serialize , Deserialize ) ]
@@ -540,7 +540,7 @@ fn matches_result_filters(
540540 if filters. no_preprints && row. journal . as_deref ( ) . is_some_and ( is_preprint_journal) {
541541 return false ;
542542 }
543- if filters. exclude_retracted && row. is_retracted {
543+ if filters. exclude_retracted && row. is_retracted . unwrap_or ( true ) {
544544 return false ;
545545 }
546546 if !matches_optional_journal_filter ( row. journal . as_deref ( ) , filters. journal . as_deref ( ) ) {
@@ -553,10 +553,15 @@ fn matches_result_filters(
553553}
554554
555555fn dedup_by_pmid_preserve_order ( results : Vec < ArticleSearchResult > ) -> Vec < ArticleSearchResult > {
556- let mut deduped = Vec :: with_capacity ( results. len ( ) ) ;
557- let mut seen = HashSet :: with_capacity ( results. len ( ) ) ;
556+ let mut deduped: Vec < ArticleSearchResult > = Vec :: with_capacity ( results. len ( ) ) ;
557+ let mut seen: HashMap < String , usize > = HashMap :: with_capacity ( results. len ( ) ) ;
558558 for row in results {
559- if seen. insert ( row. pmid . clone ( ) ) {
559+ if let Some ( existing_idx) = seen. get ( & row. pmid ) . copied ( ) {
560+ if deduped[ existing_idx] . is_retracted . is_none ( ) && row. is_retracted . is_some ( ) {
561+ deduped[ existing_idx] . is_retracted = row. is_retracted ;
562+ }
563+ } else {
564+ seen. insert ( row. pmid . clone ( ) , deduped. len ( ) ) ;
560565 deduped. push ( row) ;
561566 }
562567 }
@@ -944,7 +949,7 @@ async fn search_europepmc_page(
944949 // try adding one matched retracted publication if available.
945950 if !filters. exclude_retracted
946951 && filters. sort == ArticleSort :: Date
947- && !out. iter ( ) . any ( |row| row. is_retracted )
952+ && !out. iter ( ) . any ( |row| row. is_retracted == Some ( true ) )
948953 {
949954 let retracted_query = format ! ( "({query}) AND PUB_TYPE:\" retracted publication\" " ) ;
950955 if let Ok ( resp) = europe
@@ -958,7 +963,7 @@ async fn search_europepmc_page(
958963 . into_iter ( )
959964 . filter_map ( |hit| transform:: article:: from_europepmc_search_result ( & hit) )
960965 . find ( |row| {
961- row. is_retracted
966+ row. is_retracted == Some ( true )
962967 && !seen_pmids. contains ( & row. pmid )
963968 && matches_result_filters (
964969 row,
@@ -1553,15 +1558,15 @@ mod tests {
15531558 }
15541559
15551560 fn row ( pmid : & str , source : ArticleSource ) -> ArticleSearchResult {
1556- row_with ( pmid, source, Some ( "2025-01-01" ) , Some ( 1 ) , false )
1561+ row_with ( pmid, source, Some ( "2025-01-01" ) , Some ( 1 ) , Some ( false ) )
15571562 }
15581563
15591564 fn row_with (
15601565 pmid : & str ,
15611566 source : ArticleSource ,
15621567 date : Option < & str > ,
15631568 citation_count : Option < u64 > ,
1564- is_retracted : bool ,
1569+ is_retracted : Option < bool > ,
15651570 ) -> ArticleSearchResult {
15661571 ArticleSearchResult {
15671572 pmid : pmid. to_string ( ) ,
@@ -1686,21 +1691,21 @@ mod tests {
16861691 ArticleSource :: EuropePmc ,
16871692 Some ( "2024-01-01" ) ,
16881693 Some ( 1 ) ,
1689- false ,
1694+ Some ( false ) ,
16901695 ) ,
16911696 row_with(
16921697 "200" ,
16931698 ArticleSource :: EuropePmc ,
16941699 Some ( "2025-01-01" ) ,
16951700 Some ( 1 ) ,
1696- false ,
1701+ Some ( false ) ,
16971702 ) ,
16981703 row_with(
16991704 "300" ,
17001705 ArticleSource :: EuropePmc ,
17011706 Some ( "2023-01-01" ) ,
17021707 Some ( 1 ) ,
1703- false ,
1708+ Some ( false ) ,
17041709 ) ,
17051710 ] ,
17061711 Some ( 3 ) ,
@@ -1778,14 +1783,14 @@ mod tests {
17781783 ArticleSource :: PubTator ,
17791784 Some ( "2025-02-01" ) ,
17801785 Some ( 50 ) ,
1781- false ,
1786+ Some ( false ) ,
17821787 ) ,
17831788 row_with(
17841789 "200" ,
17851790 ArticleSource :: PubTator ,
17861791 Some ( "2024-01-01" ) ,
17871792 Some ( 5 ) ,
1788- false ,
1793+ Some ( false ) ,
17891794 ) ,
17901795 ] ,
17911796 Some ( 2 ) ,
@@ -1797,14 +1802,14 @@ mod tests {
17971802 ArticleSource :: EuropePmc ,
17981803 Some ( "2025-03-01" ) ,
17991804 Some ( 100 ) ,
1800- false ,
1805+ Some ( false ) ,
18011806 ) ,
18021807 row_with(
18031808 "400" ,
18041809 ArticleSource :: EuropePmc ,
18051810 Some ( "2024-06-01" ) ,
18061811 Some ( 10 ) ,
1807- false ,
1812+ Some ( false ) ,
18081813 ) ,
18091814 ] ,
18101815 Some ( 2 ) ,
@@ -1832,14 +1837,14 @@ mod tests {
18321837 ArticleSource :: PubTator ,
18331838 Some ( "2025" ) ,
18341839 Some ( 25 ) ,
1835- false ,
1840+ Some ( false ) ,
18361841 ) ,
18371842 row_with(
18381843 "600" ,
18391844 ArticleSource :: PubTator ,
18401845 Some ( "2024-12-31" ) ,
18411846 Some ( 30 ) ,
1842- false ,
1847+ Some ( false ) ,
18431848 ) ,
18441849 ] ,
18451850 Some ( 2 ) ,
@@ -1851,9 +1856,9 @@ mod tests {
18511856 ArticleSource :: EuropePmc ,
18521857 Some ( "2025-06-01" ) ,
18531858 Some ( 10 ) ,
1854- false ,
1859+ Some ( false ) ,
18551860 ) ,
1856- row_with( "800" , ArticleSource :: EuropePmc , None , Some ( 99 ) , false ) ,
1861+ row_with( "800" , ArticleSource :: EuropePmc , None , Some ( 99 ) , Some ( false ) ) ,
18571862 ] ,
18581863 Some ( 2 ) ,
18591864 ) ;
@@ -1907,7 +1912,7 @@ mod tests {
19071912 ArticleSource :: PubTator ,
19081913 Some ( "2025-01-01" ) ,
19091914 Some ( 1 ) ,
1910- true ,
1915+ Some ( true ) ,
19111916 ) ;
19121917 let exclude_filters = ArticleSearchFilters {
19131918 exclude_retracted : true ,
@@ -1921,4 +1926,78 @@ mod tests {
19211926 assert ! ( !matches_result_filters( & row, & exclude_filters, None , None ) ) ;
19221927 assert ! ( matches_result_filters( & row, & include_filters, None , None ) ) ;
19231928 }
1929+
/// A row whose retraction status is unknown (`None`) must be rejected by
/// `matches_result_filters` when `exclude_retracted` is set, and accepted
/// when it is cleared.
#[test]
fn exclude_retracted_excludes_unknown_retraction_status_by_default() {
    let unknown_status = row_with(
        "100",
        ArticleSource::PubTator,
        Some("2025-01-01"),
        Some(1),
        None,
    );

    // Build both filter sets up front; they differ only in `exclude_retracted`.
    let [excluding, including] = [true, false].map(|exclude_retracted| ArticleSearchFilters {
        exclude_retracted,
        ..empty_filters()
    });

    assert!(!matches_result_filters(&unknown_status, &excluding, None, None));
    assert!(matches_result_filters(&unknown_status, &including, None, None));
}
1951+
/// When both sources return the same PMID, the merged row keeps the PubTator
/// entry but takes the known retraction status from the Europe PMC duplicate
/// if PubTator's own status is unknown.
#[test]
fn merge_federated_pages_preserves_known_retraction_status_from_later_duplicate() {
    const SHARED_PMID: &str = "200";
    const DATE: &str = "2025-01-01";

    // Each page holds exactly one row for the shared PMID and reports a total of 1.
    let single_row_page =
        |source: ArticleSource, citations: u64, is_retracted: Option<bool>| {
            SearchPage::offset(
                vec![row_with(
                    SHARED_PMID,
                    source,
                    Some(DATE),
                    Some(citations),
                    is_retracted,
                )],
                Some(1),
            )
        };

    let from_pubtator = single_row_page(ArticleSource::PubTator, 1, None);
    let from_europe = single_row_page(ArticleSource::EuropePmc, 10, Some(true));

    let merged = merge_federated_pages(
        Ok(from_pubtator),
        Ok(from_europe),
        10,
        0,
        ArticleSort::Relevance,
    )
    .expect("federated merge should succeed");

    assert_eq!(merged.results.len(), 1);
    let survivor = &merged.results[0];
    assert_eq!(survivor.source, ArticleSource::PubTator);
    assert_eq!(survivor.is_retracted, Some(true));
}
1988+
/// An unknown retraction status must be emitted as an explicit JSON `null`
/// under the `is_retracted` key rather than being omitted from the output.
#[test]
fn article_search_result_serializes_unknown_retraction_as_null() {
    let unknown_status = row_with(
        "100",
        ArticleSource::PubTator,
        Some("2025-01-01"),
        Some(1),
        None,
    );
    let json = serde_json::to_value(&unknown_status).expect("search row should serialize");

    // `get` distinguishes a missing key (`None`) from a present null value,
    // which plain indexing would conflate.
    let field = json.get("is_retracted");
    assert!(field.is_some());
    assert!(field.is_some_and(|status| status.is_null()));
}
19242003}
0 commit comments