Skip to content

Commit dde13bb

Browse files
authored
Merge pull request #212 from genomoncology/fix-retraction-filter-leading-not
Fix retraction filter bypass and leading-NOT eligibility parsing
2 parents 66a0575 + c54ed50 commit dde13bb

File tree

4 files changed

+126
-30
lines changed

4 files changed

+126
-30
lines changed

src/entities/article.rs

Lines changed: 101 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use std::cmp::Ordering;
2-
use std::collections::HashSet;
2+
use std::collections::{HashMap, HashSet};
33
use std::path::PathBuf;
44

55
use serde::{Deserialize, Serialize};
@@ -82,7 +82,7 @@ pub struct ArticleSearchResult {
8282
#[serde(skip_serializing_if = "Option::is_none")]
8383
pub score: Option<f64>,
8484
#[serde(default)]
85-
pub is_retracted: bool,
85+
pub is_retracted: Option<bool>,
8686
}
8787

8888
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -540,7 +540,7 @@ fn matches_result_filters(
540540
if filters.no_preprints && row.journal.as_deref().is_some_and(is_preprint_journal) {
541541
return false;
542542
}
543-
if filters.exclude_retracted && row.is_retracted {
543+
if filters.exclude_retracted && row.is_retracted.unwrap_or(true) {
544544
return false;
545545
}
546546
if !matches_optional_journal_filter(row.journal.as_deref(), filters.journal.as_deref()) {
@@ -553,10 +553,15 @@ fn matches_result_filters(
553553
}
554554

555555
fn dedup_by_pmid_preserve_order(results: Vec<ArticleSearchResult>) -> Vec<ArticleSearchResult> {
556-
let mut deduped = Vec::with_capacity(results.len());
557-
let mut seen = HashSet::with_capacity(results.len());
556+
let mut deduped: Vec<ArticleSearchResult> = Vec::with_capacity(results.len());
557+
let mut seen: HashMap<String, usize> = HashMap::with_capacity(results.len());
558558
for row in results {
559-
if seen.insert(row.pmid.clone()) {
559+
if let Some(existing_idx) = seen.get(&row.pmid).copied() {
560+
if deduped[existing_idx].is_retracted.is_none() && row.is_retracted.is_some() {
561+
deduped[existing_idx].is_retracted = row.is_retracted;
562+
}
563+
} else {
564+
seen.insert(row.pmid.clone(), deduped.len());
560565
deduped.push(row);
561566
}
562567
}
@@ -944,7 +949,7 @@ async fn search_europepmc_page(
944949
// try adding one matched retracted publication if available.
945950
if !filters.exclude_retracted
946951
&& filters.sort == ArticleSort::Date
947-
&& !out.iter().any(|row| row.is_retracted)
952+
&& !out.iter().any(|row| row.is_retracted == Some(true))
948953
{
949954
let retracted_query = format!("({query}) AND PUB_TYPE:\"retracted publication\"");
950955
if let Ok(resp) = europe
@@ -958,7 +963,7 @@ async fn search_europepmc_page(
958963
.into_iter()
959964
.filter_map(|hit| transform::article::from_europepmc_search_result(&hit))
960965
.find(|row| {
961-
row.is_retracted
966+
row.is_retracted == Some(true)
962967
&& !seen_pmids.contains(&row.pmid)
963968
&& matches_result_filters(
964969
row,
@@ -1553,15 +1558,15 @@ mod tests {
15531558
}
15541559

15551560
fn row(pmid: &str, source: ArticleSource) -> ArticleSearchResult {
1556-
row_with(pmid, source, Some("2025-01-01"), Some(1), false)
1561+
row_with(pmid, source, Some("2025-01-01"), Some(1), Some(false))
15571562
}
15581563

15591564
fn row_with(
15601565
pmid: &str,
15611566
source: ArticleSource,
15621567
date: Option<&str>,
15631568
citation_count: Option<u64>,
1564-
is_retracted: bool,
1569+
is_retracted: Option<bool>,
15651570
) -> ArticleSearchResult {
15661571
ArticleSearchResult {
15671572
pmid: pmid.to_string(),
@@ -1686,21 +1691,21 @@ mod tests {
16861691
ArticleSource::EuropePmc,
16871692
Some("2024-01-01"),
16881693
Some(1),
1689-
false,
1694+
Some(false),
16901695
),
16911696
row_with(
16921697
"200",
16931698
ArticleSource::EuropePmc,
16941699
Some("2025-01-01"),
16951700
Some(1),
1696-
false,
1701+
Some(false),
16971702
),
16981703
row_with(
16991704
"300",
17001705
ArticleSource::EuropePmc,
17011706
Some("2023-01-01"),
17021707
Some(1),
1703-
false,
1708+
Some(false),
17041709
),
17051710
],
17061711
Some(3),
@@ -1778,14 +1783,14 @@ mod tests {
17781783
ArticleSource::PubTator,
17791784
Some("2025-02-01"),
17801785
Some(50),
1781-
false,
1786+
Some(false),
17821787
),
17831788
row_with(
17841789
"200",
17851790
ArticleSource::PubTator,
17861791
Some("2024-01-01"),
17871792
Some(5),
1788-
false,
1793+
Some(false),
17891794
),
17901795
],
17911796
Some(2),
@@ -1797,14 +1802,14 @@ mod tests {
17971802
ArticleSource::EuropePmc,
17981803
Some("2025-03-01"),
17991804
Some(100),
1800-
false,
1805+
Some(false),
18011806
),
18021807
row_with(
18031808
"400",
18041809
ArticleSource::EuropePmc,
18051810
Some("2024-06-01"),
18061811
Some(10),
1807-
false,
1812+
Some(false),
18081813
),
18091814
],
18101815
Some(2),
@@ -1832,14 +1837,14 @@ mod tests {
18321837
ArticleSource::PubTator,
18331838
Some("2025"),
18341839
Some(25),
1835-
false,
1840+
Some(false),
18361841
),
18371842
row_with(
18381843
"600",
18391844
ArticleSource::PubTator,
18401845
Some("2024-12-31"),
18411846
Some(30),
1842-
false,
1847+
Some(false),
18431848
),
18441849
],
18451850
Some(2),
@@ -1851,9 +1856,9 @@ mod tests {
18511856
ArticleSource::EuropePmc,
18521857
Some("2025-06-01"),
18531858
Some(10),
1854-
false,
1859+
Some(false),
18551860
),
1856-
row_with("800", ArticleSource::EuropePmc, None, Some(99), false),
1861+
row_with("800", ArticleSource::EuropePmc, None, Some(99), Some(false)),
18571862
],
18581863
Some(2),
18591864
);
@@ -1907,7 +1912,7 @@ mod tests {
19071912
ArticleSource::PubTator,
19081913
Some("2025-01-01"),
19091914
Some(1),
1910-
true,
1915+
Some(true),
19111916
);
19121917
let exclude_filters = ArticleSearchFilters {
19131918
exclude_retracted: true,
@@ -1921,4 +1926,78 @@ mod tests {
19211926
assert!(!matches_result_filters(&row, &exclude_filters, None, None));
19221927
assert!(matches_result_filters(&row, &include_filters, None, None));
19231928
}
1929+
1930+
#[test]
1931+
fn exclude_retracted_excludes_unknown_retraction_status_by_default() {
1932+
let row = row_with(
1933+
"100",
1934+
ArticleSource::PubTator,
1935+
Some("2025-01-01"),
1936+
Some(1),
1937+
None,
1938+
);
1939+
let exclude_filters = ArticleSearchFilters {
1940+
exclude_retracted: true,
1941+
..empty_filters()
1942+
};
1943+
let include_filters = ArticleSearchFilters {
1944+
exclude_retracted: false,
1945+
..empty_filters()
1946+
};
1947+
1948+
assert!(!matches_result_filters(&row, &exclude_filters, None, None));
1949+
assert!(matches_result_filters(&row, &include_filters, None, None));
1950+
}
1951+
1952+
#[test]
1953+
fn merge_federated_pages_preserves_known_retraction_status_from_later_duplicate() {
1954+
let pubtator_page = SearchPage::offset(
1955+
vec![row_with(
1956+
"200",
1957+
ArticleSource::PubTator,
1958+
Some("2025-01-01"),
1959+
Some(1),
1960+
None,
1961+
)],
1962+
Some(1),
1963+
);
1964+
let europe_page = SearchPage::offset(
1965+
vec![row_with(
1966+
"200",
1967+
ArticleSource::EuropePmc,
1968+
Some("2025-01-01"),
1969+
Some(10),
1970+
Some(true),
1971+
)],
1972+
Some(1),
1973+
);
1974+
1975+
let merged = merge_federated_pages(
1976+
Ok(pubtator_page),
1977+
Ok(europe_page),
1978+
10,
1979+
0,
1980+
ArticleSort::Relevance,
1981+
)
1982+
.expect("federated merge should succeed");
1983+
1984+
assert_eq!(merged.results.len(), 1);
1985+
assert_eq!(merged.results[0].source, ArticleSource::PubTator);
1986+
assert_eq!(merged.results[0].is_retracted, Some(true));
1987+
}
1988+
1989+
#[test]
1990+
fn article_search_result_serializes_unknown_retraction_as_null() {
1991+
let row = row_with(
1992+
"100",
1993+
ArticleSource::PubTator,
1994+
Some("2025-01-01"),
1995+
Some(1),
1996+
None,
1997+
);
1998+
1999+
let value = serde_json::to_value(&row).expect("search row should serialize");
2000+
assert!(value.get("is_retracted").is_some());
2001+
assert!(value["is_retracted"].is_null());
2002+
}
19242003
}

src/entities/trial.rs

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -328,10 +328,19 @@ fn essie_escape_boolean_expression(value: &str) -> String {
328328
}
329329

330330
let mut rendered = String::new();
331+
let has_leading_unary_operator = operators.len() == terms.len();
331332
for (idx, term) in terms.iter().enumerate() {
332-
if idx > 0 {
333+
if idx == 0 && has_leading_unary_operator {
334+
rendered.push_str(&operators[0]);
333335
rendered.push(' ');
334-
rendered.push_str(&operators[idx - 1]);
336+
} else if idx > 0 {
337+
rendered.push(' ');
338+
let operator_idx = if has_leading_unary_operator {
339+
idx
340+
} else {
341+
idx - 1
342+
};
343+
rendered.push_str(&operators[operator_idx]);
335344
rendered.push(' ');
336345
}
337346
rendered.push_str(&quote_essie_literal(term));
@@ -2108,6 +2117,14 @@ mod tests {
21082117
);
21092118
}
21102119

2120+
#[test]
2121+
fn essie_escape_boolean_expression_handles_leading_not() {
2122+
assert_eq!(
2123+
essie_escape_boolean_expression("NOT MSI-H"),
2124+
"NOT \"MSI\\-H\""
2125+
);
2126+
}
2127+
21112128
#[test]
21122129
fn essie_escape_boolean_expression_handles_and_not() {
21132130
assert_eq!(

src/render/markdown.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2808,7 +2808,7 @@ mod tests {
28082808
citation_count: Some(10),
28092809
source: ArticleSource::PubTator,
28102810
score: Some(99.1),
2811-
is_retracted: false,
2811+
is_retracted: Some(false),
28122812
},
28132813
ArticleSearchResult {
28142814
pmid: "2".into(),
@@ -2818,7 +2818,7 @@ mod tests {
28182818
citation_count: Some(12),
28192819
source: ArticleSource::EuropePmc,
28202820
score: None,
2821-
is_retracted: false,
2821+
is_retracted: Some(false),
28222822
},
28232823
];
28242824

src/transform/article.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -353,7 +353,7 @@ pub fn from_europepmc_search_result(hit: &EuropePmcResult) -> Option<ArticleSear
353353
citation_count: parse_citation_count(hit.cited_by_count.as_ref()),
354354
source: ArticleSource::EuropePmc,
355355
score: None,
356-
is_retracted: is_retracted_publication(hit),
356+
is_retracted: Some(is_retracted_publication(hit)),
357357
})
358358
}
359359

@@ -382,7 +382,7 @@ pub fn from_pubtator_search_result(hit: &PubTatorSearchResult) -> Option<Article
382382
citation_count: None,
383383
source: ArticleSource::PubTator,
384384
score: hit.score,
385-
is_retracted: false,
385+
is_retracted: None,
386386
})
387387
}
388388

@@ -678,7 +678,7 @@ mod tests {
678678
.expect("valid Europe PMC hit");
679679

680680
let row = from_europepmc_search_result(&hit).expect("search row should map");
681-
assert!(row.is_retracted);
681+
assert_eq!(row.is_retracted, Some(true));
682682
}
683683

684684
#[test]
@@ -698,6 +698,6 @@ mod tests {
698698
assert_eq!(row.source, ArticleSource::PubTator);
699699
assert_eq!(row.score, Some(255.9));
700700
assert_eq!(row.citation_count, None);
701-
assert!(!row.is_retracted);
701+
assert_eq!(row.is_retracted, None);
702702
}
703703
}

0 commit comments

Comments
 (0)