@@ -59,11 +59,12 @@ def both_entrytypes(entrytype: str) -> str:
5959# - Queries are better for debugging (understanding which conditions do/do not apply)
6060# https://jakevdp.github.io/PythonDataScienceHandbook/03.12-performance-eval-and-query.html
6161
62+
6263duplicate_conditions = [
6364 # Substantial differences in one of AUTHOR/TITLE/CONTAINER_TITLE
6465 f"({ au07_ti10_ct10 } & { match (VOLUME , PAGES )} )" ,
6566 f"({ au07_ti10_ct10 } & { non_contradicting (VOLUME , NUMBER , PAGES , YEAR , DOI )} )" ,
66- f"({ au10_ti07_ct10 } & { non_contradicting (NUMBER , PAGES , YEAR , DOI )} )" ,
67+ f"({ au10_ti07_ct10 } & { non_contradicting (VOLUME , NUMBER , PAGES , YEAR , DOI )} )" ,
6768 f"({ au10_ti10_ct07 } & { non_contradicting (VOLUME , NUMBER , PAGES , YEAR , DOI )} )" ,
6869 # Differences across AUTHOR/TITLE/CONTAINER_TITLE
6970 f"({ au08_ti09_ct09 } & { non_contradicting (VOLUME , NUMBER , YEAR , DOI )} & { PAGES } > 0.75 )" ,
@@ -78,7 +79,7 @@ def both_entrytypes(entrytype: str) -> str:
7879 f'({ au095_ti09_ct075 } & { both_entrytypes ("inproceedings" )} & { match (YEAR )} )' , # Inproceedings
7980 f"({ au07_ti10_ct10 } & { DOI } > 0.9)" , # Updates
8081 # no AUTHOR
81- f" ({ auXX_ti095_ct095 } & { non_contradicting (VOLUME , NUMBER , PAGES , YEAR , DOI )} )" ,
82+ f' ({ auXX_ti095_ct095 } & { non_contradicting (VOLUME , NUMBER , PAGES , YEAR , DOI )} ) & { both_entrytypes ( "article" ) } ' ,
8283 f"({ auXX_ti095_ct095 } & { match (VOLUME , NUMBER , PAGES , YEAR )} & { non_contradicting (DOI , ABSTRACT )} )" ,
8384 # no CONTAINER_TITLE
8485 f"({ au10_ti10_ctNC } & { match (VOLUME , YEAR )} & { non_contradicting (NUMBER , PAGES , DOI , ABSTRACT )} )" ,
@@ -92,13 +93,26 @@ def both_entrytypes(entrytype: str) -> str:
9293 f"(({ match (DOI )} & ~(doi_1 == '' | doi_2 == '')) & ({ TITLE } > 0.95) & ({ AUTHOR } > 0.9) & ({ YEAR } > 0.9)) & { non_contradicting (CONTAINER_TITLE )} " ,
9394 # no TITLE
9495 f"({ au10_tiXX_ct10 } & { match (VOLUME , NUMBER , PAGES , YEAR )} & { non_contradicting (DOI )} & ({ ABSTRACT } > 0.95 | { non_contradicting (ABSTRACT )} ))" , # typically for number-mismatches in title
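96+ # Early view vs. final version: volume present in exactly one record, and pages of at least one record start with "1-" / "1–"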
97+ f"({ au095_ti09_ct075 } "
98+ f" & { non_contradicting (DOI )} "
99+ f" & ((volume_1 != '' & volume_2 == '') | (volume_2 != '' & volume_1 == ''))"
100+ f" & ((number_1 != '' & number_2 == '') | (number_2 != '' & number_1 == '') | { non_contradicting (NUMBER )} )"
101+ f" & (pages_1.str.match('^1[-–]') | pages_2.str.match('^1[-–]'))"
102+ f")" ,
95103]
96104
97105non_duplicate_conditions = [
98106 f"({ mismatch (YEAR )} & ~({ match (VOLUME )} | { match (NUMBER )} | { match (PAGES )} | { match (DOI )} | { match (CONTAINER_TITLE )} ))" ,
99107 f'({ mismatch (TITLE )} & ({ PAGE_RANGES_ADJACENT } == "adjacent" | { PAGE_RANGES_ADJACENT } == "non_overlapping"))' ,
100108 f"(~(doi_1 == '' | doi_2 == '') & { DOI } < 0.8 & ~({ non_contradicting (AUTHOR , TITLE , YEAR , CONTAINER_TITLE , VOLUME , NUMBER , PAGES )} ))" ,
101- f"({ mismatch (VOLUME , NUMBER , PAGES )} )" ,
109+ # Previous rule (replaced by the narrower condition that follows): f"({mismatch(VOLUME, NUMBER, PAGES)})",
110+ f"({ mismatch (VOLUME , NUMBER , PAGES )} "
111+ f" & ~({ au095_ti09_ct075 } "
112+ f" & ((volume_1 != '' & volume_2 == '') | (volume_2 != '' & volume_1 == ''))"
113+ f" & (pages_1.str.match('^1[-–]') | pages_2.str.match('^1[-–]'))"
114+ f" )"
115+ f")" ,
102116 # Editorials: minor differences in volume/number/pages can be meaningful
103117 f'(title_1.str.contains("editor") & title_1.str.len() < 60 & ( { mismatch (VOLUME )} | { mismatch (NUMBER )} | { mismatch (PAGES )} ))' ,
104118 # Journal vs. conference/workshop
@@ -110,4 +124,14 @@ def both_entrytypes(entrytype: str) -> str:
110124 f' ~({ CONTAINER_TITLE } _2.str.contains("conf") | { CONTAINER_TITLE } _2.str.contains("work") | { CONTAINER_TITLE } _2.str.contains("proc")) ) & '
111125 f' ( ({ CONTAINER_TITLE } _1.str.contains("conf") | { CONTAINER_TITLE } _1.str.contains("work") | { CONTAINER_TITLE } _1.str.contains("proc")) & '
112126 f' ~{ CONTAINER_TITLE } _1.str.contains("j") ))' ,
127+ # Inproceedings: more sensitive to year mismatches
128+ f'({ both_entrytypes ("inproceedings" )} & { mismatch (YEAR )} )' ,
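129+ # TODO: the container-title-in-title check below may need to be pre-computed as a column (str.contains expects one pattern, not a per-row Series):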
130+ # https://chatgpt.com/c/695554d0-a548-8332-9921-ec28332246fd
131+ # NEW: container title appears inside (either) title => treat as non-duplicate with anything else
132+ # (e.g., "Communications of the Association for Information Systems ... Volume 52 Paper 41")
133+ # f'(container_title_1 != "" & title_1 != "" & '
134+ # f' title_1.str.contains(container_title_1.str.lower(), na=False))'
135+ # f' | (container_title_2 != "" & title_2 != "" & '
136+ # f' title_2.str.contains(container_title_2.str.lower(), na=False))',
113137]
0 commit comments