|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Enrich artifacts (software and datasets) with indirect citations. |
| 4 | +
|
| 5 | +This script calculates indirect citation metrics for artifacts by: |
| 6 | +- Reading publications associated with artifacts via DOIs |
| 7 | +- Matching DOIs with paper information (doi, paper_id, openaire_id, citation_count) |
| 8 | +- Summing citation counts for each artifact's publications, counting each openaire_id only once |
| 9 | +
|
| 10 | +**Input Files Required (Tab-Separated):** |
| 11 | +
|
| 12 | +1. **artifacts-to-publications**: Publications associated with artifacts |
| 13 | + - Required columns: `artifact_id`, `doi` |
| 14 | + - Optional: other artifact metadata columns |
| 15 | + |
| 16 | +2. **publications**: Paper information with DOI mapping and citation counts |
| 17 | + - Required columns: `doi`, `paper_id`, `openaire_id`, `citation_count` |
| 18 | +
|
| 19 | +**Output Files (Tab-Separated):** |
| 20 | +
|
| 21 | +1. **output**: Unique artifacts file with citation counts |
| 22 | + - Columns: `artifact_id`, `indirect_citations` |
| 23 | + - One row per unique artifact, sorted by `indirect_citations` descending |
| 24 | +
|
| 25 | +**Note:** All input and output files must be tab-separated (TSV format). |
| 26 | +
|
| 27 | +**Usage:** |
| 28 | + python indirect_citations.py \\ |
| 29 | + --artifacts-to-publications <artifacts_to_publications_csv> \\ |
| 30 | + --publications <publications_csv> \\ |
| 31 | + [--output <output_csv>] |
| 32 | +
|
| 33 | +**Example:** |
| 34 | + python indirect_citations.py \\ |
| 35 | + --artifacts-to-publications artifacts_to_publications.csv \\ |
| 36 | + --publications publications.csv \\ |
| 37 | + --output enriched_output.csv |
| 38 | +
|
| 39 | +**Note:** All files are tab-separated (TSV format), even though they use .csv extension. |
| 40 | +""" |
| 41 | + |
| 42 | +import pandas as pd |
| 43 | +import sys |
| 44 | +import argparse |
| 45 | +from collections import defaultdict |
| 46 | + |
| 47 | + |
def load_publications_info(publications_csv):
    """
    Load paper information from a tab-separated publications file.

    Creates a lookup dictionary from DOI to paper info and computes
    openaire_id uniqueness (whether each openaire_id appears exactly once
    in the file).

    Args:
        publications_csv: Path to a TSV file with columns:
            doi, paper_id, openaire_id, citation_count.

    Returns:
        tuple: (paper_info_dict, openaire_uniqueness_dict)
            - paper_info_dict: {doi: {'paper_id': ..., 'openaire_id': ...,
              'citation_count': ...}}. Rows with a missing/blank DOI are
              skipped; for duplicate DOIs the last row wins.
            - openaire_uniqueness_dict: {openaire_id: True if it occurs
              exactly once in the file, else False}.

    Raises:
        ValueError: If any required column is missing.
    """
    print(f" -> Reading publications file: {publications_csv}")
    publications_df = pd.read_csv(publications_csv, sep='\t')
    print(f" -> Loaded {len(publications_df)} publications from file")

    # Validate required columns before touching any of them.
    required_cols = ['doi', 'paper_id', 'openaire_id', 'citation_count']
    missing_cols = [col for col in required_cols if col not in publications_df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns in publications CSV: {missing_cols}")

    # Build DOI -> paper info. Iterating the columns directly with zip() is
    # much faster than DataFrame.iterrows(), which builds a Series per row.
    paper_info = {}
    rows = zip(publications_df['doi'], publications_df['paper_id'],
               publications_df['openaire_id'], publications_df['citation_count'])
    for doi, paper_id, openaire_id, citation_count in rows:
        if pd.isna(doi):
            continue
        doi = str(doi).strip()
        if not doi:
            continue
        paper_info[doi] = {
            'paper_id': paper_id,
            # Normalize openaire_id to str so it matches the keys of the
            # uniqueness dict below; missing values become None.
            'openaire_id': str(openaire_id) if pd.notna(openaire_id) else None,
            # Missing citation counts are treated as 0.
            'citation_count': int(citation_count) if pd.notna(citation_count) else 0,
        }

    print(f" -> Created lookup for {len(paper_info)} unique DOIs")

    # An openaire_id is "unique" if it appears exactly once in the file.
    # value_counts() drops NaN by default, so missing ids are excluded.
    openaire_counts = publications_df['openaire_id'].value_counts()
    openaire_uniqueness = {
        str(openaire_id): (count == 1)
        for openaire_id, count in openaire_counts.items()
    }

    unique_count = sum(1 for is_unique in openaire_uniqueness.values() if is_unique)
    print(f" -> {unique_count} out of {len(openaire_uniqueness)} openaire_ids are unique")

    return paper_info, openaire_uniqueness
| 99 | + |
| 100 | + |
def calculate_indirect_citations(publications_df, paper_info, openaire_uniqueness):
    """
    Calculate indirect citations for artifacts based on their publications.

    For each artifact, all associated DOIs are matched against paper_info and
    the citation counts of the matched papers are summed. Each openaire_id is
    counted at most once per artifact to avoid double-counting papers that
    share an OpenAIRE record; matched papers whose openaire_id is missing
    contribute no citations.

    Args:
        publications_df: DataFrame with columns: artifact_id, doi.
        paper_info: Dict {doi: {'paper_id', 'openaire_id', 'citation_count'}}.
        openaire_uniqueness: Dict {openaire_id: True/False}. Accepted for
            interface compatibility; deduplication is handled with a
            per-artifact set, so this argument is currently unused.

    Returns:
        defaultdict(int): {artifact_id: total_indirect_citations}. Artifacts
        with no counted papers do not appear.
    """
    print("\nCalculating indirect citations...")

    indirect_citation_totals = defaultdict(int)

    processed = 0
    matched = 0
    unique_count = 0

    # Group by artifact so openaire_id deduplication is scoped per artifact.
    for artifact_id, group in publications_df.groupby('artifact_id'):
        seen_openaire_ids = set()

        # Iterate the 'doi' column directly: cheaper than iterrows(), which
        # materializes a Series object for every row.
        for raw_doi in group['doi']:
            if pd.isna(raw_doi) or str(raw_doi).strip() == '':
                continue

            doi = str(raw_doi).strip()
            processed += 1

            paper_data = paper_info.get(doi)
            if paper_data is not None:
                matched += 1
                openaire_id = paper_data['openaire_id']

                # Only count each openaire_id once per artifact.
                if openaire_id and openaire_id not in seen_openaire_ids:
                    seen_openaire_ids.add(openaire_id)
                    unique_count += 1
                    indirect_citation_totals[artifact_id] += paper_data['citation_count']

            # Progress log every 1000 non-empty DOIs.
            if processed % 1000 == 0:
                print(f" -> Processed {processed} DOIs, matched {matched}, unique: {unique_count}")

    print(f" -> Processed {processed} DOIs, matched {matched}, unique openaire_ids: {unique_count}")
    print(f" -> Found indirect citations for {len(indirect_citation_totals)} artifacts")

    return indirect_citation_totals
| 154 | + |
| 155 | + |
def main():
    """Parse CLI arguments, load inputs, compute indirect citations, write output."""
    parser = argparse.ArgumentParser(
        description='Enrich artifacts with indirect citations from CSV files',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument('--artifacts-to-publications', required=True,
                        help='Path to artifacts-to-publications CSV (columns: artifact_id, doi)')
    parser.add_argument('--publications', required=True,
                        help='Path to publications CSV (columns: doi, paper_id, openaire_id, citation_count)')
    parser.add_argument('--output', default='enriched_output.csv',
                        help='Output path for enriched tab-separated file (default: enriched_output.csv)')
    args = parser.parse_args()

    banner = "=" * 80
    print(banner)
    print("ARTIFACTS INDIRECT CITATION ENRICHMENT SCRIPT")
    print(banner)
    print(f"\nArtifacts-to-publications file: {args.artifacts_to_publications}")
    print(f"Publications file: {args.publications}")
    print(f"Output: {args.output}")
    print(banner)

    # STEP 1: build the DOI -> paper lookup tables.
    print("\nSTEP 1: Loading publications information...")
    try:
        paper_info, openaire_uniqueness = load_publications_info(args.publications)
    except Exception as e:
        print(f"ERROR: Failed to load publications CSV: {e}")
        sys.exit(1)

    # STEP 2: load the artifact/DOI link table and check its schema.
    print("\nSTEP 2: Loading artifacts-to-publications file...")
    try:
        links_df = pd.read_csv(args.artifacts_to_publications, sep='\t')
        print(f" -> Loaded {len(links_df)} publication entries")

        if not {'artifact_id', 'doi'}.issubset(links_df.columns):
            raise ValueError("Artifacts-to-publications file must have 'artifact_id' and 'doi' columns")
    except Exception as e:
        print(f"ERROR: Failed to load artifacts-to-publications file: {e}")
        sys.exit(1)

    # STEP 3: sum citation counts per artifact.
    print("\nSTEP 3: Processing indirect citations...")
    totals = calculate_indirect_citations(links_df, paper_info, openaire_uniqueness)

    # STEP 4: one row per artifact that received citations, most-cited first.
    print("\nSTEP 4: Creating enriched output...")
    result_df = pd.DataFrame(
        list(totals.items()), columns=['artifact_id', 'indirect_citations']
    ).sort_values('indirect_citations', ascending=False, na_position='last')

    # STEP 5: write the result as TSV.
    print("\nSTEP 5: Saving enriched output file...")
    result_df.to_csv(args.output, index=False, sep='\t')
    print(f" -> Saved to: {args.output}")
    print(f" -> Total unique artifacts: {len(result_df)}")


if __name__ == "__main__":
    main()
0 commit comments