|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Generate comprehensive test_data_global_feeds.json fixture with: |
| 4 | +- One work completely within each global region (continents + oceans) |
| 5 | +- One work overlapping two neighboring regions |
| 6 | +- Seven works spanning more than two global regions |
| 7 | +- Metadata diversity matching test_data_optimap.json patterns |
| 8 | +""" |
| 9 | + |
| 10 | +import json |
| 11 | +from datetime import datetime, timedelta |
| 12 | +import random |
| 13 | + |
# Metadata samples for diversity (matching optimap patterns).
# create_publication() picks entries from these pools by index (taken modulo
# the pool size), so cycling an index walks through every variant — including
# the deliberate empty-list / None cases.

# Author lists ranging from no authors up to seven authors.
AUTHOR_SAMPLES = [
    [],  # No authors (for some publications)
    ["Dr. Single Author"],
    ["Dr. First Author", "Prof. Second Author"],
    ["Dr. Alice Smith", "Prof. Bob Jones", "Dr. Carol Williams"],
    ["Dr. Maria Garcia", "Prof. John Smith", "Dr. Emma Johnson", "Dr. Li Wei"],
    ["Prof. A", "Dr. B", "Dr. C", "Dr. D", "Dr. E", "Prof. F"],
    ["Dr. Zhang Wei", "Prof. Sarah Johnson", "Dr. Ahmed Hassan", "Dr. Maria Rodriguez", "Dr. John O'Connor", "Prof. Yuki Tanaka", "Dr. Pierre Dubois"],
]

# Keyword lists ranging from no keywords up to six keywords.
KEYWORD_SAMPLES = [
    [],  # No keywords
    ["single keyword"],
    ["first keyword", "second keyword"],
    ["climate change", "remote sensing", "geospatial analysis"],
    ["biodiversity", "ecosystem services", "conservation", "habitat mapping"],
    ["urban planning", "sustainability", "GIS", "land use", "spatial analysis", "demographics"],
]

# Topic lists ranging from no topics up to three topics.
TOPIC_SAMPLES = [
    [],  # No topics
    ["Geography"],
    ["Environmental Science", "Ecology"],
    ["Climate Science", "Atmospheric Science", "Meteorology"],
]

# Possible OpenAlex open-access statuses; None models "status unknown".
OPENALEX_STATUS_SAMPLES = [None, "green", "gold", "hybrid", "bronze", "closed", "diamond"]

# Provenance text templates. The placeholders {source_name}, {timestamp} and
# {event_id} are filled in by create_publication() via str.format().
PROVENANCE_TEMPLATES = [
    "Harvested via OAI-PMH from {source_name} on {timestamp}.\nHarvestingEvent ID: {event_id}.\n\nMetadata Sources:\n - authors: original_source\n - keywords: original_source\n - topics: openalex\n - openalex_metadata: openalex",
    "Harvested via RSS/Atom feed from {source_name} on {timestamp}.\nHarvestingEvent ID: {event_id}.\n\nMetadata Sources:\n - authors: openalex\n - keywords: original_source\n - topics: openalex\n - openalex_metadata: openalex",
    "Harvested via OAI-PMH from {source_name} on {timestamp}.\nHarvestingEvent ID: {event_id}.\n\nNo authors or keywords found in original source. OpenAlex matching found partial matches but no exact match.",
    "Harvested via RSS/Atom feed from {source_name} on {timestamp}.\nHarvestingEvent ID: {event_id}.\n\nMetadata Sources:\n - authors: original_source\n - keywords: original_source\n - topics: (none - OpenAlex match not found)",
]
| 49 | + |
# Global region definitions with representative geometries.
# Format: (name, geometry_wkt, description)
# Coordinates are lon/lat; every polygon ring is closed (first point repeated
# as the last point). The WKT is later wrapped into an SRID=4326
# GEOMETRYCOLLECTION by create_publication().
CONTINENTS = [
    # Africa (completely within)
    ("Africa", "POLYGON ((10 -25, 40 -25, 40 30, 10 30, 10 -25))", "Central and Eastern Africa"),
    # Asia (completely within)
    ("Asia", "POLYGON ((70 15, 120 15, 120 50, 70 50, 70 15))", "Central and East Asia"),
    # Europe (completely within)
    ("Europe", "POLYGON ((0 45, 30 45, 30 65, 0 65, 0 45))", "Central and Western Europe"),
    # North America (completely within)
    ("North America", "POLYGON ((-120 30, -80 30, -80 50, -120 50, -120 30))", "Central United States and Canada"),
    # South America (completely within)
    ("South America", "POLYGON ((-70 -30, -50 -30, -50 0, -70 0, -70 -30))", "Brazil and surrounding regions"),
    # Australia (completely within)
    ("Australia", "POLYGON ((120 -35, 145 -35, 145 -15, 120 -15, 120 -35))", "Eastern Australia"),
    # Antarctica (completely within)
    ("Antarctica", "POLYGON ((-60 -75, 60 -75, 60 -65, -60 -65, -60 -75))", "Antarctic Peninsula region"),
]

# One representative box per named ocean, each fully inside that ocean.
OCEANS = [
    # Atlantic Ocean (completely within)
    ("Atlantic Ocean", "POLYGON ((-40 10, -20 10, -20 40, -40 40, -40 10))", "North Atlantic Ocean"),
    # Pacific Ocean (completely within)
    ("Pacific Ocean", "POLYGON ((150 -20, 170 -20, 170 10, 150 10, 150 -20))", "Western Pacific Ocean"),
    # Indian Ocean (completely within)
    ("Indian Ocean", "POLYGON ((60 -30, 80 -30, 80 -10, 60 -10, 60 -30))", "Western Indian Ocean"),
    # Arctic Ocean (completely within)
    ("Arctic Ocean", "POLYGON ((-20 75, 20 75, 20 85, -20 85, -20 75))", "Arctic Ocean near North Pole"),
    # Southern Ocean (completely within)
    ("Southern Ocean", "POLYGON ((0 -65, 40 -65, 40 -55, 0 -55, 0 -65))", "Southern Ocean around Antarctica"),
]

# Works that overlap two neighboring regions (one box straddling the border).
TWO_REGION_OVERLAPS = [
    ("Europe-Asia", "POLYGON ((25 40, 65 40, 65 55, 25 55, 25 40))", "Spanning Eastern Europe and Western Asia"),
    ("North America-Atlantic", "POLYGON ((-80 25, -50 25, -50 45, -80 45, -80 25))", "Eastern North America and Western Atlantic"),
    ("Africa-Indian Ocean", "POLYGON ((35 -20, 55 -20, 55 5, 35 5, 35 -20))", "East African coast and Western Indian Ocean"),
    ("South America-Pacific", "POLYGON ((-85 -20, -65 -20, -65 5, -85 5, -85 -20))", "Western South America and Eastern Pacific"),
    ("Asia-Pacific", "POLYGON ((115 20, 140 20, 140 45, 115 45, 115 20))", "East Asian coast and Western Pacific"),
]

# Works that span more than two global regions (7 required).
# Some use MULTIPOLYGON (disjoint study sites), others one large POLYGON.
# NOTE(review): the "Pan-Pacific Study" ring runs from lon 110 to -80 without
# splitting at the antimeridian — verify the consuming geometry library
# interprets this as a Pacific-crossing polygon rather than the long way round.
MULTI_REGION_SPANS = [
    ("Global Ocean Survey", "MULTIPOLYGON (((-40 -10, -20 -10, -20 10, -40 10, -40 -10)), ((60 -20, 80 -20, 80 0, 60 0, 60 -20)), ((150 -30, 170 -30, 170 -10, 150 -10, 150 -30)))", "Atlantic, Indian, and Pacific Oceans"),
    ("Trans-Atlantic Research", "POLYGON ((-70 20, 10 20, 10 50, -70 50, -70 20))", "North America, Atlantic Ocean, and Europe"),
    ("African-Asian Monsoon Study", "POLYGON ((20 -10, 90 -10, 90 25, 20 25, 20 -10))", "Africa, Indian Ocean, and Asia"),
    ("Pan-Pacific Study", "POLYGON ((110 -40, -80 -40, -80 50, 110 50, 110 -40))", "Asia, Pacific Ocean, North America, South America, Australia"),
    ("Southern Hemisphere Ocean Study", "POLYGON ((-180 -60, 180 -60, 180 -35, -180 -35, -180 -60))", "Southern Ocean, Pacific, Atlantic, Indian Oceans, South America, Africa, Australia, Antarctica"),
    ("Arctic Circumpolar Study", "POLYGON ((-180 65, 180 65, 180 85, -180 85, -180 65))", "Arctic Ocean, North America, Europe, Asia"),
    ("Global Climate Network", "MULTIPOLYGON (((-120 30, -100 30, -100 45, -120 45, -120 30)), ((10 40, 30 40, 30 55, 10 55, 10 40)), ((120 -30, 140 -30, 140 -20, 120 -20, 120 -30)), ((-50 -20, -40 -20, -40 -10, -50 -10, -50 -20)))", "North America, Europe, Australia, South America"),
]
| 101 | + |
def create_source(pk, name, issn_l=None, is_oa=True):
    """Create a ``publications.source`` object in Django fixture format.

    Args:
        pk: Primary key for the source; also embedded in the generated
            OpenAlex identifiers.
        name: Human-readable source name. Its first word becomes the
            publisher name and the full name (lowercased, spaces removed)
            becomes the homepage URL host.
        issn_l: Optional linking ISSN.
        is_oa: Whether the source is open access.

    Returns:
        A dict with ``model``, ``pk`` and ``fields`` keys ready for
        ``manage.py loaddata``.
    """
    # Decide once whether this source has an OpenAlex record so that
    # openalex_id and openalex_url are always consistent with each other.
    # (Previously two independent random draws could set one but not the
    # other, producing a source with an ID but no URL or vice versa.)
    has_openalex = random.random() > 0.3  # ~70% of sources get OpenAlex IDs
    return {
        "model": "publications.source",
        "pk": pk,
        "fields": {
            "name": name,
            "issn_l": issn_l,
            "openalex_id": f"https://openalex.org/S{pk}000000" if has_openalex else None,
            "openalex_url": f"https://api.openalex.org/sources/S{pk}000000" if has_openalex else None,
            "publisher_name": f"{name.split()[0]} Publishers",
            "works_count": random.randint(100, 5000),
            "homepage_url": f"http://{name.lower().replace(' ', '')}.example.org",
            "abbreviated_title": name[:15] + ".",
            "is_oa": is_oa,
            "cited_by_count": random.randint(500, 50000),
            "is_preprint": random.choice([True, False]),
        }
    }
| 121 | + |
def create_publication(pk, source_pk, title, abstract, geometry_wkt, region_desc,
                       authors_idx, keywords_idx, topics_idx,
                       has_openalex=True, is_retracted=False, event_id=1000):
    """Create a ``publications.publication`` fixture object with varied metadata.

    Args:
        pk: Primary key; also folded into the DOI, URL, OpenAlex IDs and the
            provenance HarvestingEvent id.
        source_pk: Primary key of the owning source fixture.
        title: Publication title.
        abstract: Publication abstract text.
        geometry_wkt: WKT geometry (POLYGON or MULTIPOLYGON) that gets wrapped
            in an EWKT ``SRID=4326;GEOMETRYCOLLECTION(...)`` string.
        region_desc: Human-readable region description. NOTE(review): accepted
            for caller symmetry but not referenced anywhere in this body.
        authors_idx: Index into AUTHOR_SAMPLES (taken modulo the pool size).
        keywords_idx: Index into KEYWORD_SAMPLES (modulo pool size).
        topics_idx: Index into TOPIC_SAMPLES (modulo pool size).
        has_openalex: When True, ~80% of works get full OpenAlex fields and
            the remaining ~20% get partial-match info instead.
        is_retracted: Value stored in ``openalex_is_retracted``.
        event_id: Base HarvestingEvent id; the per-work id is event_id + pk.

    Returns:
        A dict in Django fixture format for ``publications.publication``.
    """

    # Generate dates: publication date within ~5 years of 2020-01-01,
    # record creation 1-30 days after publication.
    base_date = datetime(2020, 1, 1)
    pub_date = base_date + timedelta(days=random.randint(0, 1800))
    creation_date = pub_date + timedelta(days=random.randint(1, 30))

    # Select metadata from the sample pools (modulo keeps any index valid).
    authors = AUTHOR_SAMPLES[authors_idx % len(AUTHOR_SAMPLES)]
    keywords = KEYWORD_SAMPLES[keywords_idx % len(KEYWORD_SAMPLES)]
    topics = TOPIC_SAMPLES[topics_idx % len(TOPIC_SAMPLES)]

    # Generate a synthetic DOI under the example prefix 10.5555.
    doi = f"10.5555/global-{pk}-{random.randint(1000, 9999)}"

    # OpenAlex fields all default to "no match".
    openalex_id = None
    openalex_match_info = None
    openalex_fulltext_origin = None
    openalex_ids = None
    openalex_open_access_status = None

    if has_openalex:
        if random.random() > 0.2:  # 80% have full OpenAlex match
            openalex_id = f"https://openalex.org/W{3000000 + pk}"
            openalex_fulltext_origin = random.choice(["repository", "publisher", None])
            # openalex_ids / openalex_match_info are stored as JSON strings,
            # not nested dicts, mirroring the model's text fields.
            openalex_ids = json.dumps({"doi": f"https://doi.org/{doi}", "pmid": f"{38000000 + pk}" if random.random() > 0.5 else None})
            openalex_open_access_status = random.choice(OPENALEX_STATUS_SAMPLES)
        else:  # 20% have partial match info
            openalex_match_info = json.dumps([{
                "openalex_id": f"https://openalex.org/W{2900000 + pk}",
                "title": f"Similar Study {pk}",
                "doi": None,
                "match_type": "title"
            }])

    # Generate provenance text by filling one of the harvest templates.
    source_name = f"Global Source {source_pk}"
    timestamp = creation_date.isoformat() + "Z"
    provenance_template = random.choice(PROVENANCE_TEMPLATES)
    provenance = provenance_template.format(
        source_name=source_name,
        timestamp=timestamp,
        event_id=event_id + pk
    )

    return {
        "model": "publications.publication",
        "pk": pk,
        "fields": {
            "status": random.choice(["p", "p", "p", "h", "c"]),  # mostly published
            "title": title,
            "abstract": abstract,
            "publicationDate": pub_date.strftime("%Y-%m-%d"),
            "doi": doi,
            "url": f"https://example.org/publications/{pk}",
            "geometry": f"SRID=4326;GEOMETRYCOLLECTION({geometry_wkt})",
            "creationDate": creation_date.isoformat() + "Z",
            "lastUpdate": (creation_date + timedelta(hours=random.randint(1, 48))).isoformat() + "Z",
            "source": source_pk,
            # Time periods are stored as JSON-encoded lists of year strings;
            # the study period starts 1-3 years before the publication year.
            "timeperiod_startdate": f"[\"{pub_date.year - random.randint(1, 3)}\"]",
            "timeperiod_enddate": f"[\"{pub_date.year}\"]",
            "provenance": provenance,
            "authors": authors,
            "keywords": keywords,
            "topics": topics,
            "openalex_id": openalex_id,
            "openalex_match_info": openalex_match_info,
            "openalex_fulltext_origin": openalex_fulltext_origin,
            "openalex_is_retracted": is_retracted,
            "openalex_ids": openalex_ids,
            "openalex_open_access_status": openalex_open_access_status,
        }
    }
| 199 | + |
def _add_region_publications(fixture_data, regions, pk_start, meta_idx_start,
                             num_sources, make_title, make_abstract):
    """Append one publication per ``(name, wkt, description)`` entry of *regions*.

    Sources are assigned round-robin over pks 2000..2000+num_sources-1.  A
    single metadata index drives author/keyword/topic sample selection — the
    original code kept three separate counters that were always incremented
    in lockstep and therefore always equal.

    Args:
        fixture_data: List the new publication dicts are appended to.
        regions: Iterable of (region_name, geometry_wkt, description) tuples.
        pk_start: First publication pk to use.
        meta_idx_start: First metadata sample index to use.
        num_sources: Number of source fixtures available for round-robin.
        make_title: Callable (region_name) -> title string.
        make_abstract: Callable (region_name, description) -> abstract string.

    Returns:
        Tuple (next_pk, next_meta_idx) so the caller can continue numbering.
    """
    pk = pk_start
    meta_idx = meta_idx_start
    for i, (region_name, geometry, description) in enumerate(regions):
        pub = create_publication(
            pk=pk,
            source_pk=2000 + (i % num_sources),
            title=make_title(region_name),
            abstract=make_abstract(region_name, description),
            geometry_wkt=geometry,
            region_desc=description,
            authors_idx=meta_idx,
            keywords_idx=meta_idx,
            topics_idx=meta_idx,
            has_openalex=True,
        )
        fixture_data.append(pub)
        fields = pub["fields"]
        print(f"  [{pk}] {region_name}: {len(fields['authors'])} authors, "
              f"{len(fields['keywords'])} keywords, {len(fields['topics'])} topics")
        pk += 1
        meta_idx += 1
    return pk, meta_idx


def _print_summary(fixture_data, fixture_path):
    """Print coverage statistics and usage hints for the generated fixture."""
    from collections import Counter

    publications = [item for item in fixture_data if item["model"] == "publications.publication"]

    with_authors = sum(1 for p in publications if p["fields"]["authors"])
    with_keywords = sum(1 for p in publications if p["fields"]["keywords"])
    with_topics = sum(1 for p in publications if p["fields"]["topics"])
    with_openalex = sum(1 for p in publications if p["fields"]["openalex_id"])
    is_retracted = sum(1 for p in publications if p["fields"]["openalex_is_retracted"])

    print("\n=== Summary ===")
    print(f"Total publications: {len(publications)}")
    print(f"  - Continents: {len(CONTINENTS)}")
    print(f"  - Oceans: {len(OCEANS)}")
    print(f"  - Two-region overlaps: {len(TWO_REGION_OVERLAPS)}")
    print(f"  - Multi-region spans: {len(MULTI_REGION_SPANS)}")
    print("\nMetadata coverage:")
    print(f"  - With authors: {with_authors}/{len(publications)}")
    print(f"  - With keywords: {with_keywords}/{len(publications)}")
    print(f"  - With topics: {with_topics}/{len(publications)}")
    print(f"  - With OpenAlex ID: {with_openalex}/{len(publications)}")
    print(f"  - Retracted: {is_retracted}/{len(publications)}")

    # Distribution of array-field lengths, to confirm metadata diversity.
    authors_counts = Counter(len(p["fields"]["authors"]) for p in publications)
    keywords_counts = Counter(len(p["fields"]["keywords"]) for p in publications)
    topics_counts = Counter(len(p["fields"]["topics"]) for p in publications)

    print("\nArray field diversity:")
    print(f"  - Authors distribution: {dict(sorted(authors_counts.items()))}")
    print(f"  - Keywords distribution: {dict(sorted(keywords_counts.items()))}")
    print(f"  - Topics distribution: {dict(sorted(topics_counts.items()))}")

    print("\n✓ Fixture creation complete!")
    print("\nTo load the fixture:")
    print(f"  python manage.py loaddata {fixture_path}")


def main():
    """Generate sources plus one publication per region scenario and write
    test_data_global_feeds.json, backing up any existing copy first."""
    print("Creating comprehensive test_data_global_feeds.json fixture...")

    fixture_data = []

    # Create a few diverse sources (the third one is deliberately not OA).
    sources = [
        create_source(2000, "Global Geoscience Journal", "2234-5678", True),
        create_source(2001, "International Earth Sciences", "3345-6789", True),
        create_source(2002, "World Environmental Research", "4456-7890", False),
        create_source(2003, "Planetary Studies Quarterly", "5567-8901", True),
    ]
    fixture_data.extend(sources)

    print(f"\nCreating {len(sources)} sources...")

    pk = 2000      # publication pks start at 2000
    meta_idx = 0   # shared index into the metadata sample pools

    print("\n=== Creating works for each continent ===")
    pk, meta_idx = _add_region_publications(
        fixture_data, CONTINENTS, pk, meta_idx, len(sources),
        lambda name: f"Geological Survey of {name}",
        lambda name, desc: f"Comprehensive geological and environmental study covering {desc}. This research examines {name.lower()} geology, climate patterns, and ecological systems.",
    )

    print("\n=== Creating works for each ocean ===")
    pk, meta_idx = _add_region_publications(
        fixture_data, OCEANS, pk, meta_idx, len(sources),
        lambda name: f"Marine Biology and Oceanography of the {name}",
        lambda name, desc: f"Detailed oceanographic study of {desc}. Research includes marine ecosystems, ocean currents, temperature patterns, and biodiversity in the {name.lower()}.",
    )

    print("\n=== Creating works overlapping two neighboring regions ===")
    pk, meta_idx = _add_region_publications(
        fixture_data, TWO_REGION_OVERLAPS, pk, meta_idx, len(sources),
        lambda name: f"Cross-Regional Study: {name}",
        lambda name, desc: f"Cross-border environmental and geological research {desc}. This study analyzes patterns that span multiple geographical regions.",
    )

    print("\n=== Creating works spanning more than two regions ===")
    pk, meta_idx = _add_region_publications(
        fixture_data, MULTI_REGION_SPANS, pk, meta_idx, len(sources),
        lambda name: name,  # the tuple's first element already is the full title
        lambda name, desc: f"Large-scale multi-regional research project covering {desc}. This comprehensive study examines global patterns and connections across multiple continents and oceans.",
    )

    # Back up any existing fixture before overwriting it.
    import os
    import shutil
    fixture_path = "test_data_global_feeds.json"
    backup_path = fixture_path + ".backup"
    if os.path.exists(fixture_path):
        print(f"\n=== Creating backup: {backup_path} ===")
        shutil.copy(fixture_path, backup_path)

    # Write fixture.
    print(f"\n=== Writing fixture to {fixture_path} ===")
    with open(fixture_path, "w") as f:
        json.dump(fixture_data, f, indent=2)

    _print_summary(fixture_data, fixture_path)


if __name__ == "__main__":
    main()
0 commit comments