
Commit e25cdec

regenerate fixtures to test global regions
1 parent 20becc8 commit e25cdec


3 files changed: +1353 additions, -293 deletions

Lines changed: 376 additions & 0 deletions
@@ -0,0 +1,376 @@
#!/usr/bin/env python3
"""
Generate comprehensive test_data_global_feeds.json fixture with:
- One work completely within each global region (continents + oceans)
- One work overlapping two neighboring regions
- Seven works spanning more than two global regions
- Metadata diversity matching test_data_optimap.json patterns
"""

import json
import os
import random
import shutil
from collections import Counter
from datetime import datetime, timedelta

# Metadata samples for diversity (matching optimap patterns)
AUTHOR_SAMPLES = [
    [],  # No authors (for some publications)
    ["Dr. Single Author"],
    ["Dr. First Author", "Prof. Second Author"],
    ["Dr. Alice Smith", "Prof. Bob Jones", "Dr. Carol Williams"],
    ["Dr. Maria Garcia", "Prof. John Smith", "Dr. Emma Johnson", "Dr. Li Wei"],
    ["Prof. A", "Dr. B", "Dr. C", "Dr. D", "Dr. E", "Prof. F"],
    ["Dr. Zhang Wei", "Prof. Sarah Johnson", "Dr. Ahmed Hassan", "Dr. Maria Rodriguez", "Dr. John O'Connor", "Prof. Yuki Tanaka", "Dr. Pierre Dubois"],
]

KEYWORD_SAMPLES = [
    [],  # No keywords
    ["single keyword"],
    ["first keyword", "second keyword"],
    ["climate change", "remote sensing", "geospatial analysis"],
    ["biodiversity", "ecosystem services", "conservation", "habitat mapping"],
    ["urban planning", "sustainability", "GIS", "land use", "spatial analysis", "demographics"],
]

TOPIC_SAMPLES = [
    [],  # No topics
    ["Geography"],
    ["Environmental Science", "Ecology"],
    ["Climate Science", "Atmospheric Science", "Meteorology"],
]

OPENALEX_STATUS_SAMPLES = [None, "green", "gold", "hybrid", "bronze", "closed", "diamond"]

PROVENANCE_TEMPLATES = [
    "Harvested via OAI-PMH from {source_name} on {timestamp}.\nHarvestingEvent ID: {event_id}.\n\nMetadata Sources:\n - authors: original_source\n - keywords: original_source\n - topics: openalex\n - openalex_metadata: openalex",
    "Harvested via RSS/Atom feed from {source_name} on {timestamp}.\nHarvestingEvent ID: {event_id}.\n\nMetadata Sources:\n - authors: openalex\n - keywords: original_source\n - topics: openalex\n - openalex_metadata: openalex",
    "Harvested via OAI-PMH from {source_name} on {timestamp}.\nHarvestingEvent ID: {event_id}.\n\nNo authors or keywords found in original source. OpenAlex matching found partial matches but no exact match.",
    "Harvested via RSS/Atom feed from {source_name} on {timestamp}.\nHarvestingEvent ID: {event_id}.\n\nMetadata Sources:\n - authors: original_source\n - keywords: original_source\n - topics: (none - OpenAlex match not found)",
]

# Global region definitions with representative geometries
# Format: (name, geometry_wkt, description)
CONTINENTS = [
    # Africa (completely within)
    ("Africa", "POLYGON ((10 -25, 40 -25, 40 30, 10 30, 10 -25))", "Central and Eastern Africa"),
    # Asia (completely within)
    ("Asia", "POLYGON ((70 15, 120 15, 120 50, 70 50, 70 15))", "Central and East Asia"),
    # Europe (completely within)
    ("Europe", "POLYGON ((0 45, 30 45, 30 65, 0 65, 0 45))", "Central and Western Europe"),
    # North America (completely within)
    ("North America", "POLYGON ((-120 30, -80 30, -80 50, -120 50, -120 30))", "Central United States and Canada"),
    # South America (completely within)
    ("South America", "POLYGON ((-70 -30, -50 -30, -50 0, -70 0, -70 -30))", "Brazil and surrounding regions"),
    # Australia (completely within)
    ("Australia", "POLYGON ((120 -35, 145 -35, 145 -15, 120 -15, 120 -35))", "Eastern Australia"),
    # Antarctica (completely within)
    ("Antarctica", "POLYGON ((-60 -75, 60 -75, 60 -65, -60 -65, -60 -75))", "Antarctic Peninsula region"),
]

OCEANS = [
    # Atlantic Ocean (completely within)
    ("Atlantic Ocean", "POLYGON ((-40 10, -20 10, -20 40, -40 40, -40 10))", "North Atlantic Ocean"),
    # Pacific Ocean (completely within)
    ("Pacific Ocean", "POLYGON ((150 -20, 170 -20, 170 10, 150 10, 150 -20))", "Western Pacific Ocean"),
    # Indian Ocean (completely within)
    ("Indian Ocean", "POLYGON ((60 -30, 80 -30, 80 -10, 60 -10, 60 -30))", "Western Indian Ocean"),
    # Arctic Ocean (completely within)
    ("Arctic Ocean", "POLYGON ((-20 75, 20 75, 20 85, -20 85, -20 75))", "Arctic Ocean near North Pole"),
    # Southern Ocean (completely within)
    ("Southern Ocean", "POLYGON ((0 -65, 40 -65, 40 -55, 0 -55, 0 -65))", "Southern Ocean around Antarctica"),
]

# Works that overlap two neighboring regions
TWO_REGION_OVERLAPS = [
    ("Europe-Asia", "POLYGON ((25 40, 65 40, 65 55, 25 55, 25 40))", "Spanning Eastern Europe and Western Asia"),
    ("North America-Atlantic", "POLYGON ((-80 25, -50 25, -50 45, -80 45, -80 25))", "Eastern North America and Western Atlantic"),
    ("Africa-Indian Ocean", "POLYGON ((35 -20, 55 -20, 55 5, 35 5, 35 -20))", "East African coast and Western Indian Ocean"),
    ("South America-Pacific", "POLYGON ((-85 -20, -65 -20, -65 5, -85 5, -85 -20))", "Western South America and Eastern Pacific"),
    ("Asia-Pacific", "POLYGON ((115 20, 140 20, 140 45, 115 45, 115 20))", "East Asian coast and Western Pacific"),
]

# Works that span more than two global regions (7 required)
MULTI_REGION_SPANS = [
    ("Global Ocean Survey", "MULTIPOLYGON (((-40 -10, -20 -10, -20 10, -40 10, -40 -10)), ((60 -20, 80 -20, 80 0, 60 0, 60 -20)), ((150 -30, 170 -30, 170 -10, 150 -10, 150 -30)))", "Atlantic, Indian, and Pacific Oceans"),
    ("Trans-Atlantic Research", "POLYGON ((-70 20, 10 20, 10 50, -70 50, -70 20))", "North America, Atlantic Ocean, and Europe"),
    ("African-Asian Monsoon Study", "POLYGON ((20 -10, 90 -10, 90 25, 20 25, 20 -10))", "Africa, Indian Ocean, and Asia"),
    ("Pan-Pacific Study", "POLYGON ((110 -40, -80 -40, -80 50, 110 50, 110 -40))", "Asia, Pacific Ocean, North America, South America, Australia"),
    ("Southern Hemisphere Ocean Study", "POLYGON ((-180 -60, 180 -60, 180 -35, -180 -35, -180 -60))", "Southern Ocean, Pacific, Atlantic, Indian Oceans, South America, Africa, Australia, Antarctica"),
    ("Arctic Circumpolar Study", "POLYGON ((-180 65, 180 65, 180 85, -180 85, -180 65))", "Arctic Ocean, North America, Europe, Asia"),
    ("Global Climate Network", "MULTIPOLYGON (((-120 30, -100 30, -100 45, -120 45, -120 30)), ((10 40, 30 40, 30 55, 10 55, 10 40)), ((120 -30, 140 -30, 140 -20, 120 -20, 120 -30)), ((-50 -20, -40 -20, -40 -10, -50 -10, -50 -20)))", "North America, Europe, Australia, South America"),
]
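
# Caution: WKT coordinates are planar lon/lat, so the Pan-Pacific Study ring
# above (longitude 110 across the date line to -80) is parsed by GEOS as the
# box [-80, 110] x [-40, 50] running through Greenwich, not as a polygon that
# crosses the antimeridian; split such geometries at 180 if true Pacific
# coverage matters for the region tests.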

def create_source(pk, name, issn_l=None, is_oa=True):
    """Create a source object."""
    # Decide once whether this source has an OpenAlex match, so the ID and
    # URL are always consistent with each other (~70% of sources get one).
    has_openalex = random.random() > 0.3
    return {
        "model": "publications.source",
        "pk": pk,
        "fields": {
            "name": name,
            "issn_l": issn_l,
            "openalex_id": f"https://openalex.org/S{pk}000000" if has_openalex else None,
            "openalex_url": f"https://api.openalex.org/sources/S{pk}000000" if has_openalex else None,
            "publisher_name": f"{name.split()[0]} Publishers",
            "works_count": random.randint(100, 5000),
            "homepage_url": f"http://{name.lower().replace(' ', '')}.example.org",
            "abbreviated_title": name[:15] + ".",
            "is_oa": is_oa,
            "cited_by_count": random.randint(500, 50000),
            "is_preprint": random.choice([True, False]),
        }
    }
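
# For example, create_source(2000, "Global Geoscience Journal", "2234-5678", True)
# returns a loaddata-style record shaped like (randomized fields vary per run):
#   {"model": "publications.source", "pk": 2000,
#    "fields": {"name": "Global Geoscience Journal", "issn_l": "2234-5678",
#               "openalex_id": "https://openalex.org/S2000000000", ...}}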

def create_publication(pk, source_pk, title, abstract, geometry_wkt, region_desc,
                       authors_idx, keywords_idx, topics_idx,
                       has_openalex=True, is_retracted=False, event_id=1000):
    """Create a publication object with varied metadata."""

    # Generate dates
    base_date = datetime(2020, 1, 1)
    pub_date = base_date + timedelta(days=random.randint(0, 1800))
    creation_date = pub_date + timedelta(days=random.randint(1, 30))

    # Select metadata
    authors = AUTHOR_SAMPLES[authors_idx % len(AUTHOR_SAMPLES)]
    keywords = KEYWORD_SAMPLES[keywords_idx % len(KEYWORD_SAMPLES)]
    topics = TOPIC_SAMPLES[topics_idx % len(TOPIC_SAMPLES)]

    # Generate DOI
    doi = f"10.5555/global-{pk}-{random.randint(1000, 9999)}"

    # OpenAlex fields
    openalex_id = None
    openalex_match_info = None
    openalex_fulltext_origin = None
    openalex_ids = None
    openalex_open_access_status = None

    if has_openalex:
        if random.random() > 0.2:  # 80% have full OpenAlex match
            openalex_id = f"https://openalex.org/W{3000000 + pk}"
            openalex_fulltext_origin = random.choice(["repository", "publisher", None])
            openalex_ids = json.dumps({"doi": f"https://doi.org/{doi}", "pmid": f"{38000000 + pk}" if random.random() > 0.5 else None})
            openalex_open_access_status = random.choice(OPENALEX_STATUS_SAMPLES)
        else:  # 20% have partial match info
            openalex_match_info = json.dumps([{
                "openalex_id": f"https://openalex.org/W{2900000 + pk}",
                "title": f"Similar Study {pk}",
                "doi": None,
                "match_type": "title"
            }])

    # Generate provenance
    source_name = f"Global Source {source_pk}"
    timestamp = creation_date.isoformat() + "Z"
    provenance_template = random.choice(PROVENANCE_TEMPLATES)
    provenance = provenance_template.format(
        source_name=source_name,
        timestamp=timestamp,
        event_id=event_id + pk
    )

    return {
        "model": "publications.publication",
        "pk": pk,
        "fields": {
            "status": random.choice(["p", "p", "p", "h", "c"]),  # mostly published
            "title": title,
            "abstract": abstract,
            "publicationDate": pub_date.strftime("%Y-%m-%d"),
            "doi": doi,
            "url": f"https://example.org/publications/{pk}",
            "geometry": f"SRID=4326;GEOMETRYCOLLECTION({geometry_wkt})",
            "creationDate": creation_date.isoformat() + "Z",
            "lastUpdate": (creation_date + timedelta(hours=random.randint(1, 48))).isoformat() + "Z",
            "source": source_pk,
            "timeperiod_startdate": f"[\"{pub_date.year - random.randint(1, 3)}\"]",
            "timeperiod_enddate": f"[\"{pub_date.year}\"]",
            "provenance": provenance,
            "authors": authors,
            "keywords": keywords,
            "topics": topics,
            "openalex_id": openalex_id,
            "openalex_match_info": openalex_match_info,
            "openalex_fulltext_origin": openalex_fulltext_origin,
            "openalex_is_retracted": is_retracted,
            "openalex_ids": openalex_ids,
            "openalex_open_access_status": openalex_open_access_status,
        }
    }
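
# For reference, the serialized geometry for the Africa work comes out as EWKT:
#   SRID=4326;GEOMETRYCOLLECTION(POLYGON ((10 -25, 40 -25, 40 30, 10 30, 10 -25)))
# which GeoDjango parses directly when the fixture is loaded with loaddata.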

def main():
    print("Creating comprehensive test_data_global_feeds.json fixture...")

    fixture_data = []
    pk_counter = 2000

    # Create a few diverse sources
    sources = [
        create_source(2000, "Global Geoscience Journal", "2234-5678", True),
        create_source(2001, "International Earth Sciences", "3345-6789", True),
        create_source(2002, "World Environmental Research", "4456-7890", False),
        create_source(2003, "Planetary Studies Quarterly", "5567-8901", True),
    ]
    fixture_data.extend(sources)

    print(f"\nCreating {len(sources)} sources...")

    # Track metadata distribution for similar patterns to optimap
    author_idx = 0
    keyword_idx = 0
    topic_idx = 0

    print("\n=== Creating works for each continent ===")
    for i, (region_name, geometry, description) in enumerate(CONTINENTS):
        pk = pk_counter
        pk_counter += 1
        source_pk_choice = 2000 + (i % len(sources))

        pub = create_publication(
            pk=pk,
            source_pk=source_pk_choice,
            title=f"Geological Survey of {region_name}",
            abstract=f"Comprehensive geological and environmental study covering {description}. This research examines {region_name.lower()} geology, climate patterns, and ecological systems.",
            geometry_wkt=geometry,
            region_desc=description,
            authors_idx=author_idx,
            keywords_idx=keyword_idx,
            topics_idx=topic_idx,
            has_openalex=True,
        )
        fixture_data.append(pub)
        print(f" [{pk}] {region_name}: {len(pub['fields']['authors'])} authors, {len(pub['fields']['keywords'])} keywords, {len(pub['fields']['topics'])} topics")

        author_idx += 1
        keyword_idx += 1
        topic_idx += 1

    print("\n=== Creating works for each ocean ===")
    for i, (region_name, geometry, description) in enumerate(OCEANS):
        pk = pk_counter
        pk_counter += 1
        source_pk_choice = 2000 + (i % len(sources))

        pub = create_publication(
            pk=pk,
            source_pk=source_pk_choice,
            title=f"Marine Biology and Oceanography of the {region_name}",
            abstract=f"Detailed oceanographic study of {description}. Research includes marine ecosystems, ocean currents, temperature patterns, and biodiversity in the {region_name.lower()}.",
            geometry_wkt=geometry,
            region_desc=description,
            authors_idx=author_idx,
            keywords_idx=keyword_idx,
            topics_idx=topic_idx,
            has_openalex=True,
        )
        fixture_data.append(pub)
        print(f" [{pk}] {region_name}: {len(pub['fields']['authors'])} authors, {len(pub['fields']['keywords'])} keywords, {len(pub['fields']['topics'])} topics")

        author_idx += 1
        keyword_idx += 1
        topic_idx += 1

    print("\n=== Creating works overlapping two neighboring regions ===")
    for i, (region_name, geometry, description) in enumerate(TWO_REGION_OVERLAPS):
        pk = pk_counter
        pk_counter += 1
        source_pk_choice = 2000 + (i % len(sources))

        pub = create_publication(
            pk=pk,
            source_pk=source_pk_choice,
            title=f"Cross-Regional Study: {region_name}",
            abstract=f"Cross-border environmental and geological research {description}. This study analyzes patterns that span multiple geographical regions.",
            geometry_wkt=geometry,
            region_desc=description,
            authors_idx=author_idx,
            keywords_idx=keyword_idx,
            topics_idx=topic_idx,
            has_openalex=True,
        )
        fixture_data.append(pub)
        print(f" [{pk}] {region_name}: {len(pub['fields']['authors'])} authors, {len(pub['fields']['keywords'])} keywords, {len(pub['fields']['topics'])} topics")

        author_idx += 1
        keyword_idx += 1
        topic_idx += 1

    print("\n=== Creating works spanning more than two regions ===")
    for i, (title_suffix, geometry, description) in enumerate(MULTI_REGION_SPANS):
        pk = pk_counter
        pk_counter += 1
        source_pk_choice = 2000 + (i % len(sources))

        pub = create_publication(
            pk=pk,
            source_pk=source_pk_choice,
            title=title_suffix,
            abstract=f"Large-scale multi-regional research project covering {description}. This comprehensive study examines global patterns and connections across multiple continents and oceans.",
            geometry_wkt=geometry,
            region_desc=description,
            authors_idx=author_idx,
            keywords_idx=keyword_idx,
            topics_idx=topic_idx,
            has_openalex=True,
        )
        fixture_data.append(pub)
        print(f" [{pk}] {title_suffix}: {len(pub['fields']['authors'])} authors, {len(pub['fields']['keywords'])} keywords, {len(pub['fields']['topics'])} topics")

        author_idx += 1
        keyword_idx += 1
        topic_idx += 1

    # Create backup of original
    fixture_path = "test_data_global_feeds.json"
    backup_path = fixture_path + ".backup"

    if os.path.exists(fixture_path):
        print(f"\n=== Creating backup: {backup_path} ===")
        shutil.copy(fixture_path, backup_path)

    # Write fixture
    print(f"\n=== Writing fixture to {fixture_path} ===")
    with open(fixture_path, "w") as f:
        json.dump(fixture_data, f, indent=2)

    # Calculate statistics
    publications = [item for item in fixture_data if item["model"] == "publications.publication"]

    with_authors = sum(1 for p in publications if p["fields"]["authors"])
    with_keywords = sum(1 for p in publications if p["fields"]["keywords"])
    with_topics = sum(1 for p in publications if p["fields"]["topics"])
    with_openalex = sum(1 for p in publications if p["fields"]["openalex_id"])
    is_retracted = sum(1 for p in publications if p["fields"]["openalex_is_retracted"])

    print("\n=== Summary ===")
    print(f"Total publications: {len(publications)}")
    print(f" - Continents: {len(CONTINENTS)}")
    print(f" - Oceans: {len(OCEANS)}")
    print(f" - Two-region overlaps: {len(TWO_REGION_OVERLAPS)}")
    print(f" - Multi-region spans: {len(MULTI_REGION_SPANS)}")
    print("\nMetadata coverage:")
    print(f" - With authors: {with_authors}/{len(publications)}")
    print(f" - With keywords: {with_keywords}/{len(publications)}")
    print(f" - With topics: {with_topics}/{len(publications)}")
    print(f" - With OpenAlex ID: {with_openalex}/{len(publications)}")
    print(f" - Retracted: {is_retracted}/{len(publications)}")

    # Calculate array field distributions
    authors_counts = Counter(len(p["fields"]["authors"]) for p in publications)
    keywords_counts = Counter(len(p["fields"]["keywords"]) for p in publications)
    topics_counts = Counter(len(p["fields"]["topics"]) for p in publications)

    print("\nArray field diversity:")
    print(f" - Authors distribution: {dict(sorted(authors_counts.items()))}")
    print(f" - Keywords distribution: {dict(sorted(keywords_counts.items()))}")
    print(f" - Topics distribution: {dict(sorted(topics_counts.items()))}")

    print("\n✓ Fixture creation complete!")
    print("\nTo load the fixture:")
    print(f" python manage.py loaddata {fixture_path}")

if __name__ == "__main__":
    main()
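
How the regenerated fixture might be exercised in a test — a minimal sketch, assuming a standard GeoDjango test setup in which the publications app defines the Publication model implied by the fixture's model labels and the generated file sits on a fixture search path; the test class, test names, and the Africa bounding box below are illustrative, not part of this commit:

from django.contrib.gis.geos import Polygon
from django.test import TestCase

from publications.models import Publication

class GlobalRegionFeedTests(TestCase):
    # Django loads the fixture before each test method.
    fixtures = ["test_data_global_feeds.json"]

    def test_fixture_loads_all_works(self):
        # 7 continents + 5 oceans + 5 two-region overlaps + 7 multi-region spans
        self.assertEqual(Publication.objects.count(), 24)

    def test_africa_work_sits_inside_africa(self):
        # Each fixture geometry is a single-member GEOMETRYCOLLECTION, so take
        # the first element to get a plain polygon for the predicate check
        # (GEOS predicates on heterogeneous collections can be unreliable).
        africa = Polygon(((-20, -35), (55, -35), (55, 40), (-20, 40), (-20, -35)), srid=4326)
        work = Publication.objects.get(title="Geological Survey of Africa")
        self.assertTrue(work.geometry[0].within(africa))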
