
Commit 8705efa

Adds RSS/Atom feed harvesting support
1 parent 3185776 commit 8705efa

8 files changed (+492, -20 lines)


.claude/settings.local.json

Lines changed: 3 additions & 1 deletion
@@ -5,7 +5,9 @@
       "Bash(git checkout:*)",
       "Bash(pip install:*)",
       "Bash(gh issue view:*)",
-      "Bash(pytest:*)"
+      "Bash(pytest:*)",
+      "Bash(pip search:*)",
+      "Bash(psql:*)"
     ],
     "deny": [],
     "ask": []

CHANGELOG.md

Lines changed: 20 additions & 2 deletions
@@ -4,8 +4,26 @@

 ### Added

-- Django management command `harvest_journals` for harvesting real OAI-PMH journal sources
-- Support for ESSD, AGILE-GISS, and GEO-LEO journals
+- **RSS/Atom feed harvesting support** (`publications/tasks.py`)
+  - `parse_rss_feed_and_save_publications()` function for parsing RSS/Atom feeds
+  - `harvest_rss_endpoint()` function for complete RSS harvesting workflow
+  - Support for RDF-based RSS feeds (Scientific Data journal)
+  - DOI extraction from multiple feed fields (prism:doi, dc:identifier)
+  - Duplicate detection by DOI and URL
+  - Abstract/description extraction from feed content
+- feedparser library integration (v6.0.12)
+  - Added to requirements.txt for RSS/Atom feed parsing
+  - Supports RSS 1.0/2.0, Atom, and RDF feeds
+- Django management command `harvest_journals` enhanced for RSS/Atom feeds
+  - Added Scientific Data journal with RSS feed support
+  - Support for both OAI-PMH and RSS/Atom feed types
+  - Automatic feed type detection based on journal configuration
+  - Now supports 4 journals: ESSD, AGILE-GISS, GEO-LEO (OAI-PMH), Scientific Data (RSS)
+- Comprehensive RSS harvesting tests (`RSSFeedHarvestingTests`)
+  - 7 test cases covering RSS parsing, duplicate detection, error handling
+  - Test fixture with sample RDF/RSS feed (`tests/harvesting/rss_feed_sample.xml`)
+  - Tests for max_records limit, invalid feeds, and HTTP errors
+- Django management command `harvest_journals` for harvesting real journal sources
 - Command-line options for journal selection, record limits, and source creation
 - Detailed progress reporting with colored output
 - Statistics for spatial/temporal metadata extraction
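The DOI-extraction behaviour listed above is implemented in `publications/tasks.py` further down this page. As a minimal standalone sketch of that field handling — the DOI pattern below is an illustrative stand-in for the project's own `DOI_REGEX`, and the feed URL is the one configured for Scientific Data in the commit:

```python
import re
import feedparser

# Illustrative pattern; the project defines its own DOI_REGEX elsewhere.
DOI_PATTERN = re.compile(r"10\.\d{4,9}/\S+")

feed = feedparser.parse("https://www.nature.com/sdata.rss")
for entry in feed.entries:
    doi = None
    if "prism_doi" in entry:                              # <prism:doi> element
        doi = entry.prism_doi.strip()
    elif "dc_identifier" in entry and "doi" in entry.dc_identifier.lower():
        match = DOI_PATTERN.search(entry.dc_identifier)   # <dc:identifier> element
        if match:
            doi = match.group(0)
    print(entry.get("title", ""), doi)
```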

README.md

Lines changed: 6 additions & 3 deletions
@@ -282,9 +282,12 @@ python manage.py harvest_journals --all --user-email [email protected]

 **Currently configured journals**:

-- `essd` - Earth System Science Data ([Issue #59](https://github.com/GeoinformationSystems/optimap/issues/59))
-- `agile-giss` - AGILE-GISS conference series ([Issue #60](https://github.com/GeoinformationSystems/optimap/issues/60))
-- `geo-leo` - GEO-LEO e-docs repository ([Issue #13](https://github.com/GeoinformationSystems/optimap/issues/13))
+- `essd` - Earth System Science Data (OAI-PMH) ([Issue #59](https://github.com/GeoinformationSystems/optimap/issues/59))
+- `agile-giss` - AGILE-GISS conference series (OAI-PMH) ([Issue #60](https://github.com/GeoinformationSystems/optimap/issues/60))
+- `geo-leo` - GEO-LEO e-docs repository (OAI-PMH) ([Issue #13](https://github.com/GeoinformationSystems/optimap/issues/13))
+- `scientific-data` - Scientific Data (RSS/Atom) ([Issue #58](https://github.com/GeoinformationSystems/optimap/issues/58))
+
+The command supports both OAI-PMH and RSS/Atom feeds, automatically detecting the feed type for each journal.

 The command provides detailed progress reporting including:
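
Not part of the diff, but as a usage note: with the `scientific-data` entry configured, the command form from the hunk header above covers both feed types in one run. A hedged Python equivalent, useful from scripts or tests — the e-mail address is a placeholder:

```python
from django.core.management import call_command

# Equivalent to: python manage.py harvest_journals --all --user-email <address>
# (--all and --user-email are the flags shown in the README excerpt above)
call_command("harvest_journals", "--all", "--user-email=curator@example.org")
```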

publications/management/commands/harvest_journals.py

Lines changed: 22 additions & 11 deletions
@@ -17,39 +17,44 @@
 from django.contrib.auth import get_user_model
 from django.utils import timezone
 from publications.models import Source, HarvestingEvent, Publication
-from publications.tasks import harvest_oai_endpoint
+from publications.tasks import harvest_oai_endpoint, harvest_rss_endpoint

 logger = logging.getLogger(__name__)
 User = get_user_model()

-# Journal configurations with OAI-PMH endpoints
+# Journal configurations with OAI-PMH and RSS/Atom endpoints
 JOURNAL_CONFIGS = {
     'essd': {
         'name': 'Earth System Science Data',
         'url': 'https://oai-pmh.copernicus.org/oai.php?verb=ListRecords&metadataPrefix=oai_dc&set=essd',
         'collection_name': 'ESSD',
         'homepage_url': 'https://essd.copernicus.org/',
         'publisher_name': 'Copernicus Publications',
-        'is_oa': True,
-        'issue': 59,
+        'feed_type': 'oai-pmh',
     },
     'agile-giss': {
         'name': 'AGILE-GISS',
         'url': 'https://oai-pmh.copernicus.org/oai.php?verb=ListRecords&metadataPrefix=oai_dc&set=agile-giss',
         'collection_name': 'AGILE-GISS',
         'homepage_url': 'https://www.agile-giscience-series.net/',
         'publisher_name': 'Copernicus Publications',
-        'is_oa': True,
-        'issue': 60,
+        'feed_type': 'oai-pmh',
     },
     'geo-leo': {
         'name': 'GEO-LEO e-docs',
         'url': 'https://e-docs.geo-leo.de/server/oai/request?verb=ListRecords&metadataPrefix=oai_dc',
         'collection_name': 'GEO-LEO',
         'homepage_url': 'https://e-docs.geo-leo.de/',
         'publisher_name': 'GEO-LEO',
-        'is_oa': True,
-        'issue': 13,
+        'feed_type': 'oai-pmh',
+    },
+    'scientific-data': {
+        'name': 'Scientific Data',
+        'url': 'https://www.nature.com/sdata.rss',
+        'collection_name': 'Scientific Data',
+        'homepage_url': 'https://www.nature.com/sdata/',
+        'publisher_name': 'Nature Publishing Group',
+        'feed_type': 'rss',
     },
 }

@@ -138,7 +143,6 @@ def handle(self, *args, **options):
         config = JOURNAL_CONFIGS[journal_key]

         self.stdout.write(self.style.WARNING(f'\n--- Harvesting: {config["name"]} ---'))
-        self.stdout.write(f'Issue: https://github.com/GeoinformationSystems/optimap/issues/{config["issue"]}')
         self.stdout.write(f'URL: {config["url"]}')
         if max_records:
             self.stdout.write(f'Max records: {max_records}')

@@ -147,9 +151,16 @@ def handle(self, *args, **options):
         # Find or create source
         source = self._get_or_create_source(config, create_sources)

-        # Harvest
+        # Harvest based on feed type
         harvest_start = timezone.now()
-        harvest_oai_endpoint(source.id, user=user, max_records=max_records)
+        feed_type = config.get('feed_type', 'oai-pmh')
+
+        if feed_type == 'rss':
+            self.stdout.write(f'Feed type: RSS/Atom')
+            harvest_rss_endpoint(source.id, user=user, max_records=max_records)
+        else:
+            self.stdout.write(f'Feed type: OAI-PMH')
+            harvest_oai_endpoint(source.id, user=user, max_records=max_records)

         # Get results
         event = HarvestingEvent.objects.filter(source=source).latest('started_at')
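
Not part of the commit: a sketch of how a further RSS source could be registered under this scheme. All values below are invented for illustration; the structure mirrors the entries in the diff, and the `feed_type` dispatch in `handle()` would route such an entry to `harvest_rss_endpoint` automatically.

```python
# Hypothetical example entry; journal name, URLs, and publisher are made up.
JOURNAL_CONFIGS['example-rss-journal'] = {
    'name': 'Example RSS Journal',
    'url': 'https://journal.example.org/articles/feed.rss',
    'collection_name': 'Example RSS Journal',
    'homepage_url': 'https://journal.example.org/',
    'publisher_name': 'Example Publisher',
    'feed_type': 'rss',
}
```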

publications/tasks.py

Lines changed: 213 additions & 2 deletions
@@ -157,11 +157,15 @@ def parse_oai_xml_and_save_publications(content, event: HarvestingEvent, max_rec
     processed_count = 0
     saved_count = 0

+    # Calculate progress reporting interval (every 10% of records)
+    total_records = len(records) if hasattr(records, '__len__') else None
+    log_interval = max(1, total_records // 10) if total_records else 10
+
     for rec in records:
         try:
             processed_count += 1
-            if processed_count % 10 == 0:
-                logger.debug("Processing record %d of %d", processed_count, len(records) if hasattr(records, '__len__') else '?')
+            if processed_count % log_interval == 0:
+                logger.debug("Processing record %d of %d", processed_count, total_records if total_records else '?')

             if hasattr(rec, "metadata"):
                 identifiers = rec.metadata.get("identifier", []) + rec.metadata.get("relation", [])

@@ -583,3 +587,210 @@ def regenerate_geopackage_cache():
     gpkg_path = convert_geojson_to_geopackage(geojson_path)
     cleanup_old_data_dumps(cache_dir, settings.DATA_DUMP_RETENTION)
     return gpkg_path
+
+
+# ============================================================================
+# RSS/Atom Feed Harvesting
+# ============================================================================
+
+def parse_rss_feed_and_save_publications(feed_url, event: 'HarvestingEvent', max_records=None):
+    """
+    Parse RSS/Atom feed and save publications.
+
+    Args:
+        feed_url: URL of the RSS/Atom feed
+        event: HarvestingEvent instance
+        max_records: Maximum number of records to process (optional)
+
+    Returns:
+        tuple: (processed_count, saved_count)
+    """
+    import feedparser
+
+    source = event.source
+    logger.info("Starting RSS/Atom feed parsing for source: %s", source.name)
+
+    try:
+        # Parse the feed
+        feed = feedparser.parse(feed_url)
+
+        if not feed or not hasattr(feed, 'entries'):
+            logger.error("Failed to parse RSS feed: %s", feed_url)
+            return 0, 0
+
+        entries = feed.entries
+        logger.info("Found %d entries in RSS feed", len(entries))
+
+        if not entries:
+            logger.warning("No entries found in RSS feed!")
+            return 0, 0
+
+        # Limit records if specified
+        if max_records:
+            entries = entries[:max_records]
+            logger.info("Limited to first %d records", max_records)
+
+        processed_count = 0
+        saved_count = 0
+
+        # Calculate progress reporting interval (every 10% of entries)
+        total_entries = len(entries)
+        log_interval = max(1, total_entries // 10)
+
+        for entry in entries:
+            try:
+                processed_count += 1
+                if processed_count % log_interval == 0:
+                    logger.debug("Processing entry %d of %d", processed_count, total_entries)
+
+                # Extract metadata from feed entry
+                title = entry.get('title', '').strip()
+                link = entry.get('link', entry.get('id', '')).strip()
+
+                # Extract DOI - try multiple fields
+                doi = None
+                if 'prism_doi' in entry:
+                    doi = entry.prism_doi.strip()
+                elif 'dc_identifier' in entry and 'doi' in entry.dc_identifier.lower():
+                    doi_match = DOI_REGEX.search(entry.dc_identifier)
+                    if doi_match:
+                        doi = doi_match.group(0)
+
+                # Extract date
+                published_date = None
+                date_str = entry.get('updated', entry.get('published', entry.get('dc_date')))
+                if date_str:
+                    if hasattr(date_str, 'strftime'):
+                        # It's already a datetime
+                        published_date = date_str.strftime('%Y-%m-%d')
+                    else:
+                        # Parse date string
+                        published_date = parse_publication_date(str(date_str))
+
+                # Extract abstract/description
+                abstract = ''
+                if 'summary' in entry:
+                    abstract = BeautifulSoup(entry.summary, 'html.parser').get_text()
+                elif 'content' in entry and entry.content:
+                    abstract = BeautifulSoup(entry.content[0].get('value', ''), 'html.parser').get_text()
+
+                # Skip if no title
+                if not title:
+                    logger.warning("Skipping entry with no title: %s", link)
+                    continue
+
+                # Skip if no URL/identifier
+                if not link:
+                    logger.warning("Skipping entry '%s' with no URL", title[:50])
+                    continue
+
+                logger.debug("Processing publication: %s", title[:50])
+
+                # Check for duplicates by DOI or URL
+                existing_pub = None
+                if doi:
+                    existing_pub = Publication.objects.filter(doi=doi).first()
+                if not existing_pub and link:
+                    existing_pub = Publication.objects.filter(url=link).first()
+
+                if existing_pub:
+                    logger.debug("Publication already exists: %s", title[:50])
+                    continue
+
+                # Create publication
+                pub = Publication(
+                    title=title,
+                    doi=doi,
+                    url=link,
+                    abstract=abstract[:5000] if abstract else None,  # Limit abstract length
+                    publicationDate=published_date,
+                    source=source,
+                    job=event,
+                    timeperiod_startdate=[],
+                    timeperiod_enddate=[],
+                    geometry=GeometryCollection(),  # No spatial data from RSS typically
+                )
+
+                pub.save()
+                saved_count += 1
+                logger.debug("Saved publication: %s", title[:50])
+
+            except Exception as e:
+                logger.error("Failed to process entry '%s': %s",
+                             entry.get('title', 'Unknown')[:50], str(e))
+                continue
+
+        logger.info("RSS feed parsing completed for source %s: processed %d entries, saved %d publications",
+                    source.name, processed_count, saved_count)
+        return processed_count, saved_count
+
+    except Exception as e:
+        logger.error("Failed to parse RSS feed %s: %s", feed_url, str(e))
+        return 0, 0
+
+
+def harvest_rss_endpoint(source_id, user=None, max_records=None):
+    """
+    Harvest publications from an RSS/Atom feed.
+
+    Args:
+        source_id: ID of the Source model instance
+        user: User who initiated the harvest (optional)
+        max_records: Maximum number of records to harvest (optional)
+    """
+    from publications.models import Source, HarvestingEvent, Publication
+
+    source = Source.objects.get(id=source_id)
+    event = HarvestingEvent.objects.create(source=source, status="in_progress")
+
+    try:
+        feed_url = source.url_field
+        logger.info("Fetching from RSS feed: %s", feed_url)
+
+        processed, saved = parse_rss_feed_and_save_publications(feed_url, event, max_records=max_records)
+
+        event.status = "completed"
+        event.completed_at = timezone.now()
+        event.save()
+
+        new_count = Publication.objects.filter(job=event).count()
+        spatial_count = Publication.objects.filter(job=event).exclude(geometry__isnull=True).count()
+        temporal_count = Publication.objects.filter(job=event).exclude(timeperiod_startdate=[]).count()
+
+        subject = f"RSS Feed Harvesting Completed for {source.name}"
+        completed_str = event.completed_at.strftime('%Y-%m-%d %H:%M:%S')
+        message = (
+            f"RSS/Atom feed harvesting job details:\n\n"
+            f"Number of added articles: {new_count}\n"
+            f"Number of articles with spatial metadata: {spatial_count}\n"
+            f"Number of articles with temporal metadata: {temporal_count}\n"
+            f"Source: {source.name}\n"
+            f"Feed URL: {source.url_field}\n"
+            f"Job started at: {event.started_at.strftime('%Y-%m-%d %H:%M:%S')}\n"
+            f"Job completed at: {completed_str}\n"
+        )
+
+        if user and user.email:
+            send_mail(
+                subject,
+                message,
+                settings.EMAIL_HOST_USER,
+                [user.email],
+                fail_silently=False,
+            )
+
+    except Exception as e:
+        logger.error("RSS feed harvesting failed for source %s: %s", source.url_field, str(e))
+        event.status = "failed"
+        event.completed_at = timezone.now()
+        event.save()
+
+        # Send failure notification
+        if user and user.email:
+            send_mail(
+                f"RSS Feed Harvesting Failed for {source.name}",
+                f"RSS feed harvesting failed for {source.name}\n\nError: {str(e)}\n\nFeed URL: {source.url_field}",
+                settings.EMAIL_HOST_USER,
+                [user.email],
+                fail_silently=True,
+            )
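
Not part of the diff: a hedged sketch of calling the new task directly, e.g. from a Django shell. It assumes a `Source` row for the feed already exists (created via the command's source-creation option or the admin) and that `url_field` — the attribute `harvest_rss_endpoint` reads the feed URL from — can be used as a lookup; the `HarvestingEvent` query mirrors the one in `harvest_journals.py`.

```python
from publications.models import Source, HarvestingEvent
from publications.tasks import harvest_rss_endpoint

# Look up the existing Source for the Scientific Data feed (assumed to exist).
source = Source.objects.get(url_field="https://www.nature.com/sdata.rss")

# Run the RSS harvest for a handful of records; user=None skips the e-mail report.
harvest_rss_endpoint(source.id, user=None, max_records=5)

# Inspect the resulting harvesting event, as the management command does.
event = HarvestingEvent.objects.filter(source=source).latest("started_at")
print(event.status, event.completed_at)
```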

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -40,3 +40,4 @@ pyalex>=0.4.0
 python-stdnum>=2.0.0
 geopy>=2.4.1
 oaipmh-scythe==0.13.0
+feedparser==6.0.12
