
Commit 8705efa

Adds RSS/Atom feed harvesting support
1 parent 3185776 commit 8705efa

8 files changed (+492, -20 lines)


.claude/settings.local.json

Lines changed: 3 additions & 1 deletion
@@ -5,7 +5,9 @@
       "Bash(git checkout:*)",
       "Bash(pip install:*)",
       "Bash(gh issue view:*)",
-      "Bash(pytest:*)"
+      "Bash(pytest:*)",
+      "Bash(pip search:*)",
+      "Bash(psql:*)"
     ],
     "deny": [],
     "ask": []

CHANGELOG.md

Lines changed: 20 additions & 2 deletions
@@ -4,8 +4,26 @@

 ### Added

-- Django management command `harvest_journals` for harvesting real OAI-PMH journal sources
-- Support for ESSD, AGILE-GISS, and GEO-LEO journals
+- **RSS/Atom feed harvesting support** (`publications/tasks.py`)
+  - `parse_rss_feed_and_save_publications()` function for parsing RSS/Atom feeds
+  - `harvest_rss_endpoint()` function for complete RSS harvesting workflow
+  - Support for RDF-based RSS feeds (Scientific Data journal)
+  - DOI extraction from multiple feed fields (prism:doi, dc:identifier)
+  - Duplicate detection by DOI and URL
+  - Abstract/description extraction from feed content
+- feedparser library integration (v6.0.12)
+  - Added to requirements.txt for RSS/Atom feed parsing
+  - Supports RSS 1.0/2.0, Atom, and RDF feeds
+- Django management command `harvest_journals` enhanced for RSS/Atom feeds
+  - Added Scientific Data journal with RSS feed support
+  - Support for both OAI-PMH and RSS/Atom feed types
+  - Automatic feed type detection based on journal configuration
+  - Now supports 4 journals: ESSD, AGILE-GISS, GEO-LEO (OAI-PMH), Scientific Data (RSS)
+- Comprehensive RSS harvesting tests (`RSSFeedHarvestingTests`)
+  - 7 test cases covering RSS parsing, duplicate detection, error handling
+  - Test fixture with sample RDF/RSS feed (`tests/harvesting/rss_feed_sample.xml`)
+  - Tests for max_records limit, invalid feeds, and HTTP errors
+- Django management command `harvest_journals` for harvesting real journal sources
 - Command-line options for journal selection, record limits, and source creation
 - Detailed progress reporting with colored output
 - Statistics for spatial/temporal metadata extraction
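The DOI-extraction behaviour listed above is implemented in `publications/tasks.py` further down this page. As a minimal standalone sketch of that field handling — the DOI pattern below is an illustrative stand-in for the project's own `DOI_REGEX`, and the feed URL is the one configured for Scientific Data in the commit:

```python
import re
import feedparser

# Illustrative pattern; the project defines its own DOI_REGEX elsewhere.
DOI_PATTERN = re.compile(r"10\.\d{4,9}/\S+")

feed = feedparser.parse("https://www.nature.com/sdata.rss")
for entry in feed.entries:
    doi = None
    if "prism_doi" in entry:                              # <prism:doi> element
        doi = entry.prism_doi.strip()
    elif "dc_identifier" in entry and "doi" in entry.dc_identifier.lower():
        match = DOI_PATTERN.search(entry.dc_identifier)   # <dc:identifier> element
        if match:
            doi = match.group(0)
    print(entry.get("title", ""), doi)
```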

README.md

Lines changed: 6 additions & 3 deletions
@@ -282,9 +282,12 @@ python manage.py harvest_journals --all --user-email [email protected]

 **Currently configured journals**:

-- `essd` - Earth System Science Data ([Issue #59](https://github.com/GeoinformationSystems/optimap/issues/59))
-- `agile-giss` - AGILE-GISS conference series ([Issue #60](https://github.com/GeoinformationSystems/optimap/issues/60))
-- `geo-leo` - GEO-LEO e-docs repository ([Issue #13](https://github.com/GeoinformationSystems/optimap/issues/13))
+- `essd` - Earth System Science Data (OAI-PMH) ([Issue #59](https://github.com/GeoinformationSystems/optimap/issues/59))
+- `agile-giss` - AGILE-GISS conference series (OAI-PMH) ([Issue #60](https://github.com/GeoinformationSystems/optimap/issues/60))
+- `geo-leo` - GEO-LEO e-docs repository (OAI-PMH) ([Issue #13](https://github.com/GeoinformationSystems/optimap/issues/13))
+- `scientific-data` - Scientific Data (RSS/Atom) ([Issue #58](https://github.com/GeoinformationSystems/optimap/issues/58))
+
+The command supports both OAI-PMH and RSS/Atom feeds, automatically detecting the feed type for each journal.

 The command provides detailed progress reporting including:
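
Not part of the diff, but as a usage note: with the `scientific-data` entry configured, the command form from the hunk header above covers both feed types in one run. A hedged Python equivalent, useful from scripts or tests — the e-mail address is a placeholder:

```python
from django.core.management import call_command

# Equivalent to: python manage.py harvest_journals --all --user-email <address>
# (--all and --user-email are the flags shown in the README excerpt above)
call_command("harvest_journals", "--all", "--user-email=curator@example.org")
```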

publications/management/commands/harvest_journals.py

Lines changed: 22 additions & 11 deletions
@@ -17,39 +17,44 @@
 from django.contrib.auth import get_user_model
 from django.utils import timezone
 from publications.models import Source, HarvestingEvent, Publication
-from publications.tasks import harvest_oai_endpoint
+from publications.tasks import harvest_oai_endpoint, harvest_rss_endpoint

 logger = logging.getLogger(__name__)
 User = get_user_model()

-# Journal configurations with OAI-PMH endpoints
+# Journal configurations with OAI-PMH and RSS/Atom endpoints
 JOURNAL_CONFIGS = {
     'essd': {
         'name': 'Earth System Science Data',
         'url': 'https://oai-pmh.copernicus.org/oai.php?verb=ListRecords&metadataPrefix=oai_dc&set=essd',
         'collection_name': 'ESSD',
         'homepage_url': 'https://essd.copernicus.org/',
         'publisher_name': 'Copernicus Publications',
-        'is_oa': True,
-        'issue': 59,
+        'feed_type': 'oai-pmh',
     },
     'agile-giss': {
         'name': 'AGILE-GISS',
         'url': 'https://oai-pmh.copernicus.org/oai.php?verb=ListRecords&metadataPrefix=oai_dc&set=agile-giss',
         'collection_name': 'AGILE-GISS',
         'homepage_url': 'https://www.agile-giscience-series.net/',
         'publisher_name': 'Copernicus Publications',
-        'is_oa': True,
-        'issue': 60,
+        'feed_type': 'oai-pmh',
     },
     'geo-leo': {
         'name': 'GEO-LEO e-docs',
         'url': 'https://e-docs.geo-leo.de/server/oai/request?verb=ListRecords&metadataPrefix=oai_dc',
         'collection_name': 'GEO-LEO',
         'homepage_url': 'https://e-docs.geo-leo.de/',
         'publisher_name': 'GEO-LEO',
-        'is_oa': True,
-        'issue': 13,
+        'feed_type': 'oai-pmh',
+    },
+    'scientific-data': {
+        'name': 'Scientific Data',
+        'url': 'https://www.nature.com/sdata.rss',
+        'collection_name': 'Scientific Data',
+        'homepage_url': 'https://www.nature.com/sdata/',
+        'publisher_name': 'Nature Publishing Group',
+        'feed_type': 'rss',
     },
 }

@@ -138,7 +143,6 @@ def handle(self, *args, **options):
         config = JOURNAL_CONFIGS[journal_key]

         self.stdout.write(self.style.WARNING(f'\n--- Harvesting: {config["name"]} ---'))
-        self.stdout.write(f'Issue: https://github.com/GeoinformationSystems/optimap/issues/{config["issue"]}')
         self.stdout.write(f'URL: {config["url"]}')
         if max_records:
             self.stdout.write(f'Max records: {max_records}')

@@ -147,9 +151,16 @@ def handle(self, *args, **options):
         # Find or create source
         source = self._get_or_create_source(config, create_sources)

-        # Harvest
+        # Harvest based on feed type
         harvest_start = timezone.now()
-        harvest_oai_endpoint(source.id, user=user, max_records=max_records)
+        feed_type = config.get('feed_type', 'oai-pmh')
+
+        if feed_type == 'rss':
+            self.stdout.write(f'Feed type: RSS/Atom')
+            harvest_rss_endpoint(source.id, user=user, max_records=max_records)
+        else:
+            self.stdout.write(f'Feed type: OAI-PMH')
+            harvest_oai_endpoint(source.id, user=user, max_records=max_records)

         # Get results
         event = HarvestingEvent.objects.filter(source=source).latest('started_at')
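
Not part of the commit: a sketch of how a further RSS source could be registered under this scheme. All values below are invented for illustration; the structure mirrors the entries in the diff, and the `feed_type` dispatch in `handle()` would route such an entry to `harvest_rss_endpoint` automatically.

```python
# Hypothetical example entry; journal name, URLs, and publisher are made up.
JOURNAL_CONFIGS['example-rss-journal'] = {
    'name': 'Example RSS Journal',
    'url': 'https://journal.example.org/articles/feed.rss',
    'collection_name': 'Example RSS Journal',
    'homepage_url': 'https://journal.example.org/',
    'publisher_name': 'Example Publisher',
    'feed_type': 'rss',
}
```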

publications/tasks.py

Lines changed: 213 additions & 2 deletions
@@ -157,11 +157,15 @@ def parse_oai_xml_and_save_publications(content, event: HarvestingEvent, max_rec
     processed_count = 0
     saved_count = 0

+    # Calculate progress reporting interval (every 10% of records)
+    total_records = len(records) if hasattr(records, '__len__') else None
+    log_interval = max(1, total_records // 10) if total_records else 10
+
     for rec in records:
         try:
             processed_count += 1
-            if processed_count % 10 == 0:
-                logger.debug("Processing record %d of %d", processed_count, len(records) if hasattr(records, '__len__') else '?')
+            if processed_count % log_interval == 0:
+                logger.debug("Processing record %d of %d", processed_count, total_records if total_records else '?')

             if hasattr(rec, "metadata"):
                 identifiers = rec.metadata.get("identifier", []) + rec.metadata.get("relation", [])

@@ -583,3 +587,210 @@ def regenerate_geopackage_cache():
     gpkg_path = convert_geojson_to_geopackage(geojson_path)
     cleanup_old_data_dumps(cache_dir, settings.DATA_DUMP_RETENTION)
     return gpkg_path
+
+
+# ============================================================================
+# RSS/Atom Feed Harvesting
+# ============================================================================
+
+def parse_rss_feed_and_save_publications(feed_url, event: 'HarvestingEvent', max_records=None):
+    """
+    Parse RSS/Atom feed and save publications.
+
+    Args:
+        feed_url: URL of the RSS/Atom feed
+        event: HarvestingEvent instance
+        max_records: Maximum number of records to process (optional)
+
+    Returns:
+        tuple: (processed_count, saved_count)
+    """
+    import feedparser
+
+    source = event.source
+    logger.info("Starting RSS/Atom feed parsing for source: %s", source.name)
+
+    try:
+        # Parse the feed
+        feed = feedparser.parse(feed_url)
+
+        if not feed or not hasattr(feed, 'entries'):
+            logger.error("Failed to parse RSS feed: %s", feed_url)
+            return 0, 0
+
+        entries = feed.entries
+        logger.info("Found %d entries in RSS feed", len(entries))
+
+        if not entries:
+            logger.warning("No entries found in RSS feed!")
+            return 0, 0
+
+        # Limit records if specified
+        if max_records:
+            entries = entries[:max_records]
+            logger.info("Limited to first %d records", max_records)
+
+        processed_count = 0
+        saved_count = 0
+
+        # Calculate progress reporting interval (every 10% of entries)
+        total_entries = len(entries)
+        log_interval = max(1, total_entries // 10)
+
+        for entry in entries:
+            try:
+                processed_count += 1
+                if processed_count % log_interval == 0:
+                    logger.debug("Processing entry %d of %d", processed_count, total_entries)
+
+                # Extract metadata from feed entry
+                title = entry.get('title', '').strip()
+                link = entry.get('link', entry.get('id', '')).strip()
+
+                # Extract DOI - try multiple fields
+                doi = None
+                if 'prism_doi' in entry:
+                    doi = entry.prism_doi.strip()
+                elif 'dc_identifier' in entry and 'doi' in entry.dc_identifier.lower():
+                    doi_match = DOI_REGEX.search(entry.dc_identifier)
+                    if doi_match:
+                        doi = doi_match.group(0)
+
+                # Extract date
+                published_date = None
+                date_str = entry.get('updated', entry.get('published', entry.get('dc_date')))
+                if date_str:
+                    if hasattr(date_str, 'strftime'):
+                        # It's already a datetime
+                        published_date = date_str.strftime('%Y-%m-%d')
+                    else:
+                        # Parse date string
+                        published_date = parse_publication_date(str(date_str))
+
+                # Extract abstract/description
+                abstract = ''
+                if 'summary' in entry:
+                    abstract = BeautifulSoup(entry.summary, 'html.parser').get_text()
+                elif 'content' in entry and entry.content:
+                    abstract = BeautifulSoup(entry.content[0].get('value', ''), 'html.parser').get_text()
+
+                # Skip if no title
+                if not title:
+                    logger.warning("Skipping entry with no title: %s", link)
+                    continue
+
+                # Skip if no URL/identifier
+                if not link:
+                    logger.warning("Skipping entry '%s' with no URL", title[:50])
+                    continue
+
+                logger.debug("Processing publication: %s", title[:50])
+
+                # Check for duplicates by DOI or URL
+                existing_pub = None
+                if doi:
+                    existing_pub = Publication.objects.filter(doi=doi).first()
+                if not existing_pub and link:
+                    existing_pub = Publication.objects.filter(url=link).first()
+
+                if existing_pub:
+                    logger.debug("Publication already exists: %s", title[:50])
+                    continue
+
+                # Create publication
+                pub = Publication(
+                    title=title,
+                    doi=doi,
+                    url=link,
+                    abstract=abstract[:5000] if abstract else None,  # Limit abstract length
+                    publicationDate=published_date,
+                    source=source,
+                    job=event,
+                    timeperiod_startdate=[],
+                    timeperiod_enddate=[],
+                    geometry=GeometryCollection(),  # No spatial data from RSS typically
+                )
+
+                pub.save()
+                saved_count += 1
+                logger.debug("Saved publication: %s", title[:50])
+
+            except Exception as e:
+                logger.error("Failed to process entry '%s': %s",
+                             entry.get('title', 'Unknown')[:50], str(e))
+                continue
+
+        logger.info("RSS feed parsing completed for source %s: processed %d entries, saved %d publications",
+                    source.name, processed_count, saved_count)
+        return processed_count, saved_count
+
+    except Exception as e:
+        logger.error("Failed to parse RSS feed %s: %s", feed_url, str(e))
+        return 0, 0
+
+
+def harvest_rss_endpoint(source_id, user=None, max_records=None):
+    """
+    Harvest publications from an RSS/Atom feed.
+
+    Args:
+        source_id: ID of the Source model instance
+        user: User who initiated the harvest (optional)
+        max_records: Maximum number of records to harvest (optional)
+    """
+    from publications.models import Source, HarvestingEvent, Publication
+
+    source = Source.objects.get(id=source_id)
+    event = HarvestingEvent.objects.create(source=source, status="in_progress")
+
+    try:
+        feed_url = source.url_field
+        logger.info("Fetching from RSS feed: %s", feed_url)
+
+        processed, saved = parse_rss_feed_and_save_publications(feed_url, event, max_records=max_records)
+
+        event.status = "completed"
+        event.completed_at = timezone.now()
+        event.save()
+
+        new_count = Publication.objects.filter(job=event).count()
+        spatial_count = Publication.objects.filter(job=event).exclude(geometry__isnull=True).count()
+        temporal_count = Publication.objects.filter(job=event).exclude(timeperiod_startdate=[]).count()
+
+        subject = f"RSS Feed Harvesting Completed for {source.name}"
+        completed_str = event.completed_at.strftime('%Y-%m-%d %H:%M:%S')
+        message = (
+            f"RSS/Atom feed harvesting job details:\n\n"
+            f"Number of added articles: {new_count}\n"
+            f"Number of articles with spatial metadata: {spatial_count}\n"
+            f"Number of articles with temporal metadata: {temporal_count}\n"
+            f"Source: {source.name}\n"
+            f"Feed URL: {source.url_field}\n"
+            f"Job started at: {event.started_at.strftime('%Y-%m-%d %H:%M:%S')}\n"
+            f"Job completed at: {completed_str}\n"
+        )
+
+        if user and user.email:
+            send_mail(
+                subject,
+                message,
+                settings.EMAIL_HOST_USER,
+                [user.email],
+                fail_silently=False,
+            )
+
+    except Exception as e:
+        logger.error("RSS feed harvesting failed for source %s: %s", source.url_field, str(e))
+        event.status = "failed"
+        event.completed_at = timezone.now()
+        event.save()
+
+        # Send failure notification
+        if user and user.email:
+            send_mail(
+                f"RSS Feed Harvesting Failed for {source.name}",
+                f"RSS feed harvesting failed for {source.name}\n\nError: {str(e)}\n\nFeed URL: {source.url_field}",
+                settings.EMAIL_HOST_USER,
+                [user.email],
+                fail_silently=True,
+            )
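
Not part of the diff: a hedged sketch of calling the new task directly, e.g. from a Django shell. It assumes a `Source` row for the feed already exists (created via the command's source-creation option or the admin) and that `url_field` — the attribute `harvest_rss_endpoint` reads the feed URL from — can be used as a lookup; the `HarvestingEvent` query mirrors the one in `harvest_journals.py`.

```python
from publications.models import Source, HarvestingEvent
from publications.tasks import harvest_rss_endpoint

# Look up the existing Source for the Scientific Data feed (assumed to exist).
source = Source.objects.get(url_field="https://www.nature.com/sdata.rss")

# Run the RSS harvest for a handful of records; user=None skips the e-mail report.
harvest_rss_endpoint(source.id, user=None, max_records=5)

# Inspect the resulting harvesting event, as the management command does.
event = HarvestingEvent.objects.filter(source=source).latest("started_at")
print(event.status, event.completed_at)
```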

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -40,3 +40,4 @@ pyalex>=0.4.0
 python-stdnum>=2.0.0
 geopy>=2.4.1
 oaipmh-scythe==0.13.0
+feedparser==6.0.12
