@@ -157,11 +157,15 @@ def parse_oai_xml_and_save_publications(content, event: HarvestingEvent, max_records=None):
     processed_count = 0
     saved_count = 0
 
+    # Calculate progress reporting interval (every 10% of records)
+    total_records = len(records) if hasattr(records, '__len__') else None
+    log_interval = max(1, total_records // 10) if total_records else 10
+
     for rec in records:
         try:
             processed_count += 1
-            if processed_count % 10 == 0:
-                logger.debug("Processing record %d of %d", processed_count, len(records) if hasattr(records, '__len__') else '?')
+            if processed_count % log_interval == 0:
+                logger.debug("Processing record %d of %s", processed_count, total_records if total_records else '?')
 
             if hasattr(rec, "metadata"):
                 identifiers = rec.metadata.get("identifier", []) + rec.metadata.get("relation", [])
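A quick standalone sketch of the new throttling math, using a hypothetical record count (unsized iterables fall back to the fixed interval of 10):

```python
# Hypothetical batch of 250 records: one debug line per 10% of progress.
total_records = 250
log_interval = max(1, total_records // 10)  # 25
ticks = [n for n in range(1, total_records + 1) if n % log_interval == 0]
print(log_interval, ticks[:3])  # 25 [25, 50, 75]
```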
@@ -583,3 +587,210 @@ def regenerate_geopackage_cache():
     gpkg_path = convert_geojson_to_geopackage(geojson_path)
     cleanup_old_data_dumps(cache_dir, settings.DATA_DUMP_RETENTION)
     return gpkg_path
+
+
+# ============================================================================
+# RSS/Atom Feed Harvesting
+# ============================================================================
+
+def parse_rss_feed_and_save_publications(feed_url, event: 'HarvestingEvent', max_records=None):
+    """
+    Parse an RSS/Atom feed and save its entries as publications.
+
+    Args:
+        feed_url: URL of the RSS/Atom feed
+        event: HarvestingEvent instance
+        max_records: Maximum number of records to process (optional)
+
+    Returns:
+        tuple: (processed_count, saved_count)
+    """
+    import feedparser
+
+    source = event.source
+    logger.info("Starting RSS/Atom feed parsing for source: %s", source.name)
+
+    try:
+        # Parse the feed
+        feed = feedparser.parse(feed_url)
+
+        if not feed or not hasattr(feed, 'entries'):
+            logger.error("Failed to parse RSS feed: %s", feed_url)
+            return 0, 0
+
+        entries = feed.entries
+        logger.info("Found %d entries in RSS feed", len(entries))
+
+        if not entries:
+            logger.warning("No entries found in RSS feed!")
+            return 0, 0
+
+        # Limit records if specified
+        if max_records:
+            entries = entries[:max_records]
+            logger.info("Limited to first %d records", max_records)
+
+        processed_count = 0
+        saved_count = 0
+
+        # Calculate progress reporting interval (every 10% of entries)
+        total_entries = len(entries)
+        log_interval = max(1, total_entries // 10)
+
+        for entry in entries:
+            try:
+                processed_count += 1
+                if processed_count % log_interval == 0:
+                    logger.debug("Processing entry %d of %d", processed_count, total_entries)
+
+                # Extract metadata from feed entry
+                title = entry.get('title', '').strip()
+                link = entry.get('link', entry.get('id', '')).strip()
+
+                # Extract DOI - try multiple fields
+                doi = None
+                if 'prism_doi' in entry:
+                    doi = entry.prism_doi.strip()
+                elif 'dc_identifier' in entry and 'doi' in entry.dc_identifier.lower():
+                    doi_match = DOI_REGEX.search(entry.dc_identifier)
+                    if doi_match:
+                        doi = doi_match.group(0)
+
+                # Extract date
+                published_date = None
+                date_str = entry.get('updated', entry.get('published', entry.get('dc_date')))
+                if date_str:
+                    if hasattr(date_str, 'strftime'):
+                        # It's already a datetime
+                        published_date = date_str.strftime('%Y-%m-%d')
+                    else:
+                        # Parse date string
+                        published_date = parse_publication_date(str(date_str))
+
+                # Extract abstract/description
+                abstract = ''
+                if 'summary' in entry:
+                    abstract = BeautifulSoup(entry.summary, 'html.parser').get_text()
+                elif 'content' in entry and entry.content:
+                    abstract = BeautifulSoup(entry.content[0].get('value', ''), 'html.parser').get_text()
+
+                # Skip if no title
+                if not title:
+                    logger.warning("Skipping entry with no title: %s", link)
+                    continue
+
+                # Skip if no URL/identifier
+                if not link:
+                    logger.warning("Skipping entry '%s' with no URL", title[:50])
+                    continue
+
+                logger.debug("Processing publication: %s", title[:50])
+
+                # Check for duplicates by DOI or URL
+                existing_pub = None
+                if doi:
+                    existing_pub = Publication.objects.filter(doi=doi).first()
+                if not existing_pub and link:
+                    existing_pub = Publication.objects.filter(url=link).first()
+
+                if existing_pub:
+                    logger.debug("Publication already exists: %s", title[:50])
+                    continue
+
+                # Create publication
+                pub = Publication(
+                    title=title,
+                    doi=doi,
+                    url=link,
+                    abstract=abstract[:5000] if abstract else None,  # Limit abstract length
+                    publicationDate=published_date,
+                    source=source,
+                    job=event,
+                    timeperiod_startdate=[],
+                    timeperiod_enddate=[],
+                    geometry=GeometryCollection(),  # RSS feeds typically carry no spatial data
+                )
+
+                pub.save()
+                saved_count += 1
+                logger.debug("Saved publication: %s", title[:50])
+
+            except Exception as e:
+                logger.error("Failed to process entry '%s': %s",
+                             entry.get('title', 'Unknown')[:50], str(e))
+                continue
+
+        logger.info("RSS feed parsing completed for source %s: processed %d entries, saved %d publications",
+                    source.name, processed_count, saved_count)
+        return processed_count, saved_count
+
+    except Exception as e:
+        logger.error("Failed to parse RSS feed %s: %s", feed_url, str(e))
+        return 0, 0
+
+
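For illustration, a standalone sketch of the feedparser access patterns the function above relies on. The feed XML and URLs are hypothetical, and whether `prism:doi` is flattened to a `prism_doi` key depends on feedparser recognizing the namespace URI:

```python
import feedparser

# Hypothetical one-item feed; feedparser.parse() also accepts a URL or file path.
raw = """<?xml version="1.0"?>
<rss version="2.0" xmlns:prism="http://prismstandard.org/namespaces/1.2/basic/">
  <channel>
    <title>Demo feed</title>
    <item>
      <title>Example article</title>
      <link>https://example.org/articles/1</link>
      <prism:doi>10.1234/demo.5678</prism:doi>
      <pubDate>Mon, 06 Sep 2021 16:45:00 +0000</pubDate>
    </item>
  </channel>
</rss>"""

feed = feedparser.parse(raw)
entry = feed.entries[0]
print(entry.get('title'))      # 'Example article'
print(entry.get('link'))       # 'https://example.org/articles/1'
print(entry.get('published'))  # raw date string, handed to parse_publication_date above
print('prism_doi' in entry)    # True only if feedparser knows this prism namespace URI
```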
+def harvest_rss_endpoint(source_id, user=None, max_records=None):
+    """
+    Harvest publications from an RSS/Atom feed.
+
+    Args:
+        source_id: ID of the Source model instance
+        user: User who initiated the harvest (optional)
+        max_records: Maximum number of records to harvest (optional)
+    """
+    from publications.models import Source, HarvestingEvent, Publication
+
+    source = Source.objects.get(id=source_id)
+    event = HarvestingEvent.objects.create(source=source, status="in_progress")
+
+    try:
+        feed_url = source.url_field
+        logger.info("Fetching from RSS feed: %s", feed_url)
+
+        processed, saved = parse_rss_feed_and_save_publications(feed_url, event, max_records=max_records)
+
+        event.status = "completed"
+        event.completed_at = timezone.now()
+        event.save()
+
+        new_count = Publication.objects.filter(job=event).count()
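+        # Note: geometry above is saved as an empty GeometryCollection rather than
+        # NULL, so this isnull-based count may include every harvested record.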
+        spatial_count = Publication.objects.filter(job=event).exclude(geometry__isnull=True).count()
+        temporal_count = Publication.objects.filter(job=event).exclude(timeperiod_startdate=[]).count()
+
+        subject = f"RSS Feed Harvesting Completed for {source.name}"
+        completed_str = event.completed_at.strftime('%Y-%m-%d %H:%M:%S')
+        message = (
+            f"RSS/Atom feed harvesting job details:\n\n"
+            f"Number of added articles: {new_count}\n"
+            f"Number of articles with spatial metadata: {spatial_count}\n"
+            f"Number of articles with temporal metadata: {temporal_count}\n"
+            f"Source: {source.name}\n"
+            f"Feed URL: {source.url_field}\n"
+            f"Job started at: {event.started_at.strftime('%Y-%m-%d %H:%M:%S')}\n"
+            f"Job completed at: {completed_str}\n"
+        )
+
+        if user and user.email:
+            send_mail(
+                subject,
+                message,
+                settings.EMAIL_HOST_USER,
+                [user.email],
+                fail_silently=False,
+            )
+
+    except Exception as e:
+        logger.error("RSS feed harvesting failed for source %s: %s", source.url_field, str(e))
+        event.status = "failed"
+        event.completed_at = timezone.now()
+        event.save()
+
+        # Send failure notification
+        if user and user.email:
+            send_mail(
+                f"RSS Feed Harvesting Failed for {source.name}",
+                f"RSS feed harvesting failed for {source.name}\n\nError: {str(e)}\n\nFeed URL: {source.url_field}",
+                settings.EMAIL_HOST_USER,
+                [user.email],
+                fail_silently=True,
+            )
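Finally, a minimal sketch of how the new entry point might be invoked, e.g. from a Django shell or management command. The module path and the Source fields used here are assumptions based on the code above:

```python
# Hypothetical setup: a Source whose url_field points at an RSS/Atom feed.
from publications.models import Source
from publications.tasks import harvest_rss_endpoint  # module path assumed

source = Source.objects.create(
    name="Example Journal",                    # hypothetical source
    url_field="https://example.org/feed.rss",  # hypothetical feed URL
)

# Harvest at most 100 entries; pass a user with an email address
# to receive the completion or failure notification.
harvest_rss_endpoint(source.id, user=None, max_records=100)
```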