@@ -122,18 +122,20 @@ def create_dataset_stable_id(feed_stable_id, timestamp):
         """
         return f"{feed_stable_id}-{timestamp}"
 
-    def download_content(self, temporary_file_path):
+    def download_content(self, temporary_file_path, feed_id):
         """
         Downloads the content of a URL and return the hash of the file
         """
         file_hash = download_and_get_hash(
             self.producer_url,
-            temporary_file_path,
+            file_path=temporary_file_path,
+            feed_id=feed_id,
             authentication_type=self.authentication_type,
             api_key_parameter_name=self.api_key_parameter_name,
             credentials=self.feed_credentials,
             logger=self.logger,
         )
+        self.logger.info(f"hash is: {file_hash}")
         is_zip = zipfile.is_zipfile(temporary_file_path)
         return file_hash, is_zip
 
@@ -193,7 +195,7 @@ def upload_files_to_storage(
         )
         return blob, extracted_files
 
-    def upload_dataset(self, public=True) -> DatasetFile or None:
+    def upload_dataset(self, feed_id, public=True) -> DatasetFile or None:
         """
         Uploads a dataset to a GCP bucket as <feed_stable_id>/latest.zip and
         <feed_stable_id>/<feed_stable_id>-<upload_datetime>.zip
@@ -204,7 +206,7 @@ def upload_dataset(self, public=True) -> DatasetFile or None:
         try:
             self.logger.info("Accessing URL %s", self.producer_url)
             temp_file_path = self.generate_temp_filename()
-            file_sha256_hash, is_zip = self.download_content(temp_file_path)
+            file_sha256_hash, is_zip = self.download_content(temp_file_path, feed_id)
             if not is_zip:
                 self.logger.error(
                     f"[{self.feed_stable_id}] The downloaded file from {self.producer_url} is not a valid ZIP file."
@@ -417,12 +419,14 @@ def _get_unzipped_size(dataset_file):
         )
 
     @with_db_session
-    def process_from_producer_url(self, db_session) -> Optional[DatasetFile]:
+    def process_from_producer_url(
+        self, feed_id, db_session: Session
+    ) -> Optional[DatasetFile]:
         """
         Process the dataset and store new version in GCP bucket if any changes are detected
         :return: the DatasetFile object created
         """
-        dataset_file = self.upload_dataset()
+        dataset_file = self.upload_dataset(feed_id)
 
         if dataset_file is None:
             self.logger.info(f"[{self.feed_stable_id}] No database update required.")
@@ -543,7 +547,7 @@ def process_dataset(cloud_event: CloudEvent):
         if json_payload.get("use_bucket_latest", False):
             dataset_file = processor.process_from_bucket()
         else:
-            dataset_file = processor.process_from_producer_url(json_payload["feed_id"])
     except Exception as e:
         # This makes sure the logger is initialized
         logger = get_logger("process_dataset", stable_id if stable_id else "UNKNOWN")
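
Note (not part of the commit): the new feed_id value is threaded from the trigger payload through process_from_producer_url, upload_dataset, and download_content into download_and_get_hash. Below is a minimal, hypothetical test sketch of that forwarding; the module name main, the class name DatasetProcessor, and the use of __new__ to skip its constructor are assumptions made only for illustration.

# Hypothetical test sketch, not part of this commit: checks that download_content
# forwards feed_id to download_and_get_hash. Module and class names are assumptions.
import logging
from unittest import mock

import main  # assumed module containing the processor class and download_and_get_hash


def test_download_content_forwards_feed_id(tmp_path):
    # Build a bare processor instance; only the attributes that download_content
    # uses (as shown in this diff) are set.
    processor = main.DatasetProcessor.__new__(main.DatasetProcessor)  # assumed class name
    processor.producer_url = "https://example.com/feed.zip"
    processor.authentication_type = 0
    processor.api_key_parameter_name = None
    processor.feed_credentials = None
    processor.logger = logging.getLogger("test")

    with mock.patch.object(main, "download_and_get_hash", return_value="fake-hash") as dl:
        file_hash, is_zip = processor.download_content(str(tmp_path / "feed.zip"), "feed-123")

    assert file_hash == "fake-hash"
    assert is_zip is False  # nothing was written to disk, so it is not a zip file
    assert dl.call_args.kwargs["feed_id"] == "feed-123"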
0 commit comments