Skip to content

Commit 3a2c848

Browse files
committed
Infer from schema on error
1 parent 58bfb30 commit 3a2c848

File tree

3 files changed

+86
-42
lines changed

3 files changed

+86
-42
lines changed

aircan/dags/api_ckan_import_to_bq_v2.py

Lines changed: 14 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
# Local imports
99
from aircan.dependencies.google_cloud.bigquery_handler_v2 import bq_import_csv
10-
from aircan.dependencies.utils import aircan_status_update
10+
from aircan.dependencies.utils import aircan_status_update_nhs as aircan_status_update
1111

1212
# Third-party library imports
1313
from airflow import DAG
@@ -16,7 +16,7 @@
1616
from airflow.models import Variable
1717
from airflow.operators.python_operator import PythonOperator
1818
from airflow.utils.dates import days_ago
19-
19+
import traceback
2020

2121
args = {
2222
'start_date': days_ago(0),
@@ -71,29 +71,19 @@ def task_import_resource_to_bq(**context):
7171
logging.info('Importing %s to BQ %s' % (gc_file_url, bq_table_id))
7272
ckan_conf = context['params'].get('ckan_config', {})
7373
ckan_conf['resource_id'] = context['params'].get('resource', {}).get('ckan_resource_id')
74-
dag_run_id = context['dag_run'].run_id
74+
dag_run_id = context['run_id']
7575
res_id = ckan_conf.get('resource_id')
76-
try:
77-
bq_import_csv(bq_table_id, gc_file_url, schema, ckan_conf)
78-
status_dict = {
79-
'dag_run_id': dag_run_id,
80-
'resource_id': res_id,
81-
'state': 'complete',
82-
'message': 'Data ingestion completed successfully for "{res_id}".'.format(
83-
res_id=res_id),
84-
'clear_logs': True
85-
}
86-
aircan_status_update(ckan_site_url, ckan_api_key, status_dict)
87-
except Exception as e:
88-
status_dict = {
89-
'dag_run_id': dag_run_id,
90-
'resource_id': res_id,
91-
'state': 'failed',
92-
'message': str(e),
93-
'clear_logs': True
94-
}
95-
aircan_status_update(ckan_site_url, ckan_api_key, status_dict)
96-
raise Exception(str(e))
76+
ckan_conf['dag_run_id'] = dag_run_id
77+
bq_import_csv(bq_table_id, gc_file_url, schema, ckan_conf)
78+
status_dict = {
79+
'dag_run_id': dag_run_id,
80+
'resource_id': res_id,
81+
'state': 'complete',
82+
'message': 'Data ingestion completed successfully for "{res_id}".'.format(
83+
res_id=res_id),
84+
'clear_logs': True
85+
}
86+
aircan_status_update(ckan_site_url, ckan_api_key, status_dict)
9787

9888
import_resource_to_bq_task = PythonOperator(
9989
task_id='import_resource_to_bq_v2',

aircan/dependencies/google_cloud/bigquery_handler_v2.py

Lines changed: 35 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from google.cloud import bigquery
22
import google.api_core.exceptions
3-
from aircan.dependencies.utils import AirflowCKANException, aircan_status_update
3+
from aircan.dependencies.utils import AirflowCKANException, aircan_status_update_nhs as aircan_status_update
44
import json
55
import logging
66

@@ -13,34 +13,51 @@ def bq_import_csv(table_id, gcs_path, table_schema, ckan_conf):
1313
try:
1414
client = bigquery.Client()
1515

16-
job_config = bigquery.LoadJobConfig()
16+
try:
17+
job_config = bigquery.LoadJobConfig()
1718

18-
schema = bq_schema_from_table_schema(table_schema)
19-
job_config.schema = schema
19+
schema = bq_schema_from_table_schema(table_schema)
20+
job_config.schema = schema
2021

21-
job_config.skip_leading_rows = 1
22-
job_config.source_format = bigquery.SourceFormat.CSV
23-
# overwrite a Table
24-
job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
25-
# set 'True' for schema autodetect but turning it off since we define schema explicitly when publishing data using datapub
26-
# job_config.autodetect = True
27-
load_job = client.load_table_from_uri(
28-
gcs_path, table_id, job_config=job_config
29-
)
22+
job_config.skip_leading_rows = 1
23+
job_config.source_format = bigquery.SourceFormat.CSV
24+
# overwrite a Table
25+
job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
26+
# set 'True' for schema autodetect but turning it off since we define schema explicitly when publishing data using datapub
27+
# job_config.autodetect = True
28+
load_job = client.load_table_from_uri(
29+
gcs_path, table_id, job_config=job_config
30+
)
3031

31-
load_job.result() # Waits for table load to complete.
32-
destination_table = client.get_table(table_id)
32+
load_job.result() # Waits for table load to complete.
33+
destination_table = client.get_table(table_id)
34+
except Exception as e:
35+
job_config = bigquery.LoadJobConfig()
36+
37+
job_config.skip_leading_rows = 1
38+
job_config.source_format = bigquery.SourceFormat.CSV
39+
# overwrite a Table
40+
job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
41+
# set 'True' for schema autodetect but turning it off since we define schema explicitly when publishing data using datapub
42+
# job_config.autodetect = True
43+
load_job = client.load_table_from_uri(
44+
gcs_path, table_id, job_config=job_config
45+
)
46+
load_job.result() # Waits for table load to complete.
47+
destination_table = client.get_table(table_id)
3348
status_dict = {
3449
'res_id': ckan_conf.get('resource_id'),
3550
'state': 'progress',
36-
'message': 'Data ingestion is in progress.'
51+
'message': 'Data ingestion is in progress.',
52+
'dag_run_id': ckan_conf.get('dag_run_id')
3753
}
3854
aircan_status_update(ckan_conf.get('site_url'), ckan_conf.get('api_key'), status_dict)
3955
if destination_table:
4056
status_dict = {
4157
'res_id': ckan_conf.get('resource_id'),
4258
'state': 'complete',
43-
'message': "Ingession Completed"
59+
'message': "Ingession Completed",
60+
'dag_run_id': ckan_conf.get('dag_run_id')
4461
}
4562
aircan_status_update(ckan_conf.get('site_url'), ckan_conf.get('api_key'), status_dict)
4663
return {'success': True, 'message': 'BigQuery Table created successfully.'}
@@ -60,6 +77,7 @@ def bq_import_csv(table_id, gcs_path, table_schema, ckan_conf):
6077
logging.info(e)
6178
status_dict = {
6279
'res_id': ckan_conf.get('resource_id'),
80+
'dag_run_id': ckan_conf.get('dag_run_id'),
6381
'state': 'failed',
6482
'message': str(e)
6583
}

aircan/dependencies/utils.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,42 @@ def days_ago(n, hour=0, minute=0, second=0, microsecond=0):
7272
time(hour, minute, second, microsecond, tzinfo=timezone.TIMEZONE),
7373
)
7474

75+
def aircan_status_update_nhs (site_url, ckan_api_key, status_dict):
76+
"""
77+
Update aircan run status like pending, error, process, complete
78+
on ckan with message.
79+
"""
80+
logging.info('Updating data loading status')
81+
try:
82+
request_data = {
83+
'dag_run_id': status_dict.get('dag_run_id', ''),
84+
'resource_id': status_dict.get('res_id', ''),
85+
'state': status_dict.get('state', ''),
86+
'last_updated': str(datetime.utcnow()),
87+
'message': status_dict.get('message', ''),
88+
}
89+
90+
if status_dict.get('error', False):
91+
request_data.update({'error': {
92+
'message' : status_dict.get('error', '')
93+
}})
94+
95+
url = urljoin(site_url, '/api/3/action/aircan_status_update')
96+
response = requests.post(url,
97+
data=json.dumps(request_data),
98+
headers={'Content-Type': 'application/json',
99+
'Authorization': ckan_api_key})
100+
print(response.text)
101+
if response.status_code == 200:
102+
resource_json = response.json()
103+
logging.info('Loading status updated successfully in CKAN.')
104+
return {'success': True}
105+
else:
106+
print(response.json())
107+
return response.json()
108+
except Exception as e:
109+
logging.error('Failed to update status in CKAN. {0}'.format(e))
110+
75111
def aircan_status_update(site_url, ckan_api_key, status_dict):
76112
"""
77113
Update aircan run status like pending, error, process, complete
@@ -325,4 +361,4 @@ def join_path(path, *paths):
325361
"""
326362
for p in paths:
327363
path = os.path.join(path, p)
328-
return path
364+
return path

0 commit comments

Comments
 (0)