Add more detailed statuses (#298)

Pierlou · web-flow · commit 7a000419c3df · 2025-07-24T08:42:59.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,7 @@
 - Fill in check throughout the process to send previous steps infos in case of late crash [#293](https://github.com/datagouv/hydra/pull/293)
 - Fix purge csv tables CLI by using the csv db connection [#294](https://github.com/datagouv/hydra/pull/294)
 - Better gz files extraction function name [#295](https://github.com/datagouv/hydra/pull/295)
+- Add more detailed statuses [#298](https://github.com/datagouv/hydra/pull/298)
 
 ## 2.3.0 (2025-07-15)
 
diff --git a/udata_hydra/analysis/csv.py b/udata_hydra/analysis/csv.py
@@ -119,6 +119,7 @@ async def analyse_csv(
         try:
             previous_analysis: dict | None = await get_previous_analysis(resource_id=resource_id)
             if previous_analysis:
+                await Resource.update(resource_id, {"status": "VALIDATING_CSV"})
                 csv_inspection, df = validate_then_detect(
                     file_path=tmp_file.name,
                     previous_analysis=previous_analysis,
diff --git a/udata_hydra/analysis/resource.py b/udata_hydra/analysis/resource.py
@@ -65,7 +65,7 @@ async def analyse_resource(
 
     # Update resource status to ANALYSING_RESOURCE
     resource: Record | None = await Resource.update(
-        resource_id, data={"status": "ANALYSING_RESOURCE"}
+        resource_id, data={"status": "ANALYSING_RESOURCE_HEAD"}
     )
 
     # let's see if we can infer a modification date on early hints based on harvest infos and headers
@@ -83,10 +83,12 @@ async def analyse_resource(
     tmp_file = None
     if change_status != Change.HAS_NOT_CHANGED or force_analysis:
         try:
+            await Resource.update(resource_id, data={"status": "DOWNLOADING_RESOURCE"})
             tmp_file = await download_resource(url, headers, max_size_allowed)
         except IOException:
             dl_analysis["analysis:error"] = "File too large to download"
         else:
+            await Resource.update(resource_id, data={"status": "ANALYSING_DOWNLOADED_RESOURCE"})
             # Get file size
             dl_analysis["analysis:content-length"] = os.path.getsize(tmp_file.name)
             # Get checksum
diff --git a/udata_hydra/db/resource.py b/udata_hydra/db/resource.py
@@ -13,9 +13,12 @@ class Resource:
         "BACKOFF": "backoff period for this domain, will be checked later",
         "CRAWLING_URL": "resource URL currently being crawled",
         "TO_ANALYSE_RESOURCE": "resource to be processed for change, type and size analysis",
-        "ANALYSING_RESOURCE": "currently being processed for change, type and size analysis",
+        "ANALYSING_RESOURCE_HEAD": "currently checking for change, type and size from headers",
+        "DOWNLOADING_RESOURCE": "currently being downloaded",
+        "ANALYSING_DOWNLOADED_RESOURCE": "currently checking for change, type and size from downloaded file",
         "TO_ANALYSE_CSV": "resource content to be analysed by CSV detective",
         "ANALYSING_CSV": "resource content currently being analysed by CSV detective",
+        "VALIDATING_CSV": "resource content being validated using the previous analysis",
         "INSERTING_IN_DB": "currently being inserted in DB",
         "CONVERTING_TO_PARQUET": "currently being converted to Parquet",
         "TO_ANALYSE_GEOJSON": "geojson resource content to be analysed",