Skip to content

Commit 7a00041

Browse files
authored
Add more detailed statuses (#298)
1 parent f792884 commit 7a00041

File tree

4 files changed

+9
-2
lines changed

4 files changed

+9
-2
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
- Fill in check throughout the process to send previous steps infos in case of late crash [#293](https://github.com/datagouv/hydra/pull/293)
88
- Fix purge csv tables CLI by using the csv db connection [#294](https://github.com/datagouv/hydra/pull/294)
99
- Better gz files extraction function name [#295](https://github.com/datagouv/hydra/pull/295)
10+
- Add more detailed statuses [#298](https://github.com/datagouv/hydra/pull/298)
1011

1112
## 2.3.0 (2025-07-15)
1213

udata_hydra/analysis/csv.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ async def analyse_csv(
119119
try:
120120
previous_analysis: dict | None = await get_previous_analysis(resource_id=resource_id)
121121
if previous_analysis:
122+
await Resource.update(resource_id, {"status": "VALIDATING_CSV"})
122123
csv_inspection, df = validate_then_detect(
123124
file_path=tmp_file.name,
124125
previous_analysis=previous_analysis,

udata_hydra/analysis/resource.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ async def analyse_resource(
6565

6666
# Update resource status to ANALYSING_RESOURCE_HEAD
6767
resource: Record | None = await Resource.update(
68-
resource_id, data={"status": "ANALYSING_RESOURCE"}
68+
resource_id, data={"status": "ANALYSING_RESOURCE_HEAD"}
6969
)
7070

7171
# let's see if we can infer a modification date on early hints based on harvest infos and headers
@@ -83,10 +83,12 @@ async def analyse_resource(
8383
tmp_file = None
8484
if change_status != Change.HAS_NOT_CHANGED or force_analysis:
8585
try:
86+
await Resource.update(resource_id, data={"status": "DOWNLOADING_RESOURCE"})
8687
tmp_file = await download_resource(url, headers, max_size_allowed)
8788
except IOException:
8889
dl_analysis["analysis:error"] = "File too large to download"
8990
else:
91+
await Resource.update(resource_id, data={"status": "ANALYSING_DOWNLOADED_RESOURCE"})
9092
# Get file size
9193
dl_analysis["analysis:content-length"] = os.path.getsize(tmp_file.name)
9294
# Get checksum

udata_hydra/db/resource.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,12 @@ class Resource:
1313
"BACKOFF": "backoff period for this domain, will be checked later",
1414
"CRAWLING_URL": "resource URL currently being crawled",
1515
"TO_ANALYSE_RESOURCE": "resource to be processed for change, type and size analysis",
16-
"ANALYSING_RESOURCE": "currently being processed for change, type and size analysis",
16+
"ANALYSING_RESOURCE_HEAD": "currently checking for change, type and size from headers",
17+
"DOWNLOADING_RESOURCE": "currently being downloaded",
18+
"ANALYSING_DOWNLOADED_RESOURCE": "currently checking for change, type and size from downloaded file",
1719
"TO_ANALYSE_CSV": "resource content to be analysed by CSV detective",
1820
"ANALYSING_CSV": "resource content currently being analysed by CSV detective",
21+
"VALIDATING_CSV": "resource content being validated using the previous analysis",
1922
"INSERTING_IN_DB": "currently being inserted in DB",
2023
"CONVERTING_TO_PARQUET": "currently being converted to Parquet",
2124
"TO_ANALYSE_GEOJSON": "geojson resource content to be analysed",

0 commit comments

Comments
 (0)