Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions env.example
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ PIP_INDEX_URL=https://pypi.python.org/simple/
# Ingestion Parameters
BSS_METADATA_WORKBOOK='gdrive://Database Design/BSS Metadata' # 15XVXFjbom1sScVXbsetnbgAnPpRux2AgNy8w5U8bXdI
BSS_METADATA_STORAGE_OPTIONS='{"token": "service_account", "access": "read_only", "creds": ${GOOGLE_APPLICATION_CREDENTIALS}, "root_file_id": "0AOJ0gJ8sjnO7Uk9PVA"}'
BSS_LABEL_RECOGNITION_WORKBOOK=./BSS_Labels.xlsx # or 'gdrive://Database Design/BSS Labels (${ENV}).xlsx'
BSS_LABEL_RECOGNITION_STORAGE_OPTIONS='{}' # or '{"token": "service_account", "access": "full_control", "creds": ${GOOGLE_APPLICATION_CREDENTIALS}, "root_file_id": "0AOJ0gJ8sjnO7Uk9PVA"}'
BSS_FILES_FOLDER='gdrive://Discovery Folder/Baseline Storage Sheets (BSS)'
BSS_FILES_STORAGE_OPTIONS='{"token": "service_account", "access": "read_only", "creds": ${GOOGLE_APPLICATION_CREDENTIALS}, "root_file_id": "0AOJ0gJ8sjnO7Uk9PVA"}'

Expand Down
2 changes: 2 additions & 0 deletions pipelines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
livelihood_activity_fixture,
livelihood_activity_instances,
livelihood_activity_label_dataframe,
livelihood_activity_label_recognition_dataframe,
livelihood_activity_valid_instances,
livelihood_summary_dataframe,
livelihood_summary_label_dataframe,
Expand Down Expand Up @@ -93,6 +94,7 @@
livelihood_summary_label_dataframe,
all_livelihood_summary_labels_dataframe,
summary_livelihood_summary_labels_dataframe,
livelihood_activity_label_recognition_dataframe,
livelihood_activity_instances,
livelihood_activity_valid_instances,
livelihood_activity_fixture,
Expand Down
312 changes: 169 additions & 143 deletions pipelines/assets/fixtures.py

Large diffs are not rendered by default.

237 changes: 167 additions & 70 deletions pipelines/assets/livelihood_activity.py

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions pipelines/assets/livelihood_activity_regexes.json
Original file line number Diff line number Diff line change
Expand Up @@ -703,6 +703,24 @@
true,
"quantity_produced"
],
[
"(?:wild foods?{separator_pattern} )?{product_pattern}{separator_pattern} \\(?{unit_of_measure_pattern} gathered\\)?",
null,
true,
"quantity_produced"
],
[
"(?:fish|fish \\(?dry\\)?|fish \\(?fresh\\)?){separator_pattern} {product_pattern}{separator_pattern} \\(?{unit_of_measure_pattern} gathered\\)?",
null,
true,
"quantity_produced"
],
[
"{product_pattern}{separator_pattern}\\(?{unit_of_measure_pattern} gathered\\)?",
null,
true,
"quantity_produced"
],
[
"{product_pattern} (?P<season>[1|2]è[m|r]e récolte){separator_pattern} {nbr_pattern} mois",
null,
Expand Down
35 changes: 9 additions & 26 deletions pipelines/assets/other_cash_income.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,11 @@
| 32 | income | | | | | | | | | |
""" # NOQA: E501

import json
import os

import django
import pandas as pd
from dagster import AssetExecutionContext, MetadataValue, Output, asset
from dagster import AssetExecutionContext, Output, asset

from ..configs import BSSMetadataConfig
from ..partitions import bss_instances_partitions_def
Expand Down Expand Up @@ -127,29 +126,30 @@ def summary_other_cash_income_labels_dataframe(
@asset(partitions_def=bss_instances_partitions_def, io_manager_key="json_io_manager")
def other_cash_income_instances(
context: AssetExecutionContext,
config: BSSMetadataConfig,
other_cash_income_dataframe: pd.DataFrame,
livelihood_summary_dataframe: pd.DataFrame,
) -> Output[dict]:
"""
LivelihoodStrategy and LivelihoodActivity instances extracted from the BSS.
"""
if other_cash_income_dataframe.empty:
output = {}
return Output({}, metadata={"message": "No Data2 worksheet found in this BSS"})

output = get_annotated_instances_from_dataframe(
return get_annotated_instances_from_dataframe(
context,
config,
other_cash_income_dataframe,
livelihood_summary_dataframe,
ActivityLabel.LivelihoodActivityType.OTHER_CASH_INCOME,
len(HEADER_ROWS),
)

return output


@asset(partitions_def=bss_instances_partitions_def, io_manager_key="json_io_manager")
def other_cash_income_valid_instances(
context: AssetExecutionContext,
config: BSSMetadataConfig,
other_cash_income_instances: dict,
wealth_characteristic_instances: dict,
) -> Output[dict]:
Expand All @@ -165,16 +165,7 @@ def other_cash_income_valid_instances(
**{"WealthGroup": wealth_characteristic_instances["WealthGroup"]},
**other_cash_income_instances,
}
valid_instances, metadata = validate_instances(context, other_cash_income_instances, partition_key)
metadata = {f"num_{key.lower()}": len(value) for key, value in valid_instances.items()}
metadata["total_instances"] = sum(len(value) for value in valid_instances.values())
metadata["preview"] = MetadataValue.md(
f"```json\n{json.dumps(valid_instances, indent=4, ensure_ascii=False)}\n```"
)
return Output(
valid_instances,
metadata=metadata,
)
return validate_instances(context, config, other_cash_income_instances, partition_key)


@asset(partitions_def=bss_instances_partitions_def, io_manager_key="json_io_manager")
Expand All @@ -186,11 +177,7 @@ def other_cash_income_fixture(
"""
Django fixture for the Livelihood Activities from a BSS.
"""
fixture, metadata = get_fixture_from_instances(other_cash_income_valid_instances)
return Output(
fixture,
metadata=metadata,
)
return get_fixture_from_instances(other_cash_income_valid_instances)


@asset(partitions_def=bss_instances_partitions_def)
Expand All @@ -201,8 +188,4 @@ def imported_other_cash_income_activities(
"""
Imported Django fixtures for a BSS, added to the Django database.
"""
metadata = import_fixture(other_cash_income_fixture)
return Output(
None,
metadata=metadata,
)
return import_fixture(other_cash_income_fixture)
3 changes: 2 additions & 1 deletion pipelines/assets/wealth_characteristic.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,13 +524,14 @@ def wealth_characteristic_instances(
@asset(partitions_def=bss_instances_partitions_def, io_manager_key="json_io_manager")
def wealth_characteristic_valid_instances(
context: AssetExecutionContext,
config: BSSMetadataConfig,
wealth_characteristic_instances,
) -> Output[dict]:
"""
Valid WealthGroup and WealthGroupCharacteristicValue instances from a BSS, ready to be loaded via a Django fixture.
"""
partition_key = context.asset_partition_key_for_output()
valid_instances, metadata = validate_instances(context, wealth_characteristic_instances, partition_key)
valid_instances, metadata = validate_instances(context, config, wealth_characteristic_instances, partition_key)
metadata = {f"num_{key.lower()}": len(value) for key, value in valid_instances.items()}
metadata["total_instances"] = sum(len(value) for value in valid_instances.values())
metadata["preview"] = MetadataValue.md(
Expand Down
35 changes: 9 additions & 26 deletions pipelines/assets/wild_foods.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,11 @@
| 85 | TOTAL FISHING KCALS (%) | 0.009088932377 | 0.005577299413 | 0 | 0.009639776763 | 0.01133165595 | 0 | 0 | 0.009708632311 | 0 |
""" # NOQA: E501

import json
import os

import django
import pandas as pd
from dagster import AssetExecutionContext, MetadataValue, Output, asset
from dagster import AssetExecutionContext, Output, asset

from ..configs import BSSMetadataConfig
from ..partitions import bss_instances_partitions_def
Expand Down Expand Up @@ -137,29 +136,30 @@ def summary_wild_foods_labels_dataframe(
@asset(partitions_def=bss_instances_partitions_def, io_manager_key="json_io_manager")
def wild_foods_instances(
context: AssetExecutionContext,
config: BSSMetadataConfig,
wild_foods_dataframe: pd.DataFrame,
livelihood_summary_dataframe: pd.DataFrame,
) -> Output[dict]:
"""
LivelihoodStrategy and LivelihoodActivity instances extracted from the BSS.
"""
if wild_foods_dataframe.empty:
output = {}
return Output({}, metadata={"message": "No Data3 worksheet found in this BSS"})

output = get_annotated_instances_from_dataframe(
return get_annotated_instances_from_dataframe(
context,
config,
wild_foods_dataframe,
livelihood_summary_dataframe,
ActivityLabel.LivelihoodActivityType.WILD_FOODS,
len(HEADER_ROWS),
)

return output


@asset(partitions_def=bss_instances_partitions_def, io_manager_key="json_io_manager")
def wild_foods_valid_instances(
context: AssetExecutionContext,
config: BSSMetadataConfig,
wild_foods_instances: dict,
wealth_characteristic_instances: dict,
) -> Output[dict]:
Expand All @@ -175,16 +175,7 @@ def wild_foods_valid_instances(
**{"WealthGroup": wealth_characteristic_instances["WealthGroup"]},
**wild_foods_instances,
}
valid_instances, metadata = validate_instances(context, wild_foods_instances, partition_key)
metadata = {f"num_{key.lower()}": len(value) for key, value in valid_instances.items()}
metadata["total_instances"] = sum(len(value) for value in valid_instances.values())
metadata["preview"] = MetadataValue.md(
f"```json\n{json.dumps(valid_instances, indent=4, ensure_ascii=False)}\n```"
)
return Output(
valid_instances,
metadata=metadata,
)
return validate_instances(context, config, wild_foods_instances, partition_key)


@asset(partitions_def=bss_instances_partitions_def, io_manager_key="json_io_manager")
Expand All @@ -196,11 +187,7 @@ def wild_foods_fixture(
"""
Django fixture for the Livelihood Activities from a BSS.
"""
fixture, metadata = get_fixture_from_instances(wild_foods_valid_instances)
return Output(
fixture,
metadata=metadata,
)
return get_fixture_from_instances(wild_foods_valid_instances)


@asset(partitions_def=bss_instances_partitions_def)
Expand All @@ -211,8 +198,4 @@ def imported_wild_foods_activities(
"""
Imported Django fixtures for a BSS, added to the Django database.
"""
metadata = import_fixture(wild_foods_fixture)
return Output(
None,
metadata=metadata,
)
return import_fixture(wild_foods_fixture)
6 changes: 6 additions & 0 deletions pipelines/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@ class BSSMetadataConfig(Config):
bss_metadata_workbook: str = EnvVar("BSS_METADATA_WORKBOOK")
# The fsspec storage options for the BSS metadata spreadsheet
bss_metadata_storage_options: dict = json.loads(EnvVar("BSS_METADATA_STORAGE_OPTIONS").get_value("{}"))
# The fsspec path of the spreadsheet containing the BSS Labels and their recognition mechanism
bss_label_recognition_workbook: str = EnvVar("BSS_LABEL_RECOGNITION_WORKBOOK")
# The fsspec storage options for the BSS label recognition spreadsheet
bss_label_recognition_storage_options: dict = json.loads(
EnvVar("BSS_LABEL_RECOGNITION_STORAGE_OPTIONS").get_value("{}")
)
# The fsspec path of the root folder containing the BSSs
# For example:
# "/home/user/Temp/Baseline Storage Sheets (BSS)"
Expand Down
30 changes: 30 additions & 0 deletions pipelines_tests/test_assets/test_livelihood_activity_regexes.json
Original file line number Diff line number Diff line change
Expand Up @@ -829,5 +829,35 @@
"attribute": "payment_per_time",
"product_id": "grain",
"unit_of_measure_id": "kg"
},
"wild food: avocado (kg gathered)": {
"is_start": true,
"product_id": "avocado",
"unit_of_measure_id": "kg",
"attribute": "quantity_produced"
},
"mangoes (kg gathered)": {
"is_start": true,
"product_id": "mangoes",
"unit_of_measure_id": "kg",
"attribute": "quantity_produced"
},
"okra - kg gathered": {
"is_start": true,
"product_id": "okra",
"unit_of_measure_id": "kg",
"attribute": "quantity_produced"
},
"Fish (dry) : Tilapia (dry/smoked) (kg gathered)": {
"is_start": true,
"product_id": "tilapia (dry/smoked)",
"unit_of_measure_id": "kg",
"attribute": "quantity_produced"
},
"Fish type 2 (dried) - kg gathered": {
"is_start": true,
"product_id": "fish type 2 (dried)",
"unit_of_measure_id": "kg",
"attribute": "quantity_produced"
}
}
11 changes: 9 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
[project]
name = "hea-database-development"
version = "0.1.0"
description = "The HEA Database manages HEA Baseline data."
readme = "README.md"
requires-python = ">=3.12"

[tool.ruff]
line-length = 119
target-version = 'py310'
target-version = 'py312'
exclude = [
'.eggs', # exclude a few common directories in the
'.git', # root of the project
Expand All @@ -27,7 +34,7 @@ docstring-quotes = "double"

[tool.black]
line-length = 119
target-version = ['py310']
target-version = ['py312']
include = '\.pyi?$'
exclude = '''

Expand Down
2 changes: 1 addition & 1 deletion requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ djangorestframework-gis==1.1
djangorestframework-xml==2.0.0
docutils
factory-boy==3.2.1
git+https://github.com/American-Institutes-for-Research/gdrivefs.git@e870c19e1d730635e3760e7ae21eebf9ddda765e
git+https://github.com/American-Institutes-for-Research/gdrivefs.git@f4ec53446e6a27be2e368b24dadfa9081e1272f2
googletrans-py==4.0.0
# Required for rendering Dagster graphs in Jupyter notebooks
graphviz==0.21
Expand Down