diff --git a/pipelines/assets/baseline.py b/pipelines/assets/baseline.py index c170f15..67ebb1a 100644 --- a/pipelines/assets/baseline.py +++ b/pipelines/assets/baseline.py @@ -78,17 +78,15 @@ def get_wealth_group_dataframe( # In the Summary columns in the Data, Data2, Data3 worksheets, the Wealth # Group Category is in Row 4 (District)rather than Row 3 (Wealth Group Category) # so do a second lookup to update the blank rows. - # If this doesn't find any new values, then it's because in a WB worksheet - # there are no extra Wealth Group Categories on Row 4 - try: + # Note that in a WB worksheet there are no extra Wealth Group Categories on Row 4 + if worksheet_name != "WB": wealth_group_df = wealthgroupcategorylookup.do_lookup( wealth_group_df, "district", "wealth_group_category", update=True ) # Remove the duplicate wealth_group_category_original column created by the second do_lookup(), # which otherwise causes problems when trying to merge dataframes, e.g. when building the wealth_group_df. wealth_group_df = wealth_group_df.loc[:, ~wealth_group_df.columns.duplicated()] - except ValueError: - pass + # Check if there are unrecognized wealth group categories and report wealth_group_missing_category_df = wealth_group_df[ wealth_group_df["wealth_group_category"].isnull() @@ -266,7 +264,7 @@ def baseline_instances( } try: - preview = json.dumps(result, indent=4) + preview = json.dumps(result, indent=4, ensure_ascii=False) except TypeError as e: raise ValueError("Cannot serialize Community fixture to JSON. Failing dict is\n %s" % result) from e @@ -359,7 +357,7 @@ def community_instances(context: AssetExecutionContext, config: BSSMetadataConfi result = {"Community": community_df.to_dict(orient="records")} try: - preview = json.dumps(result, indent=4) + preview = json.dumps(result, indent=4, ensure_ascii=False) except TypeError as e: raise ValueError("Cannot serialize Community fixture to JSON. Failing dict is\n %s" % result) from e diff --git a/pipelines/assets/fixtures.py b/pipelines/assets/fixtures.py index 13be77b..5e0574c 100644 --- a/pipelines/assets/fixtures.py +++ b/pipelines/assets/fixtures.py @@ -220,7 +220,7 @@ def validate_instances( metadata = {f"num_{key.lower()}": len(value) for key, value in instances.items()} metadata["total_instances"] = sum(len(value) for value in instances.values()) - metadata["preview"] = MetadataValue.md(f"```json\n{json.dumps(instances, indent=4)}\n```") + metadata["preview"] = MetadataValue.md(f"```json\n{json.dumps(instances, indent=4, ensure_ascii=False)}\n```") return instances, metadata @@ -287,7 +287,7 @@ def get_fixture_from_instances(instance_dict: dict[str, list[dict]]) -> tuple[li metadata[f'num_{str(model._meta).split(".")[-1]}'] += 1 metadata["total_instances"] = len(fixture) - metadata["preview"] = MetadataValue.md(f"```json\n{json.dumps(fixture, indent=4)}\n```") + metadata["preview"] = MetadataValue.md(f"```json\n{json.dumps(fixture, indent=4, ensure_ascii=False)}\n```") return fixture, metadata @@ -300,7 +300,7 @@ def import_fixture(fixture: list[dict]) -> dict: # We need to use a .verbose_json file extension for Django to use the correct serializer. with tempfile.NamedTemporaryFile(mode="w+", suffix=".verbose_json") as f: # Write the fixture to a temporary file so that Django can access it - f.write(json.dumps(fixture)) + f.write(json.dumps(fixture, indent=4, ensure_ascii=False)) f.seek(0) call_command(verbose_load_data.Command(), f.name, verbosity=2, format="verbose_json", stdout=output_buffer) @@ -309,7 +309,7 @@ def import_fixture(fixture: list[dict]) -> dict: for instance in fixture: metadata[f'num_{instance["model"].split(".")[-1]}'] += 1 metadata["total_instances"] = len(fixture) - metadata["preview"] = MetadataValue.md(f"```json\n{json.dumps(fixture, indent=4)}\n```") + metadata["preview"] = MetadataValue.md(f"```json\n{json.dumps(fixture, indent=4, ensure_ascii=False)}\n```") metadata["output"] = MetadataValue.md(f"```\n{output_buffer.getvalue()}\n```") return metadata diff --git a/pipelines/assets/livelihood_activity.py b/pipelines/assets/livelihood_activity.py index 8ea24d9..f539a5f 100644 --- a/pipelines/assets/livelihood_activity.py +++ b/pipelines/assets/livelihood_activity.py @@ -1061,7 +1061,7 @@ def get_instances_from_dataframe( ) * 100 ), - "preview": MetadataValue.md(f"```json\n{json.dumps(result, indent=4)}\n```"), + "preview": MetadataValue.md(f"```json\n{json.dumps(result, indent=4, ensure_ascii=False)}\n```"), } if not unrecognized_labels.empty: metadata["unrecognized_labels"] = MetadataValue.md(unrecognized_labels.to_markdown(index=False)) @@ -1116,7 +1116,9 @@ def livelihood_activity_valid_instances( valid_instances, metadata = validate_instances(context, livelihood_activity_instances, partition_key) metadata = {f"num_{key.lower()}": len(value) for key, value in valid_instances.items()} metadata["total_instances"] = sum(len(value) for value in valid_instances.values()) - metadata["preview"] = MetadataValue.md(f"```json\n{json.dumps(valid_instances, indent=4)}\n```") + metadata["preview"] = MetadataValue.md( + f"```json\n{json.dumps(valid_instances, indent=4, ensure_ascii=False)}\n```" + ) return Output( valid_instances, metadata=metadata, diff --git a/pipelines/assets/other_cash_income.py b/pipelines/assets/other_cash_income.py index b6d0c48..edf3957 100644 --- a/pipelines/assets/other_cash_income.py +++ b/pipelines/assets/other_cash_income.py @@ -171,7 +171,9 @@ def other_cash_income_valid_instances( valid_instances, metadata = validate_instances(context, other_cash_income_instances, partition_key) metadata = {f"num_{key.lower()}": len(value) for key, value in valid_instances.items()} metadata["total_instances"] = sum(len(value) for value in valid_instances.values()) - metadata["preview"] = MetadataValue.md(f"```json\n{json.dumps(valid_instances, indent=4)}\n```") + metadata["preview"] = MetadataValue.md( + f"```json\n{json.dumps(valid_instances, indent=4, ensure_ascii=False)}\n```" + ) return Output( valid_instances, metadata=metadata, diff --git a/pipelines/assets/wealth_characteristic.py b/pipelines/assets/wealth_characteristic.py index 5a0d12f..9641f5f 100644 --- a/pipelines/assets/wealth_characteristic.py +++ b/pipelines/assets/wealth_characteristic.py @@ -450,7 +450,7 @@ def wealth_characteristic_instances( ) * 100 ), - "preview": MetadataValue.md(f"```json\n{json.dumps(result, indent=4)}\n```"), + "preview": MetadataValue.md(f"```json\n{json.dumps(result, indent=4, ensure_ascii=False)}\n```"), } if not unrecognized_labels.empty: metadata["unrecognized_labels"] = MetadataValue.md(unrecognized_labels.to_markdown(index=False)) @@ -473,7 +473,9 @@ def wealth_characteristic_valid_instances( valid_instances, metadata = validate_instances(context, wealth_characteristic_instances, partition_key) metadata = {f"num_{key.lower()}": len(value) for key, value in valid_instances.items()} metadata["total_instances"] = sum(len(value) for value in valid_instances.values()) - metadata["preview"] = MetadataValue.md(f"```json\n{json.dumps(valid_instances, indent=4)}\n```") + metadata["preview"] = MetadataValue.md( + f"```json\n{json.dumps(valid_instances, indent=4, ensure_ascii=False)}\n```" + ) return Output( valid_instances, metadata=metadata, diff --git a/pipelines/assets/wild_foods.py b/pipelines/assets/wild_foods.py index c1f3a71..4ab7ece 100644 --- a/pipelines/assets/wild_foods.py +++ b/pipelines/assets/wild_foods.py @@ -181,7 +181,9 @@ def wild_foods_valid_instances( valid_instances, metadata = validate_instances(context, wild_foods_instances, partition_key) metadata = {f"num_{key.lower()}": len(value) for key, value in valid_instances.items()} metadata["total_instances"] = sum(len(value) for value in valid_instances.values()) - metadata["preview"] = MetadataValue.md(f"```json\n{json.dumps(valid_instances, indent=4)}\n```") + metadata["preview"] = MetadataValue.md( + f"```json\n{json.dumps(valid_instances, indent=4, ensure_ascii=False)}\n```" + ) return Output( valid_instances, metadata=metadata, diff --git a/pipelines/resources.py b/pipelines/resources.py index 36c3109..3260241 100644 --- a/pipelines/resources.py +++ b/pipelines/resources.py @@ -70,7 +70,7 @@ def dump_to_path(self, context: OutputContext, obj: Any, path: "UPath"): self.unlink(path) with path.open("w") as file: - file.write(json.dumps(obj, indent=4)) + file.write(json.dumps(obj, indent=4, ensure_ascii=False)) def load_from_path(self, context: InputContext, path: "UPath") -> Any: with path.open("r") as file: