-
Notifications
You must be signed in to change notification settings - Fork 1
Test Empire with NANs #121
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,4 @@ | ||
| import logging | ||
| from datetime import date, datetime | ||
| from textwrap import dedent | ||
| from typing import Annotated | ||
|
|
@@ -199,6 +200,21 @@ def monthly_ds( | |
| f"Column name collision after renaming for data on {df_date}, trying to squish duplicates", | ||
| ) | ||
| df = df.groupby(df.columns, axis=1).first() | ||
|
|
||
| # Avoid attempting to convert the time column to numeric inside | ||
| # clean_up_dtypes_and_nas, which causes unnecessary exceptions. | ||
| if "time" in df.columns: | ||
| time_col = df["time"] | ||
| df_wo_time = df.drop(columns=["time"]) | ||
| df_wo_time = clean_up_dtypes_and_nas( | ||
| df_wo_time, | ||
| na_values="NAN", | ||
| logger=context.log, | ||
| ) | ||
| df_wo_time["time"] = time_col | ||
| df = df_wo_time | ||
| else: | ||
| df = clean_up_dtypes_and_nas(df, na_values="NAN", logger=context.log) | ||
| daily_dfs.append(df) | ||
|
|
||
| df = pd.concat(daily_dfs, ignore_index=True) | ||
|
|
@@ -367,3 +383,25 @@ def build_defs() -> dg.Definitions: | |
| defs = dg.Definitions.merge(defs, dataset_defs) | ||
|
|
||
| return defs | ||
|
|
||
|
|
||
| def clean_up_dtypes_and_nas( | ||
| df: pd.DataFrame, | ||
| na_values: None | str | list[str] = None, | ||
| logger: logging.Logger | None = None, | ||
| ) -> pd.DataFrame: | ||
| """Clean up data types and NA values in a dataframe""" | ||
| df = df.copy() | ||
| if not logger: | ||
| logger = logging.getLogger(__name__) | ||
| if na_values is not None: | ||
| df = df.replace(na_values, pd.NA).dropna() | ||
|
|
||
| for c in df.columns: | ||
| try: | ||
| df[c] = pd.to_numeric(df[c]) | ||
|
Comment on lines
+400
to
+402
|
||
| except (ValueError, TypeError) as e: | ||
| if logger: | ||
| logger.warning(f"Could not convert column {c} to numeric: {e}") | ||
|
|
||
| return df | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The dropna() call will remove entire rows if any column contains NA values after replacement. This could lead to unexpected data loss if only specific columns had NAN values. Consider using dropna() with the 'subset' parameter to target specific columns, or use dropna(how='all') to only drop rows where all values are NA. Alternatively, you could use df.replace() with inplace=False and handle NA values column by column.