Commit 71e5ea8

use month names in summary index (#121)
* use month names in summary index; closes #108. Also makes convert_to_arrow
  work more broadly without assuming much about the dataframe structure.
* cov
* more tests
1 parent: f0eb7f7

File tree: 4 files changed (+69 lines, -35 lines)

api/solarperformanceinsight_api/compute.py

Lines changed: 3 additions & 1 deletion
@@ -1,3 +1,4 @@
+import calendar
 import datetime as dt
 from functools import partial
 import json
@@ -353,7 +354,8 @@ def run_performance_job(job: models.StoredJob, si: storage.StorageInterface):
     month_summary.insert(
         len(month_summary.columns), "average_daytime_cell_temperature", avg_cell_temp
     )
-    month_summary.index.name = "month"  # type: ignore
+    month_name_index = pd.Index([calendar.month_name[i] for i in months], name="month")
+    month_summary.index = month_name_index
 
     result_list.extend(
         [
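
Why this closes #108: calendar.month_name maps 1 through 12 to English month
names, so the summary index reads "January" through "December" instead of bare
integers. A minimal standalone sketch (assuming `months` covers the full year,
as in a year-long summary):

import calendar
import pandas as pd

# `months` comes from the job's data in compute.py; assume a full year here.
months = range(1, 13)
month_summary = pd.DataFrame({"total_energy": [0.0] * 12})
month_summary.index = pd.Index(
    [calendar.month_name[i] for i in months], name="month"
)
print(list(month_summary.index[:3]))  # ['January', 'February', 'March']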

api/solarperformanceinsight_api/tests/test_compute.py

Lines changed: 1 addition & 1 deletion
@@ -455,6 +455,6 @@ def test_run_performance_job(stored_job, auth0_id, nocommit_transaction, mocker)
     assert len(month_df.index) == 12
     ser = month_df.iloc[0]
     assert len(ser) == 5
-    assert ser.loc["month"] == 1.0
+    assert ser.loc["month"] == "January"
     assert abs(ser.loc["total_energy"] - 2.0) < 1e-8
     assert abs(ser.loc["plane_of_array_insolation"] - 1.0) < 1e-8
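
The first row of the stored summary now carries the month name rather than the
number. A hypothetical reconstruction of what the assertion exercises
(assuming the "month" index comes back as a regular column when the table is
read from storage):

import calendar
import pandas as pd

# Hypothetical stand-in for the monthly summary read back from storage.
month_df = pd.DataFrame({"total_energy": [2.0] * 12})
month_df.index = pd.Index(
    [calendar.month_name[i] for i in range(1, 13)], name="month"
)
ser = month_df.reset_index().iloc[0]
assert ser.loc["month"] == "January"  # previously compared against 1.0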

api/solarperformanceinsight_api/tests/test_utils.py

Lines changed: 42 additions & 13 deletions
@@ -208,16 +208,16 @@ def test_validate_dataframe(inp, cols, exp):
         ),
         pa.Table.from_arrays(
             [
+                pa.array([0.1, 0.2], type=pa.float32()),
                 pa.array(
                     [
                         dt.datetime(2020, 1, 1, tzinfo=dt.timezone.utc),
                         dt.datetime(2020, 1, 2, tzinfo=dt.timezone.utc),
                     ],
                     type=pa.timestamp("s", tz="UTC"),
                 ),
-                pa.array([0.1, 0.2], type=pa.float32()),
             ],
-            names=["time", "a"],
+            names=["a", "time"],
         ),
     ),
     (
@@ -233,30 +233,30 @@ def test_validate_dataframe(inp, cols, exp):
         ),
         pa.Table.from_arrays(
             [
+                pa.array([-999, 129], type=pa.int64()),
                 pa.array(
                     [
                         dt.datetime(2020, 1, 1, tzinfo=dt.timezone.utc),
                         dt.datetime(2020, 1, 2, tzinfo=dt.timezone.utc),
                     ],
                     type=pa.timestamp("s", tz="UTC"),
                 ),
-                pa.array([-999, 129], type=pa.float32()),
                 pa.array([0.1, 0.2], type=pa.float32()),
             ],
-            names=["time", "b", "a"],
+            names=["b", "time", "a"],
         ),
     ),
-    httpfail(
+    (
         pd.DataFrame(
             {"a": [0.1, 0.2], "time": ["one", "two"]},
         ),
-        None,
-    ),
-    httpfail(
-        pd.DataFrame(
-            {"a": [0.1, 0.2], "b": ["one", "two"]},
+        pa.Table.from_arrays(
+            [
+                pa.array([0.1, 0.2], type=pa.float32()),
+                pa.array(["one", "two"]),
+            ],
+            names=["a", "time"],
         ),
-        None,
     ),
     # non-localized ok
     (
@@ -272,19 +272,48 @@ def test_validate_dataframe(inp, cols, exp):
         ),
         pa.Table.from_arrays(
             [
+                pa.array([-999, 129], type=pa.int64()),
                 pa.array(
                     [
                         dt.datetime(2020, 1, 1),
                         dt.datetime(2020, 1, 2),
                     ],
                     type=pa.timestamp("s"),
                 ),
-                pa.array([-999, 129], type=pa.float32()),
                 pa.array([0.1, 0.2], type=pa.float32()),
             ],
-            names=["time", "b", "a"],
+            names=["b", "time", "a"],
         ),
     ),
+    (
+        pd.DataFrame(
+            {"nanfloat": [None, 1.0], "nans": [pd.NA, pd.NA], "str": ["a", "b"]}
+        ),
+        pa.Table.from_arrays(
+            [
+                pa.array([None, 1.0], type=pa.float32()),
+                pa.array([None, None], type=pa.null()),
+                pa.array(["a", "b"], type=pa.string()),
+            ],
+            names=["nanfloat", "nans", "str"],
+        ),
+    ),
+    httpfail(
+        pd.DataFrame(
+            {
+                "nanint": [pd.NA, 3],  # arrow doesn't like this
+            }
+        ),
+        None,
+    ),
+    httpfail(
+        pd.DataFrame(
+            {
+                "nanstr": [pd.NA, "string"],
+            }
+        ),
+        None,
+    ),
 ),
 )
 def test_convert_to_arrow(df, tbl):
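
The two httpfail cases encode the limitation noted in the new convert_to_arrow
docstring: the schema is inferred from the first row only, so a leading pd.NA
in a non-float column yields a null type that later non-null values violate. A
hedged illustration of that failure mode (not code from the repo):

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"nanint": [pd.NA, 3]})
# First-row inference sees only pd.NA and picks the null type...
inferred = pa.array(df["nanint"].iloc[:1], from_pandas=True).type
# ...which the integer in the second row cannot satisfy.
try:
    pa.Table.from_pandas(df, schema=pa.schema([("nanint", inferred)]))
except pa.lib.ArrowInvalid as err:
    print("conversion rejected:", err)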

api/solarperformanceinsight_api/utils.py

Lines changed: 23 additions & 20 deletions
@@ -6,7 +6,7 @@
 from fastapi import HTTPException
 import pandas as pd
 from pandas.errors import EmptyDataError, ParserError  # type: ignore
-from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype  # type: ignore
+import pandas.api.types as pdtypes  # type: ignore
 import pyarrow as pa  # type: ignore
 
 
@@ -103,7 +103,7 @@ def validate_dataframe(df: pd.DataFrame, columns: List[str]) -> Set[str]:
             status_code=400, detail="Data is missing column(s) " + ", ".join(diff)
         )
     if "time" in expected:
-        if not is_datetime64_any_dtype(df["time"]):
+        if not pdtypes.is_datetime64_any_dtype(df["time"]):
             raise HTTPException(
                 status_code=400,
                 detail='"time" column could not be parsed as a timestamp',
@@ -117,7 +117,7 @@ def validate_dataframe(df: pd.DataFrame, columns: List[str]) -> Set[str]:
         )
     bad_types = []
     for col in expected - {"time"}:
-        if not is_numeric_dtype(df[col]):
+        if not pdtypes.is_numeric_dtype(df[col]):
             bad_types.append(col)
     if bad_types:
         raise HTTPException(
@@ -160,25 +160,28 @@ def reindex_timeseries(
     return newdf, extra, missing
 
 
-def convert_to_arrow(df: pd.DataFrame) -> pa.Table:
-    """Convert a DataFrame into an Arrow Table setting a time column to
-    have second precision and any other columns to use float32"""
-    cols = df.columns
-    # save on storage by using single floats
-    schema = pa.schema((col, pa.float32()) for col in cols if col != "time")
-    if "time" in cols:
-        if not is_datetime64_any_dtype(df["time"]):
-            raise HTTPException(
-                status_code=400, detail='"time" column is not a datetime'
-            )
-        if not hasattr(df["time"].dtype, "tz"):  # type: ignore
-            tz = None
-        else:
-            tz = df["time"].dtype.tz  # type: ignore
+def _map_pandas_val_to_arrow_dtypes(ser: pd.Series) -> pa.DataType:
+    # save on storage w/ second precision timestamps and float32
+    dtype = ser.dtype  # type: ignore
+    if pdtypes.is_datetime64_any_dtype(dtype):
+        return pa.timestamp("s", tz=getattr(dtype, "tz", None))
+    elif pdtypes.is_float_dtype(dtype):
+        return pa.float32()
+    else:
+        return pa.array(ser, from_pandas=True).type
 
-        # no need to save timestamps at ns precision
-        schema = schema.insert(0, pa.field("time", pa.timestamp("s", tz=tz)))
+
+def convert_to_arrow(df: pd.DataFrame) -> pa.Table:
+    """Convert a DataFrame into an Arrow Table, setting datetime columns to
+    have second precision and float columns to be float32, and inferring
+    other types. Errors are likely if the first row of a column is NA and
+    the column isn't a float.
+    """
     try:
+        schema = pa.schema(
+            (col, _map_pandas_val_to_arrow_dtypes(val))
+            for col, val in df.iloc[:1].items()  # type: ignore
+        )
         table = pa.Table.from_pandas(df, schema=schema)
     except pa.lib.ArrowInvalid as err:
         logger.error(err.args[0])
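
End to end, the reworked conversion now preserves ints and strings instead of
forcing every non-time column to float32. A standalone sketch mirroring
_map_pandas_val_to_arrow_dtypes (without the HTTP and logging wiring; the
column names are illustrative):

import pandas as pd
import pandas.api.types as pdtypes
import pyarrow as pa

def map_dtype(ser: pd.Series) -> pa.DataType:
    # same mapping as _map_pandas_val_to_arrow_dtypes above
    dtype = ser.dtype
    if pdtypes.is_datetime64_any_dtype(dtype):
        return pa.timestamp("s", tz=getattr(dtype, "tz", None))
    if pdtypes.is_float_dtype(dtype):
        return pa.float32()
    return pa.array(ser, from_pandas=True).type

df = pd.DataFrame(
    {
        "b": [-999, 129],
        "time": pd.to_datetime(["2020-01-01", "2020-01-02"], utc=True),
        "a": [0.1, 0.2],
    }
)
schema = pa.schema((col, map_dtype(val)) for col, val in df.iloc[:1].items())
table = pa.Table.from_pandas(df, schema=schema)
print(table.schema)  # b: int64, time: timestamp[s, tz=UTC], a: float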
