Commit 71e5ea8

use month names in summary index (#121)
* use month names in summary index; closes #108. Also makes convert_to_arrow
  work more broadly without assuming much about the dataframe structure.
* cov
* more tests
1 parent: f0eb7f7

File tree: 4 files changed (+69 lines, -35 lines)

api/solarperformanceinsight_api/compute.py

Lines changed: 3 additions & 1 deletion
@@ -1,3 +1,4 @@
+import calendar
 import datetime as dt
 from functools import partial
 import json
@@ -353,7 +354,8 @@ def run_performance_job(job: models.StoredJob, si: storage.StorageInterface):
     month_summary.insert(
         len(month_summary.columns), "average_daytime_cell_temperature", avg_cell_temp
     )
-    month_summary.index.name = "month"  # type: ignore
+    month_name_index = pd.Index([calendar.month_name[i] for i in months], name="month")
+    month_summary.index = month_name_index
 
     result_list.extend(
         [
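
Why this closes #108: calendar.month_name maps 1 through 12 to English month
names, so the summary index reads "January" through "December" instead of bare
integers. A minimal standalone sketch (assuming `months` covers the full year,
as in a year-long summary):

import calendar
import pandas as pd

# `months` comes from the job's data in compute.py; assume a full year here.
months = range(1, 13)
month_summary = pd.DataFrame({"total_energy": [0.0] * 12})
month_summary.index = pd.Index(
    [calendar.month_name[i] for i in months], name="month"
)
print(list(month_summary.index[:3]))  # ['January', 'February', 'March']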

api/solarperformanceinsight_api/tests/test_compute.py

Lines changed: 1 addition & 1 deletion
@@ -455,6 +455,6 @@ def test_run_performance_job(stored_job, auth0_id, nocommit_transaction, mocker)
     assert len(month_df.index) == 12
     ser = month_df.iloc[0]
     assert len(ser) == 5
-    assert ser.loc["month"] == 1.0
+    assert ser.loc["month"] == "January"
     assert abs(ser.loc["total_energy"] - 2.0) < 1e-8
     assert abs(ser.loc["plane_of_array_insolation"] - 1.0) < 1e-8
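
The first row of the stored summary now carries the month name rather than the
number. A hypothetical reconstruction of what the assertion exercises
(assuming the "month" index comes back as a regular column when the table is
read from storage):

import calendar
import pandas as pd

# Hypothetical stand-in for the monthly summary read back from storage.
month_df = pd.DataFrame({"total_energy": [2.0] * 12})
month_df.index = pd.Index(
    [calendar.month_name[i] for i in range(1, 13)], name="month"
)
ser = month_df.reset_index().iloc[0]
assert ser.loc["month"] == "January"  # previously compared against 1.0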

api/solarperformanceinsight_api/tests/test_utils.py

Lines changed: 42 additions & 13 deletions
@@ -208,16 +208,16 @@ def test_validate_dataframe(inp, cols, exp):
         ),
         pa.Table.from_arrays(
             [
+                pa.array([0.1, 0.2], type=pa.float32()),
                 pa.array(
                     [
                         dt.datetime(2020, 1, 1, tzinfo=dt.timezone.utc),
                         dt.datetime(2020, 1, 2, tzinfo=dt.timezone.utc),
                     ],
                     type=pa.timestamp("s", tz="UTC"),
                 ),
-                pa.array([0.1, 0.2], type=pa.float32()),
             ],
-            names=["time", "a"],
+            names=["a", "time"],
         ),
     ),
     (
@@ -233,30 +233,30 @@ def test_validate_dataframe(inp, cols, exp):
         ),
         pa.Table.from_arrays(
             [
+                pa.array([-999, 129], type=pa.int64()),
                 pa.array(
                     [
                         dt.datetime(2020, 1, 1, tzinfo=dt.timezone.utc),
                         dt.datetime(2020, 1, 2, tzinfo=dt.timezone.utc),
                     ],
                     type=pa.timestamp("s", tz="UTC"),
                 ),
-                pa.array([-999, 129], type=pa.float32()),
                 pa.array([0.1, 0.2], type=pa.float32()),
             ],
-            names=["time", "b", "a"],
+            names=["b", "time", "a"],
         ),
     ),
-    httpfail(
+    (
         pd.DataFrame(
             {"a": [0.1, 0.2], "time": ["one", "two"]},
         ),
-        None,
-    ),
-    httpfail(
-        pd.DataFrame(
-            {"a": [0.1, 0.2], "b": ["one", "two"]},
+        pa.Table.from_arrays(
+            [
+                pa.array([0.1, 0.2], type=pa.float32()),
+                pa.array(["one", "two"]),
+            ],
+            names=["a", "time"],
         ),
-        None,
     ),
     # non-localized ok
     (
@@ -272,19 +272,48 @@ def test_validate_dataframe(inp, cols, exp):
         ),
         pa.Table.from_arrays(
             [
+                pa.array([-999, 129], type=pa.int64()),
                 pa.array(
                     [
                         dt.datetime(2020, 1, 1),
                         dt.datetime(2020, 1, 2),
                     ],
                     type=pa.timestamp("s"),
                 ),
-                pa.array([-999, 129], type=pa.float32()),
                 pa.array([0.1, 0.2], type=pa.float32()),
             ],
-            names=["time", "b", "a"],
+            names=["b", "time", "a"],
         ),
     ),
+    (
+        pd.DataFrame(
+            {"nanfloat": [None, 1.0], "nans": [pd.NA, pd.NA], "str": ["a", "b"]}
+        ),
+        pa.Table.from_arrays(
+            [
+                pa.array([None, 1.0], type=pa.float32()),
+                pa.array([None, None], type=pa.null()),
+                pa.array(["a", "b"], type=pa.string()),
+            ],
+            names=["nanfloat", "nans", "str"],
+        ),
+    ),
+    httpfail(
+        pd.DataFrame(
+            {
+                "nanint": [pd.NA, 3],  # arrow doesn't like this
+            }
+        ),
+        None,
+    ),
+    httpfail(
+        pd.DataFrame(
+            {
+                "nanstr": [pd.NA, "string"],
+            }
+        ),
+        None,
+    ),
 ),
 )
 def test_convert_to_arrow(df, tbl):
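
The two httpfail cases encode the limitation noted in the new convert_to_arrow
docstring: the schema is inferred from the first row only, so a leading pd.NA
in a non-float column yields a null type that later non-null values violate. A
hedged illustration of that failure mode (not code from the repo):

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"nanint": [pd.NA, 3]})
# First-row inference sees only pd.NA and picks the null type...
inferred = pa.array(df["nanint"].iloc[:1], from_pandas=True).type
# ...which the integer in the second row cannot satisfy.
try:
    pa.Table.from_pandas(df, schema=pa.schema([("nanint", inferred)]))
except pa.lib.ArrowInvalid as err:
    print("conversion rejected:", err)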

api/solarperformanceinsight_api/utils.py

Lines changed: 23 additions & 20 deletions
@@ -6,7 +6,7 @@
 from fastapi import HTTPException
 import pandas as pd
 from pandas.errors import EmptyDataError, ParserError  # type: ignore
-from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype  # type: ignore
+import pandas.api.types as pdtypes  # type: ignore
 import pyarrow as pa  # type: ignore
 
 
@@ -103,7 +103,7 @@ def validate_dataframe(df: pd.DataFrame, columns: List[str]) -> Set[str]:
             status_code=400, detail="Data is missing column(s) " + ", ".join(diff)
         )
     if "time" in expected:
-        if not is_datetime64_any_dtype(df["time"]):
+        if not pdtypes.is_datetime64_any_dtype(df["time"]):
             raise HTTPException(
                 status_code=400,
                 detail='"time" column could not be parsed as a timestamp',
@@ -117,7 +117,7 @@ def validate_dataframe(df: pd.DataFrame, columns: List[str]) -> Set[str]:
         )
     bad_types = []
     for col in expected - {"time"}:
-        if not is_numeric_dtype(df[col]):
+        if not pdtypes.is_numeric_dtype(df[col]):
             bad_types.append(col)
     if bad_types:
         raise HTTPException(
@@ -160,25 +160,28 @@ def reindex_timeseries(
     return newdf, extra, missing
 
 
-def convert_to_arrow(df: pd.DataFrame) -> pa.Table:
-    """Convert a DataFrame into an Arrow Table setting a time column to
-    have second precision and any other columns to use float32"""
-    cols = df.columns
-    # save on storage by using single floats
-    schema = pa.schema((col, pa.float32()) for col in cols if col != "time")
-    if "time" in cols:
-        if not is_datetime64_any_dtype(df["time"]):
-            raise HTTPException(
-                status_code=400, detail='"time" column is not a datetime'
-            )
-        if not hasattr(df["time"].dtype, "tz"):  # type: ignore
-            tz = None
-        else:
-            tz = df["time"].dtype.tz  # type: ignore
+def _map_pandas_val_to_arrow_dtypes(ser: pd.Series) -> pa.DataType:
+    # save on storage w/ second precision timestamps and float32
+    dtype = ser.dtype  # type: ignore
+    if pdtypes.is_datetime64_any_dtype(dtype):
+        return pa.timestamp("s", tz=getattr(dtype, "tz", None))
+    elif pdtypes.is_float_dtype(dtype):
+        return pa.float32()
+    else:
+        return pa.array(ser, from_pandas=True).type
 
-        # no need to save timestamps at ns precision
-        schema = schema.insert(0, pa.field("time", pa.timestamp("s", tz=tz)))
+
+def convert_to_arrow(df: pd.DataFrame) -> pa.Table:
+    """Convert a DataFrame into an Arrow Table, setting datetime columns to
+    have second precision and float columns to be float32, and inferring
+    other types. Errors are likely if the first row of a column is NA and
+    the column isn't a float.
+    """
     try:
+        schema = pa.schema(
+            (col, _map_pandas_val_to_arrow_dtypes(val))
+            for col, val in df.iloc[:1].items()  # type: ignore
+        )
         table = pa.Table.from_pandas(df, schema=schema)
     except pa.lib.ArrowInvalid as err:
         logger.error(err.args[0])
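
End to end, the reworked conversion now preserves ints and strings instead of
forcing every non-time column to float32. A standalone sketch mirroring
_map_pandas_val_to_arrow_dtypes (without the HTTP and logging wiring; the
column names are illustrative):

import pandas as pd
import pandas.api.types as pdtypes
import pyarrow as pa

def map_dtype(ser: pd.Series) -> pa.DataType:
    # same mapping as _map_pandas_val_to_arrow_dtypes above
    dtype = ser.dtype
    if pdtypes.is_datetime64_any_dtype(dtype):
        return pa.timestamp("s", tz=getattr(dtype, "tz", None))
    if pdtypes.is_float_dtype(dtype):
        return pa.float32()
    return pa.array(ser, from_pandas=True).type

df = pd.DataFrame(
    {
        "b": [-999, 129],
        "time": pd.to_datetime(["2020-01-01", "2020-01-02"], utc=True),
        "a": [0.1, 0.2],
    }
)
schema = pa.schema((col, map_dtype(val)) for col, val in df.iloc[:1].items())
table = pa.Table.from_pandas(df, schema=schema)
print(table.schema)  # b: int64, time: timestamp[s, tz=UTC], a: float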
