
Commit 146c7ff

Update to use pandas v2.* (#932)
* updates for pandas 2.2
* pytables 3.9
* input checker message fallbacks
* fix veh type categoricals
* restore original pandas read_csv NaNs
* is_monotonic_increasing
* fix disagg acc sorting
* drop unused indexes
* update pipeline ref
* temporarily disable sharrow in vehicle alloc
* fix dtype problem
* ensure MAX index does not overflow
* sort on join to preserve index ordering from old pandas
* local compute test simplifies debugging
* more robust conversion to pyarrow
* rewrite df.eval to fast_eval
* change xarray pin
* fix zarr pin
* update numpy and dask pins
* wrap raw fast_eval in pd.Series
* don't skip sharrow in veh alloc
* rebuild ref pipeline
* make fast_eval more robust
* revise external targets
* prefer public API
* Update activitysim-dev-base.yml restore accidentally removed larch
* add note about why fast_eval exists and how to undo it
1 parent: 6a3c88d

27 files changed: +308 −80 lines changed

.github/workflows/core_tests.yml

Lines changed: 2 additions & 2 deletions
@@ -275,11 +275,11 @@ jobs:
           - region: Standard 1-Zone Example (MTC)
             region-org: ActivitySim
             region-repo: activitysim-prototype-mtc
-           region-branch: extended
+           region-branch: pandas2
           - region: Standard 2-Zone Example (SANDAG)
             region-org: ActivitySim
             region-repo: sandag-abm3-example
-           region-branch: main
+           region-branch: pandas2
       fail-fast: false
     defaults:
       run:

activitysim/abm/models/disaggregate_accessibility.py

Lines changed: 6 additions & 3 deletions
@@ -158,7 +158,7 @@ class DisaggregateAccessibilitySettings(PydanticReadable, extra="forbid"):
     """
     Disaggreate accessibility table is grouped by the "by" cols above and the KEEP_COLS are averaged
     across the group. Initializing the below as NA if not in the auto ownership level, they are skipped
-    in the groupby mean and the values are correct.
+    in the groupby mean and the values are correct.
     (It's a way to avoid having to update code to reshape the table and introduce new functionality there.)
     If none, will keep all of the columns with "accessibility" in the name.
     """
@@ -581,7 +581,7 @@ def expand_template_zones(self, tables):
         _expanded = pd.DataFrame(util.named_product(**index_params)).set_index("index")

         # Use result to join template onto expanded table of zones
-        ex_table = _expanded.join(master_template).reset_index()
+        ex_table = _expanded.join(master_template).sort_index().reset_index()

         # Concatenate a new unique set of ids
         cols = ["home_zone_id", "proto_household_id", "proto_person_id"]
@@ -654,7 +654,9 @@ def create_proto_pop(self):
             .set_index("index")
             .rename(columns={"hhid": hhid})
         )
-        persons = rep.join(persons).sort_values(hhid).reset_index(drop=True)
+        persons = (
+            rep.join(persons, sort=True).sort_values(hhid).reset_index(drop=True)
+        )
         persons[perid] = persons.index + 1

         # Assign persons to tours
@@ -730,6 +732,7 @@ def merge_persons(self):

         perid = self.params["proto_persons"]["index_col"]
         persons_merged.set_index(perid, inplace=True, drop=True)
+        persons_merged = persons_merged.sort_index()
         self.proto_pop["proto_persons_merged"] = persons_merged

         # Store in pipeline
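Note: the sort_index() / sort=True calls above exist because the row ordering of DataFrame.join results is not guaranteed to match what older pandas produced, and the downstream ID assignment here depends on a stable order. A minimal sketch of the pattern, using hypothetical data rather than the model's actual tables:

import pandas as pd

# index order is intentionally scrambled, as it can be after expansion
_expanded = pd.DataFrame({"zone": [30, 10, 20]}, index=[2, 0, 1])
template = pd.DataFrame({"attr": ["a", "b", "c"]}, index=[0, 1, 2])

# sort_index() pins the row order before reset_index() mints sequential ids
ex_table = _expanded.join(template).sort_index().reset_index()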

activitysim/abm/models/input_checker.py

Lines changed: 34 additions & 26 deletions
@@ -301,34 +301,42 @@ def report_errors(state, input_checker_settings, v_warnings, v_errors):

         for warn in warns:
             if "dataframe validator" in str(warn.message):
-                file_logger.warning(
-                    "Failed dataframe validator: "
-                    + str(warn.message).split("\n")[-1]
-                )
-            elif "element-wise validator" in str(warn.message):
-                if "DataFrameSchema" in str(warn.message):
-                    file_logger.warning(
-                        "Failed element-wise validator: <"
-                        + str(warn.message).split("\n")[0].split(" ")[1]
-                        + table_name
-                        + ")>\n\t"
-                        + str(warn.message)
-                        .split("failure cases:\n")[0]
-                        .split("\n")[-2]
-                        + "\n\tfailure cases:\n\t"
-                        + "\n\t".join(
-                            str(warn.message)
-                            .split("failure cases:\n")[1]
-                            .split("\n")
-                        )
-                    )
-                else:
+                try:
                     file_logger.warning(
-                        "Failed element-wise validator: <"
-                        + " ".join(str(warn.message).split("\n")[0].split(" ")[1:3])
-                        + "\n\t"
-                        + "\n\t".join(str(warn.message).split("\n")[1:])
+                        "Failed dataframe validator: "
+                        + str(warn.message).split("\n")[-1]
                     )
+                except Exception:
+                    file_logger.warning(warn)
+            elif "element-wise validator" in str(warn.message):
+                try:
+                    if "DataFrameSchema" in str(warn.message):
+                        file_logger.warning(
+                            "Failed element-wise validator: <"
+                            + str(warn.message).split("\n")[0].split(" ")[1]
+                            + table_name
+                            + ")>\n\t"
+                            + str(warn.message)
+                            .split("failure cases:\n")[0]
+                            .split("\n")[-2]
+                            + "\n\tfailure cases:\n\t"
+                            + "\n\t".join(
+                                str(warn.message)
+                                .split("failure cases:\n")[1]
+                                .split("\n")
+                            )
+                        )
+                    else:
+                        file_logger.warning(
+                            "Failed element-wise validator: <"
+                            + " ".join(
+                                str(warn.message).split("\n")[0].split(" ")[1:3]
+                            )
+                            + "\n\t"
+                            + "\n\t".join(str(warn.message).split("\n")[1:])
+                        )
+                except Exception:
+                    file_logger.warning(warn)
             else:
                 file_logger.warning(warn)
         file_logger.warning("\n")

activitysim/abm/models/school_escorting.py

Lines changed: 1 addition & 1 deletion
@@ -634,7 +634,7 @@ def school_escorting(
     state.add_table("tours", tours)
     state.get_rn_generator().drop_channel("tours")
     state.get_rn_generator().add_channel("tours", tours)
-    state.add_table("escort_bundles", escort_bundles)
+    state.add_table("escort_bundles", escort_bundles.reset_index(drop=True))
     # save school escorting tours and trips in pipeline so we can overwrite results from downstream models
     state.add_table("school_escort_tours", school_escort_tours)
    state.add_table("school_escort_trips", school_escort_trips)

activitysim/abm/models/trip_departure_choice.py

Lines changed: 1 addition & 1 deletion
@@ -404,7 +404,7 @@ def apply_stage_two_model(
     trace_label: str,
     compute_settings: ComputeSettings | None = None,
 ):
-    if not trips.index.is_monotonic:
+    if not trips.index.is_monotonic_increasing:
         trips = trips.sort_index()

     # Assign the duration of the appropriate leg to the trip
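Note: Index.is_monotonic was deprecated in pandas 1.5 and removed in pandas 2.0; is_monotonic_increasing is the direct replacement with the same semantics. For example:

import pandas as pd

idx = pd.Index([3, 1, 2])
# pandas 2.x: idx.is_monotonic raises AttributeError
assert not idx.is_monotonic_increasing
assert idx.sort_values().is_monotonic_increasing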

activitysim/abm/models/util/school_escort_tours_trips.py

Lines changed: 2 additions & 2 deletions
@@ -353,7 +353,7 @@ def create_chauf_escort_trips(bundles):
             "outbound",
             "purpose",
         ]
-    ).reset_index()
+    ).reset_index(drop=True)

     # numbering trips such that outbound escorting trips must come first and inbound trips must come last
     outbound_trip_num = -1 * (
@@ -539,7 +539,7 @@ def create_escortee_trips(bundles):
     # create a new trip for each escortee destination
     escortee_trips = escortee_trips.explode(
         ["destination", "escort_participants", "school_escort_trip_num", "purpose"]
-    ).reset_index()
+    ).reset_index(drop=True)

     # numbering trips such that outbound escorting trips must come first and inbound trips must come last
     # this comes in handy when merging trips to others in the tour decided downstream
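Note: explode() repeats the original row label once per list element, so the exploded frame carries duplicate index values; reset_index(drop=True) discards those labels instead of materializing them as an extra column. A small example:

import pandas as pd

trips = pd.DataFrame({"destination": [[10, 11], [12]]}, index=[100, 101])
exploded = trips.explode("destination")  # index is now [100, 100, 101]
# drop the duplicated labels rather than keeping them as a new "index" column
escortee_trips = exploded.reset_index(drop=True)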

activitysim/abm/models/vehicle_allocation.py

Lines changed: 1 addition & 0 deletions
@@ -261,6 +261,7 @@ def vehicle_allocation(
     ]

     # set choice for non-household vehicle option
+    choices["choice"] = choices["choice"].astype(veh_choice_dtype)
     choices.loc[
         choices["alt_choice"] == alts_from_spec[-1], "choice"
     ] = alts_from_spec[-1]
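Note: pandas 2 will not silently add a new category when assigning into a categorical column, so choice is cast first to the vehicle-choice categorical dtype, which already includes every alternative from the spec, before the .loc assignment. A sketch of the idea with hypothetical alternatives and data:

import pandas as pd

alts_from_spec = ["car1", "car2", "non_hh_veh"]  # hypothetical spec alternatives
veh_choice_dtype = pd.CategoricalDtype(categories=alts_from_spec)

choices = pd.DataFrame(
    {"choice": ["car1", "car2"], "alt_choice": ["car1", "non_hh_veh"]}
)
# cast first: assigning a category the column's dtype doesn't know would fail
choices["choice"] = choices["choice"].astype(veh_choice_dtype)
choices.loc[
    choices["alt_choice"] == alts_from_spec[-1], "choice"
] = alts_from_spec[-1]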

activitysim/cli/create.py

Lines changed: 10 additions & 7 deletions
@@ -2,6 +2,7 @@

 import glob
 import hashlib
+import importlib.resources
 import logging
 import os
 import shutil
@@ -21,14 +22,15 @@

 def _example_path(resource):
     resource = os.path.join(EXAMPLES_DIR, resource)
-    path = pkg_resources.resource_filename(PACKAGE, resource)
-
-    return path
+    return importlib.resources.as_file(
+        importlib.resources.files(PACKAGE).joinpath(resource)
+    )


 def _load_manifest():
-    with open(_example_path(MANIFEST), "r") as f:
-        manifest = yaml.safe_load(f.read())
+    with _example_path(MANIFEST) as f_pth:
+        with open(f_pth, "r") as f:
+            manifest = yaml.safe_load(f.read())

     assert manifest, f"error: could not load {MANIFEST}"
     return {example["name"]: example for example in manifest}
@@ -177,8 +179,9 @@ def get_example(
             )

         else:
-            for asset_path in glob.glob(_example_path(assets)):
-                copy_asset(asset_path, target_path, dirs_exist_ok=True)
+            with _example_path(assets) as pth:
+                for asset_path in glob.glob(str(pth)):
+                    copy_asset(asset_path, target_path, dirs_exist_ok=True)

     print(f"copied! new project files are in {os.path.abspath(dest_path)}")
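Note: pkg_resources is deprecated in favor of importlib.resources; the key behavioral difference is that importlib.resources.as_file() returns a context manager yielding a real filesystem path (extracting to a temporary file if the package ships zipped), which is why the call sites now use with blocks. The pattern, roughly, with an assumed package name and a hypothetical resource path:

import importlib.resources

PACKAGE = "activitysim"  # assumed to match the module's PACKAGE constant
resource = "examples/manifest.yaml"  # hypothetical resource path

# the yielded path is only guaranteed to exist inside the with-block
with importlib.resources.as_file(
    importlib.resources.files(PACKAGE).joinpath(resource)
) as pth:
    text = pth.read_text()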

activitysim/core/assign.py

Lines changed: 30 additions & 1 deletion
@@ -96,7 +96,36 @@ def read_assignment_spec(
     """

     try:
-        cfg = pd.read_csv(file_name, comment="#")
+        # we use an explicit list of na_values, these are the values that
+        # Pandas version 1.5 recognized as NaN by default. Notably absent is
+        # 'None' which is used in some spec files to be the object `None` not
+        # the float value NaN.
+        cfg = pd.read_csv(
+            file_name,
+            comment="#",
+            na_values=[
+                "",
+                "#N/A",
+                "#N/A N/A",
+                "#NA",
+                "-1.#IND",
+                "-1.#QNAN",
+                "-NaN",
+                "-nan",
+                "1.#IND",
+                "1.#QNAN",
+                "<NA>",
+                "N/A",
+                "NA",
+                "NULL",
+                "NaN",
+                "n/a",
+                "nan",
+                "null",
+            ],
+            keep_default_na=False,
+        )
+
     except Exception as e:
         logger.error(f"Error reading spec file: {file_name}")
         logger.error(str(e))
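Note: pandas 2 added the string "None" to read_csv's default NA list, which broke spec files that use None to mean the Python object rather than NaN; passing keep_default_na=False together with the explicit pandas 1.5 list restores the old behavior. A quick demonstration:

import io

import pandas as pd

PANDAS_15_NA_VALUES = [
    "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan",
    "1.#IND", "1.#QNAN", "<NA>", "N/A", "NA", "NULL", "NaN", "n/a", "nan", "null",
]

csv = io.StringIO("value\nNone\nNA\n")
df = pd.read_csv(csv, na_values=PANDAS_15_NA_VALUES, keep_default_na=False)
assert df.loc[0, "value"] == "None"  # survives as a literal string
assert pd.isna(df.loc[1, "value"])  # "NA" is still read as NaN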

activitysim/core/fast_eval.py

Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+import pandas as pd
+from pandas import eval as _eval
+
+if TYPE_CHECKING:
+    from collections.abc import Hashable, Iterator, Mapping, Sequence
+
+    from pandas._typing import ArrayLike
+
+
+def _get_cleaned_column_resolvers(
+    df: pd.DataFrame, raw: bool = True
+) -> dict[Hashable, ArrayLike | pd.Series]:
+    """
+    Return the special character free column resolvers of a dataframe.
+
+    Column names with special characters are 'cleaned up' so that they can
+    be referred to by backtick quoting.
+    Used in :meth:`DataFrame.eval`.
+    """
+    from pandas import Series
+    from pandas.core.computation.parsing import clean_column_name
+
+    if isinstance(df, pd.Series):
+        return {clean_column_name(df.name): df}
+
+    # CHANGED FROM PANDAS: do not even convert the arrays to pd.Series, just
+    # give the raw arrays to the compute engine. This is potentially a breaking
+    # change if any of the operations in the eval string require a pd.Series.
+    if raw:
+        # Performance tradeoff: in the dict below, we iterate over `df.items`,
+        # which yields tuples of (column_name, data as pd.Series). This is marginally
+        # slower than iterating over `df.columns` and `df._iter_column_arrays()`,
+        # but the latter is not in Pandas' public API, and may be removed in the future.
+        return {
+            clean_column_name(k): v for k, v in df.items() if not isinstance(k, int)
+        }
+
+    # CHANGED FROM PANDAS: do not call df.dtype inside the dict comprehension loop
+    # This update has been made in https://github.com/pandas-dev/pandas/pull/59573,
+    # but appears not to have been released yet as of pandas 2.2.3
+    dtypes = df.dtypes
+
+    return {
+        clean_column_name(k): Series(
+            v, copy=False, index=df.index, name=k, dtype=dtypes[k]
+        ).__finalize__(df)
+        for k, v in zip(df.columns, df._iter_column_arrays())
+        if not isinstance(k, int)
+    }
+
+
+def fast_eval(df: pd.DataFrame, expr: str, **kwargs) -> Any | None:
+    """
+    Evaluate a string describing operations on DataFrame columns.
+
+    Operates on columns only, not specific rows or elements. This allows
+    `eval` to run arbitrary code, which can make you vulnerable to code
+    injection if you pass user input to this function.
+
+    This function is a wrapper that replaces :meth:`~pandas.DataFrame.eval`
+    with a more efficient version than in the default pandas library (as
+    of pandas 2.2.3). It is recommended to use this function instead of
+    :meth:`~pandas.DataFrame.eval` for better performance. However, if you
+    encounter issues with this function, you can switch back to the default
+    pandas eval by changing the function call from `fast_eval(df, ...)` to
+    `df.eval(...)`.
+
+    Parameters
+    ----------
+    expr : str
+        The expression string to evaluate.
+    **kwargs
+        See the documentation for :meth:`~pandas.DataFrame.eval` for complete
+        details on the keyword arguments accepted.
+
+    Returns
+    -------
+    ndarray, scalar, or pandas object
+        The result of the evaluation.
+    """
+
+    inplace = False
+    kwargs["level"] = kwargs.pop("level", 0) + 1
+    index_resolvers = df._get_index_resolvers()
+    column_resolvers = _get_cleaned_column_resolvers(df)
+    resolvers = column_resolvers, index_resolvers
+    if "target" not in kwargs:
+        kwargs["target"] = df
+    kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers
+
+    try:
+        return pd.Series(
+            _eval(expr, inplace=inplace, **kwargs), index=df.index, name=expr
+        ).__finalize__(df)
+    except Exception as e:
+        # Initially assume that the exception is caused by the potentially
+        # breaking change in _get_cleaned_column_resolvers, and try again
+        # TODO: what kind of exception should be caught here so it is less broad
+        column_resolvers = _get_cleaned_column_resolvers(df, raw=False)
+        resolvers = column_resolvers, index_resolvers
+        kwargs["resolvers"] = kwargs["resolvers"][:-2] + resolvers
+        return _eval(expr, inplace=inplace, **kwargs)
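Note: fast_eval is a drop-in replacement for DataFrame.eval, and per the docstring above, reverting any call site is just a matter of changing fast_eval(df, expr) back to df.eval(expr). Typical usage, with made-up data:

import pandas as pd

from activitysim.core.fast_eval import fast_eval

df = pd.DataFrame({"income": [30000, 85000], "hh_size": [1, 3]})

# columns reach the compute engine as raw arrays, skipping the per-column
# pd.Series construction that df.eval performs on every call
per_capita = fast_eval(df, "income / hh_size")
print(per_capita)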
