Skip to content

Commit 1027026

Browse files
authored
Skim name conflicts (#939)
* identify and flag naming conflicts in 2d vs 3d skims * add skims doc * run on workflow_dispatch * add link to docs * write error message in exception also * add error message example * code block formatting * unit tests
1 parent 2e5f732 commit 1027026

File tree

5 files changed

+344
-32
lines changed

5 files changed

+344
-32
lines changed

.github/workflows/core_tests.yml

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ jobs:
3636
uses: actions/setup-python@v5
3737
with:
3838
python-version-file: ".python-version"
39-
39+
4040
- name: Install activitysim
4141
run: |
4242
uv sync --locked --only-group github-action
@@ -128,6 +128,8 @@ jobs:
128128
run: |
129129
uv run pytest --pyargs activitysim.cli
130130
131+
- run: uv run pytest test/test_skim_name_conflicts.py
132+
- run: uv run pytest test/random_seed/test_random_seed.py
131133

132134
builtin_regional_models:
133135
needs: foundation
@@ -166,7 +168,7 @@ jobs:
166168
uses: actions/setup-python@v5
167169
with:
168170
python-version-file: ".python-version"
169-
171+
170172
- name: Install activitysim
171173
run: |
172174
uv sync --locked --only-group github-action
@@ -235,7 +237,7 @@ jobs:
235237
uses: actions/setup-python@v5
236238
with:
237239
python-version-file: ".python-version"
238-
240+
239241
- name: Install activitysim
240242
run: |
241243
uv sync --locked --only-group github-action
@@ -277,7 +279,7 @@ jobs:
277279
uses: actions/setup-python@v5
278280
with:
279281
python-version-file: ".python-version"
280-
282+
281283
- name: Install activitysim
282284
run: |
283285
uv sync --locked --only-group github-action
@@ -310,7 +312,7 @@ jobs:
310312
uses: actions/setup-python@v5
311313
with:
312314
python-version-file: ".python-version"
313-
315+
314316
- name: Install activitysim
315317
run: |
316318
uv sync --locked --only-group github-action

activitysim/core/skim_dataset.py

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import glob
44
import logging
55
import os
6+
import re
67
import time
78
from functools import partial
89
from pathlib import Path
@@ -499,8 +500,6 @@ def _apply_digital_encoding(dataset, digital_encodings):
499500
As modified
500501
"""
501502
if digital_encodings:
502-
import re
503-
504503
# apply once, before saving to zarr, will stick around in cache
505504
for encoding in digital_encodings:
506505
logger.info(f"applying zarr digital-encoding: {encoding}")
@@ -753,6 +752,68 @@ def load_skim_dataset_to_shared_memory(state, skim_tag="taz") -> xr.Dataset:
753752
d = None # skims are not stored in shared memory, so we need to load them
754753
do_not_save_zarr = False
755754

755+
potential_conflicts = network_los_preload.skims_info.get(skim_tag).skim_conflicts
756+
if potential_conflicts:
757+
# There are some conflicts in the skims, where both time-dependent
758+
# and time-agnostic skim with the same name are present. We need
759+
# to check we have sufficient ignore rules in place to correct this
760+
# condition.
761+
762+
def _should_ignore(ignore, x):
763+
if isinstance(ignore, str):
764+
ignore = [ignore]
765+
if ignore is not None:
766+
for i in ignore:
767+
if re.match(i, x):
768+
return True
769+
return False
770+
771+
ignore = state.settings.omx_ignore_patterns
772+
problems = []
773+
for time_agnostic, time_dependent in potential_conflicts.items():
774+
# option 1, ignore all the time-dependent skims
775+
# if this is fulfilled, we are ok and can proceed
776+
if all((_should_ignore(ignore, i)) for i in time_dependent):
777+
continue
778+
# option 2, ignore the time-agnostic skim
779+
# if this is fulfilled, we are ok and can proceed
780+
if _should_ignore(ignore, time_agnostic):
781+
continue
782+
# otherwise, we have a problem. collect all the problems
783+
# and raise an error at the end listing all of them
784+
problems.append(time_agnostic)
785+
if problems:
786+
solution_1 = "__.+'\n - '^".join(problems)
787+
solution_2 = "$'\n - '^".join(problems)
788+
# we have a problem, raise an error
789+
error_message = (
790+
f"skims {problems} are present in both time-dependent and time-agnostic formats.\n"
791+
"Please add ignore rules to the omx_ignore_patterns setting to resolve this issue.\n"
792+
"To ignore the time dependent skims, add the following to your settings file:\n"
793+
"\n"
794+
"omx_ignore_patterns:\n"
795+
f" - '^{solution_1}__.+'\n"
796+
"\n"
797+
"To ignore the time agnostic skims, add the following to your settings file:\n"
798+
"\n"
799+
"omx_ignore_patterns:\n"
800+
f" - '^{solution_2}$'\n"
801+
"\n"
802+
"You can also do some variation or combination of the two, as long as you resolve\n"
803+
"the conflict(s). In addition, note that minor edits to model spec files may be\n"
804+
"needed to accommodate these changes in how skim data is represented (e.g. changing\n"
805+
"`odt_skims` to `od_skims`, or similar modifications wherever the offending variable\n"
806+
"names are used). Alternatively, you can modify the skim data in the source files to\n"
807+
"remove the naming conflicts, which is typically done upstream of ActivitySim in\n"
808+
"whatever tool you are using to create the skims in the first place.\n"
809+
"\n"
810+
"See [https://activitysim.github.io/?q=skims] for more information.\n"
811+
)
812+
# write the error message to the log
813+
logger.error(error_message)
814+
# raise an error to stop the run, put the entire error message there also
815+
raise ValueError(error_message)
816+
756817
if d is None:
757818
time_periods = _dedupe_time_periods(network_los_preload)
758819
if zarr_file:

activitysim/core/skim_dict_factory.py

Lines changed: 58 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import os
99
import warnings
1010
from abc import ABC
11+
from collections import defaultdict
1112

1213
import numpy as np
1314
import openmatrix as omx
@@ -86,6 +87,7 @@ def __init__(self, state, skim_tag, network_los):
8687
self.offset_map_name = None
8788
self.offset_map = None
8889
self.omx_keys = None
90+
self.skim_conflicts = None
8991
self.base_keys = None
9092
self.block_offsets = None
9193

@@ -117,7 +119,12 @@ def load_skim_info(self, state, skim_tag):
117119
logger.debug(f"load_skim_info {skim_tag} reading {omx_file_path}")
118120

119121
with omx.open_file(omx_file_path, mode="r") as omx_file:
120-
# fixme call to omx_file.shape() failing in windows p3.5
122+
123+
# Check the shape of the skims. All skim files loaded within this
124+
# loop need to have the same shape. For the first file, the
125+
# shape is set to the omx_shape attribute, so we know what shape
126+
# to expect. For subsequent files, we check that the shape is the
127+
# same as the first file. If not, we raise an error.
121128
if self.omx_shape is None:
122129
self.omx_shape = tuple(
123130
int(i) for i in omx_file.shape()
@@ -127,13 +134,24 @@ def load_skim_info(self, state, skim_tag):
127134
int(i) for i in omx_file.shape()
128135
), f"Mismatch shape {self.omx_shape} != {omx_file.shape()}"
129136

137+
# Check that all the matrix names are unique across all the
138+
# omx files. This check is only looking at the name as stored in
139+
# the file, and is not processing any time period transformations.
140+
# If duplicate names are found, a warning is issued. This is not
141+
# a fatal error, but it is inefficient and may be symptom of a
142+
# deeper problem.
130143
for skim_name in omx_file.listMatrices():
131144
if skim_name in self.omx_manifest:
132145
warnings.warn(
133146
f"duplicate skim '{skim_name}' found in {self.omx_manifest[skim_name]} and {omx_file.filename}"
134147
)
135148
self.omx_manifest[skim_name] = omx_file_path
136149

150+
# We load the offset map if it exists. This is expected to be
151+
# a 1D array of integers that that gives ID values for each TAZ
152+
# in the skims. ActivitySim expects there to be only one such
153+
# mapping, although it can appear multiple times (e.g. once in
154+
# each file).
137155
for m in omx_file.listMappings():
138156
if self.offset_map is None:
139157
self.offset_map_name = m
@@ -146,21 +164,54 @@ def load_skim_info(self, state, skim_tag):
146164
f"Multiple mappings in omx file: {self.offset_map_name} != {m}"
147165
)
148166

149-
# - omx_keys dict maps skim key to omx_key
150-
# DISTWALK: DISTWALK
151-
# ('DRV_COM_WLK_BOARDS', 'AM'): DRV_COM_WLK_BOARDS__AM, ...
167+
# Create the `omx_keys` mapping, which connects skim key to omx_key.
168+
# The skim key is either a single string that names a skim that is not
169+
# time-dependent, or a 2-tuple of strings which names a skim and a time
170+
# period. The omx_key is the original name of the skim in the omx file.
171+
# For non-time-dependent skims, the omx_key is the same as the skim key,
172+
# e.g. DISTWALK: DISTWALK. For time-dependent skims, the omx_key is the
173+
# skim key with the time period appended,
174+
# e.g. ('DRV_COM_WLK_BOARDS', 'AM'): DRV_COM_WLK_BOARDS__AM.
152175
self.omx_keys = dict()
153176
for skim_name in self.omx_manifest.keys():
154177
key1, sep, key2 = skim_name.partition("__")
155-
156178
# - ignore composite tags not in dim3_tags_to_load
157179
if dim3_tags_to_load and sep and key2 not in dim3_tags_to_load:
180+
# If a skim is found that has a time period that is not one of
181+
# the known named time periods, a warning is issued, and that
182+
# skim is ignored. This is not a fatal error, but it may be a
183+
# symptom of a deeper problem.
184+
warnings.warn(f"skim '{key1}' has unknown time period '{key2}'")
158185
continue
159-
160186
skim_key = (key1, key2) if sep else key1
161-
162187
self.omx_keys[skim_key] = skim_name
163188

189+
# Create a skim_conflicts set, which identifies any skims that have both
190+
# time-dependent and time-agnostic versions. This condition in and of
191+
# itself is not a fatal error, as it is possible to have both types of skims
192+
# in the same data when using the legacy codebase. When using skim_dataset
193+
# instead of skim_dictionary (the former is required when using sharrow) this
194+
# condition is no longer allowed, although we can potentially recover if
195+
# the user has specified instructions that certain skim variables are not
196+
# to be loaded. The recovery option is checked later, in the skim_dataset
197+
# module, as that is where the skim variables are actually loaded.
198+
time_dependent_skims = defaultdict(set)
199+
time_agnostic_skims = set()
200+
for k, v in self.omx_keys.items():
201+
if isinstance(k, tuple):
202+
time_dependent_skims[k[0]].add(v)
203+
else:
204+
time_agnostic_skims.add(k)
205+
self.skim_conflicts = {
206+
k: v for k, v in time_dependent_skims.items() if k in time_agnostic_skims
207+
}
208+
if self.skim_conflicts:
209+
msg = "some skims have both time-dependent and time-agnostic versions:"
210+
for k in self.skim_conflicts:
211+
msg += f"\n- {k}"
212+
warnings.warn(msg)
213+
214+
# Count the number of skims in the omx file
164215
self.num_skims = len(self.omx_keys)
165216

166217
# - key1_subkeys dict maps key1 to dict of subkeys with that key1

0 commit comments

Comments
 (0)