33import os
44from pathlib import Path
55from typing import Collection
6+ import pickle
7+ from datetime import datetime
68
79import numpy as np
810import pandas as pd
@@ -46,6 +48,8 @@ def location_choice_model(
4648 settings_file = "{name}_model_settings.yaml" ,
4749 landuse_file = "{name}_landuse.csv" ,
4850 return_data = False ,
51+ alt_values_to_feather = False ,
52+ chunking_size = None ,
4953):
5054 model_selector = name .replace ("_location" , "" )
5155 model_selector = model_selector .replace ("_destination" , "" )
@@ -59,12 +63,42 @@ def _read_csv(filename, **kwargs):
5963 filename = filename .format (name = name )
6064 return pd .read_csv (os .path .join (edb_directory , filename ), ** kwargs )
6165
def _read_feather(filename, **kwargs):
    """Load a feather file from the EDB directory.

    ``{name}`` in *filename* is substituted with the model name;
    extra keyword arguments are forwarded to ``pd.read_feather``.
    """
    path = os.path.join(edb_directory, filename.format(name=name))
    return pd.read_feather(path, **kwargs)
69+
def _to_feather(df, filename, **kwargs):
    """Write *df* as a feather file into the EDB directory.

    ``{name}`` in *filename* is substituted with the model name;
    extra keyword arguments are forwarded to ``DataFrame.to_feather``.
    """
    path = os.path.join(edb_directory, filename.format(name=name))
    return df.to_feather(path, **kwargs)
73+
def _read_pickle(filename, **kwargs):
    """Load a pickled object from the EDB directory.

    ``{name}`` in *filename* is substituted with the model name.
    Keyword arguments are forwarded to ``pd.read_pickle`` (e.g.
    ``compression``) — previously they were accepted but silently
    dropped, unlike the sibling ``_read_csv``/``_read_feather`` helpers.

    NOTE(review): pickle is only safe for locally generated cache
    files; never point this at untrusted input.
    """
    filename = filename.format(name=name)
    return pd.read_pickle(os.path.join(edb_directory, filename), **kwargs)
77+
def _to_pickle(df, filename, **kwargs):
    """Pickle *df* into the EDB directory.

    ``{name}`` in *filename* is substituted with the model name.
    Keyword arguments are forwarded to ``DataFrame.to_pickle`` (e.g.
    ``compression``, ``protocol``) — previously they were accepted but
    silently dropped, unlike the sibling ``_to_feather`` helper.
    """
    filename = filename.format(name=name)
    return df.to_pickle(os.path.join(edb_directory, filename), **kwargs)
81+
def _file_exists(filename):
    """Return True if the model-specific *filename* exists in the EDB directory."""
    return os.path.exists(
        os.path.join(edb_directory, filename.format(name=name))
    )
85+
6286 coefficients = _read_csv (
6387 coefficients_file ,
6488 index_col = "coefficient_name" ,
6589 )
6690 spec = _read_csv (spec_file , comment = "#" )
67- alt_values = _read_csv (alt_values_file )
91+
92+ # read alternative values either as csv or feather file
93+ alt_values_fea_file = alt_values_file .replace (".csv" , ".fea" )
94+ if os .path .exists (
95+ os .path .join (edb_directory , alt_values_fea_file .format (name = name ))
96+ ):
97+ alt_values = _read_feather (alt_values_fea_file )
98+ else :
99+ alt_values = _read_csv (alt_values_file )
100+ if alt_values_to_feather :
101+ _to_feather (df = alt_values , filename = alt_values_fea_file )
68102 chooser_data = _read_csv (chooser_file )
69103 landuse = _read_csv (landuse_file , index_col = "zone_id" )
70104 master_size_spec = _read_csv (size_spec_file )
@@ -152,7 +186,48 @@ def _read_csv(filename, **kwargs):
152186
153187 chooser_index_name = chooser_data .columns [0 ]
154188 x_co = chooser_data .set_index (chooser_index_name )
155- x_ca = cv_to_ca (alt_values .set_index ([chooser_index_name , alt_values .columns [1 ]]))
189+
def split(a, n):
    """Yield *n* contiguous chunks of sequence *a*.

    Chunk lengths differ by at most one; the first ``len(a) % n``
    chunks carry the extra element, so the whole of *a* is covered
    in order with nothing duplicated.
    """
    base, extra = divmod(len(a), n)
    start = 0
    for i in range(n):
        stop = start + base + (1 if i < extra else 0)
        yield a[start:stop]
        start = stop
193+
194+ # process x_ca with cv_to_ca with or without chunking
195+ x_ca_pickle_file = "{name}_x_ca.pkl"
196+ if chunking_size == None :
197+ x_ca = cv_to_ca (
198+ alt_values .set_index ([chooser_index_name , alt_values .columns [1 ]])
199+ )
200+ elif _file_exists (x_ca_pickle_file ):
201+ # if pickle file from previous x_ca processing exist, load it to save time
202+ time_start = datetime .now ()
203+ x_ca = _read_pickle (x_ca_pickle_file )
204+ print (
205+ f"x_ca data loaded from { name } _x_ca.fea - time elapsed { (datetime .now () - time_start ).total_seconds ()} "
206+ )
207+ else :
208+ time_start = datetime .now ()
209+ # calculate num_chunks based on chunking_size (or max number of rows per chunk)
210+ num_chunks = int (len (alt_values ) / chunking_size )
211+ all_person_ids = list (alt_values ["person_id" ].unique ())
212+ split_ids = list (split (all_person_ids , num_chunks ))
213+ x_ca_list = []
214+ i = 0
215+ for chunk_ids in split_ids :
216+ alt_values_i = alt_values [alt_values ["person_id" ].isin (chunk_ids )]
217+ x_ca_i = cv_to_ca (
218+ alt_values_i .set_index ([chooser_index_name , alt_values_i .columns [1 ]])
219+ )
220+ x_ca_list .append (x_ca_i )
221+ print (
222+ f"\r x_ca_i compute done for chunk { i } /{ num_chunks } - time elapsed { (datetime .now () - time_start ).total_seconds ()} "
223+ )
224+ i = i + 1
225+ x_ca = pd .concat (x_ca_list , axis = 0 )
226+ # save final x_ca result as pickle file to save time for future data loading
227+ _to_pickle (df = x_ca , filename = x_ca_pickle_file )
228+ print (
229+ f"x_ca compute done - time elapsed { (datetime .now () - time_start ).total_seconds ()} "
230+ )
156231
157232 if CHOOSER_SEGMENT_COLUMN_NAME is not None :
158233 # label segments with names
0 commit comments