@@ -93,9 +93,40 @@ struct DataHandler{
93
93
preallocated_read_data:: PR
94
94
end
95
95
96
+
97
+ """
98
+ _check_file_paths_varnames(file_paths, varnames, regridder_type, compose_function)
99
+
100
+ Check consistency of `file_paths`, `varnames`, `regridder_type`, and `compose_function` for
101
+ our current `DataHandler`.
102
+ """
103
+ function _check_file_paths_varnames (
104
+ file_paths,
105
+ varnames,
106
+ regridder_type,
107
+ compose_function,
108
+ )
109
+ # Verify that the number of file paths and variable names are consistent
110
+ if length (varnames) == 1
111
+ # Multiple files are not supported by TempestRegridder
112
+ (length (file_paths) > 1 && regridder_type == :TempestRegridder ) &&
113
+ error (" TempestRegridder does not support multiple input files" )
114
+ else
115
+ # We have multiple variables
116
+ # This is not supported by TempestRegridder
117
+ regridder_type == :TempestRegridder &&
118
+ error (" TempestRegridder does not support multiple input variables" )
119
+
120
+ # We need a compose_function when passed multiple variables
121
+ compose_function == identity && error (
122
+ " `compose_function` must be specified when using multiple input variables" ,
123
+ )
124
+ end
125
+ end
126
+
96
127
"""
97
- DataHandler(file_paths::Union{AbstractString, AbstractArray{<:AbstractString}} ,
98
- varnames::Union{AbstractString, AbstractArray{<:AbstractString}} ,
128
+ DataHandler(file_paths,
129
+ varnames,
99
130
target_space::ClimaCore.Spaces.AbstractSpace;
100
131
start_date::Dates.DateTime = Dates.DateTime(1979, 1, 1),
101
132
regridder_type = nothing,
104
135
file_reader_kwargs = ())
105
136
106
137
Create a `DataHandler` to read `varnames` from `file_paths` and remap them to `target_space`.
107
- `file_paths` may contain either one path for all variables or one path for each variable.
108
- In the latter case, the entries of `file_paths` and `varnames` are expected to match based on position.
138
+
139
+ This function supports reading across multiple files and composing variables that are in
140
+ different files.
141
+
142
+
143
+ `file_paths` may contain either one path for all variables or one path for each variable. In
144
+ the latter case, the entries of `file_paths` and `varnames` are expected to match based on
145
+ position.
109
146
110
147
The DataHandler maintains an LRU cache of Fields that were previously computed.
111
148
@@ -114,7 +151,21 @@ Creating this object results in the file being accessed (to preallocate some mem
114
151
Positional arguments
115
152
=====================
116
153
117
- - `file_paths`: Paths of the NetCDF file(s) that contain the input data.
154
+ - `file_paths`: Paths of the NetCDF file(s) that contain the input data. `file_paths` should
155
+ be as "do-what-I-mean" as possible, meaning that it should behave as you expect.
156
+
157
+ To be specific, there are three options for `file_paths`:
158
+ - It is a string that points to a single NetCDF file.
159
+ - It is a list that points to multiple NetCDF files. In this case, we support two modes:
160
+ 1. if `varnames` is a vector with the same number of entries as `file_paths`, we assume that
161
+ each file contains a different variable.
162
+ 2. otherwise, we assume that each file contains all the variables and is a temporal chunk.
163
+ - It is a list of lists of paths to NetCDF files, where the inner list identifies temporal
164
+ chunks of a given variable, and the outer list identifies different variables
165
+ (supporting the mode where different variables live in different files and their time
166
+ development is split across multiple files). In other words, `file_paths[i]` is the list
167
+ of files that define the temporal evolution of `varnames[i]`.
168
+
118
169
- `varnames`: Names of the datasets in the NetCDF that have to be read and processed.
119
170
- `target_space`: Space where the simulation is run, where the data has to be regridded to.
120
171
@@ -138,13 +189,16 @@ everything more type stable.)
138
189
It can be a NamedTuple, or a Dictionary that maps Symbols to values.
139
190
- `file_reader_kwargs`: Additional keywords to be passed to the constructor of the file reader.
140
191
It can be a NamedTuple, or a Dictionary that maps Symbols to values.
141
- - `compose_function`: Function to combine multiple input variables into a single data variable.
142
- The default, to be used in the case of one input variable, is the identity.
143
- Note that the order of `varnames` must match the argument order of `compose_function`.
192
+ - `compose_function`: Function to combine multiple input variables into a single data
193
+ variable. The default, to be used in the case of one input variable,
194
+ is the identity. The compose function has to take N arguments, where
195
+ N is the number of variables in `varnames`, and return a scalar.
196
+ The order of the arguments in `compose_function` has to match the order
197
+ of `varnames`. This function will be broadcasted to data read from file.
144
198
"""
145
199
function DataHandling. DataHandler (
146
- file_paths:: Union{AbstractString, AbstractArray{<:AbstractString}} ,
147
- varnames:: Union{AbstractString, AbstractArray{<:AbstractString}} ,
200
+ file_paths,
201
+ varnames,
148
202
target_space:: ClimaCore.Spaces.AbstractSpace ;
149
203
start_date:: Union{Dates.DateTime, Dates.Date} = Dates. DateTime (1979 , 1 , 1 ),
150
204
regridder_type = nothing ,
@@ -182,38 +236,51 @@ function DataHandling.DataHandler(
182
236
varnames = [varnames]
183
237
end
184
238
185
- # Verify that the number of file paths and variable names match
186
- (length (file_paths) > 1 && length (file_paths) != length (varnames)) && error (
187
- " Number of file paths ($(length (file_paths)) ) and variable names ($(length (varnames)) ) do not match." ,
188
- )
189
-
190
- # Verify that `compose_function` is specified when using multiple input variables
191
- (length (varnames) > 1 && compose_function == identity) && error (
192
- " `compose_function` must be specified when using multiple input variables" ,
193
- )
194
-
195
- # Verify that `compose_function` is identity when using a single input variable
196
- (length (varnames) == 1 && compose_function != identity) && error (
197
- " `compose_function` must be identity when using a single input variable" ,
198
- )
199
-
200
- # TempestRegridder does not support multiple input variables
201
- (length (varnames) > 1 && regridder_type == :TempestRegridder ) &&
202
- error (" TempestRegridder does not support multiple input variables" )
203
-
204
239
# Determine which regridder to use if not already specified
205
240
regridder_type =
206
241
isnothing (regridder_type) ? Regridders. default_regridder_type () :
207
242
regridder_type
208
243
209
- # Construct a file reader, which deals with ingesting data and is possibly buffered/cached, for each input file
210
- file_readers = Dict {String, AbstractFileReader} ()
211
- all_vars_in_same_file = length (file_paths) == 1
212
- for (i, varname) in enumerate (varnames)
213
- file_path = all_vars_in_same_file ? first (file_paths) : file_paths[i]
214
- file_readers[varname] =
215
- NCFileReader (file_path, varname; file_reader_kwargs... )
244
+ _check_file_paths_varnames (
245
+ file_paths,
246
+ varnames,
247
+ regridder_type,
248
+ compose_function,
249
+ )
250
+
251
+ # We have to deal with the case where we have 1 FileReader (with possibly multiple files),
252
+ # or with N FileReaders (for when variables are split across files, and with possibly
253
+ # multiple files). To accommodate all these cases, we cast everything into the format
254
+ # where we have a list of lists, where the outer list is along variable names, and the
255
+ # inner list is along times. This is the most general input we expect from this
256
+ # constructor.
257
+
258
+ is_file_paths_list_of_lists = ! (first (file_paths) isa AbstractString)
259
+
260
+ if ! is_file_paths_list_of_lists
261
+ # If file_paths is not already a list of lists, we have two options:
262
+ # 1. file_paths identifies the temporal development of the variables
263
+ # 2. file_paths identifies different variables
264
+
265
+ # We use as heuristic that when the number of files provided is the same as the
266
+ # number of variables, that means that the files include different variables
267
+ if length (file_paths) == length (varnames)
268
+ # One file per variable
269
+ file_paths = [[f] for f in file_paths]
270
+ else
271
+ # Every file is used for every variable (each file is a temporal chunk with all variables)
272
+ file_paths = [copy (file_paths) for _ in varnames]
273
+ end
216
274
end
275
+ # Now, we have a list of lists, where file_paths[i] is the list of files that define the
276
+ # temporal evolution of varnames[i]
277
+
278
+ # Construct the file readers, which deal with ingesting data and are possibly
279
+ # buffered/cached, for each variable
280
+ file_readers = Dict (
281
+ varname => NCFileReader (paths, varname; file_reader_kwargs... ) for
282
+ (paths, varname) in zip (file_paths, varnames)
283
+ )
217
284
218
285
# Verify that the spatial dimensions are the same for each variable
219
286
@assert length (
@@ -248,9 +315,11 @@ function DataHandling.DataHandler(
248
315
regridder_kwargs = merge ((; regrid_dir), regridder_kwargs)
249
316
end
250
317
251
- # Note: using one arbitrary element of `varnames` and of `file_paths` assumes
252
- # that all input variables will use the same regridding
253
- regridder_args = (target_space, first (varnames), first (file_paths))
318
+ # Note: using one arbitrary element of `varnames` and of `file_paths`
319
+ # assumes that all input variables will use the same regridding (there
320
+ # are two firsts in file_paths because we now have a list of lists)
321
+ regridder_args =
322
+ (target_space, first (varnames), first (first (file_paths)))
254
323
elseif regridder_type == :InterpolationsRegridder
255
324
regridder_args = (target_space,)
256
325
end
0 commit comments