Skip to content

Commit 82b77e7

Browse files
authored
sizer data imports (#359)
* added doc to website Fixes #357 * testing that cpc load still works * updated the examples * updated example
1 parent 21d6d2b commit 82b77e7

File tree

7 files changed

+957
-96
lines changed

7 files changed

+957
-96
lines changed

docs/_toc.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ parts:
1313
- file: examples/distribution_evolution
1414
- file: examples/distribution_ambient
1515
- file: examples/ionparticle_coagulation
16+
- file: examples/loading_data_part1
1617
- caption: Documentation
1718
numbered: false
1819
chapters:

docs/examples/loading_data_part1.ipynb

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,13 @@
44
"cell_type": "markdown",
55
"metadata": {},
66
"source": [
7-
" # Loading Data from a File\n",
7+
" # Loading Data from a File Part 1\n",
88
"\n",
9-
" This example shows how to load data from a file into a Stream object. These\n",
10-
" are usufull for doing some automated analysis, but you can just pull\n",
11-
" data from a file and do whatever you want with it."
9+
" This example shows how to load data from a file and automate the cleaning, \n",
10+
" formatting, and processing of the data.\n",
11+
"\n",
12+
" If you have a lot of data and repetitive tasks, you can use the scripts at\n",
13+
" the end of this example to clean up you import process."
1214
]
1315
},
1416
{
@@ -40,7 +42,7 @@
4042
},
4143
{
4244
"cell_type": "code",
43-
"execution_count": 16,
45+
"execution_count": 2,
4446
"metadata": {},
4547
"outputs": [
4648
{
@@ -308,6 +310,7 @@
308310
"filename_regex: *.csv\n",
309311
"MIN_SIZE_BYTES: 10\n",
310312
"data_loading_function: general_1d_load\n",
313+
"header_row: 0\n",
311314
"data_checks: {'characters': [10, 100], 'char_counts': {',': 4}, 'skip_rows': 0, 'skip_end': 0}\n",
312315
"data_column: [1, 2]\n",
313316
"data_header: ['data 1', 'data 2']\n",
@@ -364,7 +367,7 @@
364367
},
365368
{
366369
"cell_type": "code",
367-
"execution_count": 8,
370+
"execution_count": 9,
368371
"metadata": {},
369372
"outputs": [
370373
{
@@ -389,6 +392,7 @@
389392
" 'filename_regex': '*.csv',\n",
390393
" 'MIN_SIZE_BYTES': 10,\n",
391394
" 'data_loading_function':'general_1d_load',\n",
395+
" 'header_row': 0,\n",
392396
" 'data_checks': {'characters': [\n",
393397
" 10,\n",
394398
" 100],\n",
@@ -417,7 +421,7 @@
417421
},
418422
{
419423
"cell_type": "code",
420-
"execution_count": 9,
424+
"execution_count": 10,
421425
"metadata": {},
422426
"outputs": [
423427
{
@@ -441,7 +445,7 @@
441445
},
442446
{
443447
"cell_type": "code",
444-
"execution_count": 10,
448+
"execution_count": 11,
445449
"metadata": {},
446450
"outputs": [
447451
{
@@ -465,12 +469,24 @@
465469
" linestyle=\"none\",\n",
466470
" marker=\".\",)\n",
467471
"plt.tick_params(rotation=-35)\n",
468-
"ax.set_xlabel(\"Time (epoch)\")\n",
472+
"ax.set_xlabel(\"Time (UTC)\")\n",
469473
"ax.set_ylabel(\"Data\")\n",
470474
"ax.legend()\n",
471475
"plt.show()\n",
472476
"fig.tight_layout()"
473477
]
478+
},
479+
{
480+
"cell_type": "markdown",
481+
"metadata": {},
482+
"source": [
483+
"## Summary\n",
484+
"\n",
485+
"We covered how to load data from a file and automate the cleaning, formatting,\n",
486+
"and processing of the data. We then showed how to generate a settings\n",
487+
"dictionary and use that to load the data into a `Stream` object. This is\n",
488+
"useful if you have a lot of data and repetitive tasks. Doing this method also loads and combines multiple files into one `Stream` object.\n"
489+
]
474490
}
475491
],
476492
"metadata": {

docs/examples/loading_data_part2.ipynb

Lines changed: 714 additions & 0 deletions
Large diffs are not rendered by default.

particula/data/loader.py

Lines changed: 30 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,7 @@ def general_data_formatter(
351351
time_column: Union[int, List[int]],
352352
time_format: str,
353353
delimiter: str = ',',
354+
header_row: int = 0,
354355
date_offset: str = None,
355356
seconds_shift: int = 0,
356357
timezone_identifier: str = 'UTC'
@@ -383,6 +384,14 @@ def general_data_formatter(
383384
A tuple containing two np.array objects: the first contains the
384385
epoch times, and the second contains the data.
385386
"""
387+
388+
# find str matching in header row and gets index
389+
if isinstance(data_column[0], str):
390+
data_header = data[header_row].split(delimiter)
391+
# Get data column indices
392+
data_column = [data_header.index(x)
393+
for x in data_column]
394+
386395
# Check the data format
387396
data = data_format_checks(data, data_checks)
388397

@@ -408,6 +417,7 @@ def sizer_data_formatter(
408417
time_column: int,
409418
time_format: str,
410419
delimiter: str = ',',
420+
header_row: int = 0,
411421
date_offset: str = None,
412422
seconds_shift: int = 0,
413423
timezone_identifier: str = 'UTC'
@@ -443,25 +453,33 @@ def sizer_data_formatter(
443453
"""
444454

445455
# Get Dp range and columns
446-
data_header = data[data_sizer_reader["header_rows"]].split(delimiter)
456+
data_header = data[header_row].split(delimiter)
457+
# check if start and end keywords are in the header
458+
if data_sizer_reader["Dp_start_keyword"] not in data_header:
459+
# rise error with snip of data header
460+
raise ValueError(
461+
f"Cannot find '{data_sizer_reader['Dp_start_keyword']}' in header"\
462+
+ f" {data_header[:20]}..."
463+
)
464+
if data_sizer_reader["Dp_end_keyword"] not in data_header:
465+
# rise error with snip of data header
466+
raise ValueError(
467+
f"Cannot find '{data_sizer_reader['Dp_end_keyword']}' in header"\
468+
+ f" {data_header[:20]}..."
469+
)
447470
dp_range = [
448471
data_header.index(data_sizer_reader["Dp_start_keyword"]),
449472
data_header.index(data_sizer_reader["Dp_end_keyword"])
450473
]
451474
dp_columns = list(range(dp_range[0]+1, dp_range[1]))
452-
dp_header = [data_header[i] for i in dp_columns]
475+
header = [data_header[i] for i in dp_columns]
453476
# change from np.array
454477

455-
# Get data columns
456-
data_column = [
457-
data_header.index(x) for x in data_sizer_reader["list_of_data_headers"]
458-
]
459-
460478
# Format data
461479
data = data_format_checks(data, data_checks)
462480

463481
# Get data arrays
464-
epoch_time, data_smps_2d = sample_data(
482+
epoch_time, data_2d = sample_data(
465483
data,
466484
time_column,
467485
time_format,
@@ -471,16 +489,6 @@ def sizer_data_formatter(
471489
seconds_shift=seconds_shift,
472490
timezone_identifier=timezone_identifier
473491
)
474-
epoch_time, data_smps_1d = sample_data(
475-
data,
476-
time_column,
477-
time_format,
478-
data_column,
479-
delimiter,
480-
date_offset,
481-
seconds_shift=seconds_shift,
482-
timezone_identifier=timezone_identifier
483-
)
484492

485493
if "convert_scale_from" in data_sizer_reader:
486494
if data_sizer_reader["convert_scale_from"] == "dw":
@@ -493,13 +501,13 @@ def sizer_data_formatter(
493501
" Either dw/dlogdp or dw must be specified."
494502
)
495503
for i in range(len(epoch_time)):
496-
data_smps_2d[i, :] = convert.convert_sizer_dn(
497-
diameter=np.array(dp_header).astype(float),
498-
dn_dlogdp=data_smps_2d[i, :],
504+
data_2d[i, :] = convert.convert_sizer_dn(
505+
diameter=np.array(header).astype(float),
506+
dn_dlogdp=data_2d[i, :],
499507
inverse=inverse
500508
)
501509

502-
return epoch_time, dp_header, data_smps_2d, data_smps_1d
510+
return epoch_time, data_2d, header
503511

504512

505513
def non_standard_date_location(

0 commit comments

Comments
 (0)