
Commit 1e04d5a

BDF upload (#113)
* Allow parquet upload
* Allow adding new samples from zip
* Normalize incoming bdf
* Handle nested metadata in snapshots
* Handle Cycle or cycle_number in snapshots
* Allow app use without servers
* Still add battinfo without xlsx
* Update integration tests
* `Processed snapshots` -> `Data`, cleanup
* Fix sort - don't allow overlapping df
* Make data_parse module, normalize dtypes
1 parent 96ae4c4 commit 1e04d5a

18 files changed: +350 -178 lines

aurora_cycler_manager/analysis.py

Lines changed: 59 additions & 28 deletions
@@ -24,7 +24,7 @@
 from xlsxwriter import Workbook

 from aurora_cycler_manager.config import get_config
-from aurora_cycler_manager.data_bundle import (
+from aurora_cycler_manager.data_parse import (
     SampleDataBundle,
     get_cycles_summary,
     get_cycling,
@@ -83,22 +83,47 @@ def _sort_times(start_times: list | np.ndarray, end_times: list | np.ndarray) ->
     # Sort by reverse end time, then by start time
     sorted_positions = np.lexsort((valid_ends * -1, valid_starts))
     sorted_starts = valid_starts[sorted_positions]
-
-    # Remove duplicate start times, keep only the first element (longest)
-    unique_mask = np.concatenate(([True], sorted_starts[1:] != sorted_starts[:-1]))
+    sorted_ends = valid_ends[sorted_positions]
+
+    # Keep only non-overlapping intervals
+    keep_mask = np.ones(len(sorted_starts), dtype=bool)
+    current_max_end = -np.inf
+    for i in range(len(sorted_starts)):
+        if sorted_starts[i] >= current_max_end:
+            current_max_end = sorted_ends[i]
+        elif sorted_ends[i] <= current_max_end:
+            keep_mask[i] = False
+        else:
+            current_max_end = sorted_ends[i]

     # Map back to original indices
-    return valid_indices[sorted_positions[unique_mask]]
+    return valid_indices[sorted_positions[keep_mask]]


-def merge_metadata(job_files: list[Path], metadatas: list[dict]) -> dict:
+def merge_metadata(job_files: list[Path], metadatas: list[dict], sample_id: str) -> dict:
     """Merge several job metadata, add provenance, replace sample data with latest from db."""
-    sample_id = metadatas[0].get("sample_data", {}).get("Sample ID", "")
+    # Get sample data from database
     sample_data = get_sample_data(sample_id)
-    # Merge glossary dicts
+
+    # Flatten / merge glossary dicts
     glossary = {}
-    for g in [m.get("glossary", {}) for m in metadatas]:
-        glossary.update(g)
+    for m in metadatas:
+        g = m.get("glossary", {})
+        if isinstance(g, list):
+            for item in g:
+                glossary.update(item)
+        elif g:
+            glossary.update(g)
+
+    # Flatten job_data to one list
+    job_data = []
+    for m in metadatas:
+        jd = m.get("job_data", {})
+        if isinstance(jd, list):
+            job_data.extend(jd)
+        elif jd:
+            job_data.append(jd)
+
     return {
         "provenance": {
             "aurora_metadata": {
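
The new logic keeps an interval when it starts at or after the running maximum end, drops it when it is fully contained in an already-kept interval, and otherwise keeps it and extends the running end. A minimal sketch with invented start/end times:

import numpy as np

starts = np.array([0.0, 0.0, 5.0, 8.0, 20.0])
ends = np.array([10.0, 4.0, 9.0, 15.0, 25.0])

# As in _sort_times: sort by start time, ties broken by latest end first
order = np.lexsort((ends * -1, starts))
s, e = starts[order], ends[order]

keep = np.ones(len(s), dtype=bool)
current_max_end = -np.inf
for i in range(len(s)):
    if s[i] >= current_max_end:    # starts after everything kept so far
        current_max_end = e[i]
    elif e[i] <= current_max_end:  # fully contained in a kept interval
        keep[i] = False
    else:                          # partial overlap: keep and extend
        current_max_end = e[i]

print(order[keep])  # [0 3 4] - (0, 4) and (5, 9) lie inside (0, 10) and are dropped
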
@@ -110,10 +135,12 @@ def merge_metadata(job_files: list[Path], metadatas: list[dict]) -> dict:
                     "datetime": datetime.now(timezone.utc).isoformat(),
                 },
             },
-            "original_file_provenance": {str(f): m["provenance"] for f, m in zip(job_files, metadatas, strict=True)},
+            "original_file_provenance": {
+                str(f): m.get("provenance") for f, m in zip(job_files, metadatas, strict=True)
+            },
         },
         "sample_data": sample_data,
-        "job_data": [m.get("job_data", {}) for m in metadatas],
+        "job_data": job_data,
         "glossary": glossary,
     }
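
Snapshots can now nest `glossary` and `job_data` as either a single dict or a list of dicts; both shapes flatten to one glossary dict and one job_data list. Illustration with invented metadata:

metadatas = [
    {"glossary": {"V": "Voltage"}, "job_data": {"job": 1}},
    {"glossary": [{"I": "Current"}, {"Q": "Capacity"}], "job_data": [{"job": 2}, {"job": 3}]},
]

glossary: dict = {}
for m in metadatas:
    g = m.get("glossary", {})
    if isinstance(g, list):
        for item in g:
            glossary.update(item)
    elif g:
        glossary.update(g)

job_data: list = []
for m in metadatas:
    jd = m.get("job_data", {})
    if isinstance(jd, list):
        job_data.extend(jd)
    elif jd:
        job_data.append(jd)

print(glossary)  # {'V': 'Voltage', 'I': 'Current', 'Q': 'Capacity'}
print(job_data)  # [{'job': 1}, {'job': 2}, {'job': 3}]
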

@@ -157,7 +184,18 @@ def calc_dq(df: pl.DataFrame) -> pl.DataFrame:
 def merge_dfs(dfs: list[pl.DataFrame]) -> tuple[pl.DataFrame, pl.DataFrame | None]:
     """Merge cycling dataframes and add cycles. Separate out EIS."""
     for i, df in enumerate(dfs):
-        dfs[i] = df.with_columns(pl.lit(i).alias("job_number"))
+        exprs = [pl.lit(i).alias("job_number")]
+        if "loop_number" not in df.columns:
+            exprs.append(pl.lit(0).alias("loop_number"))
+        if "cycle_number" not in df.columns:
+            if "Cycle" in df.columns:
+                exprs.append(pl.col("Cycle").alias("cycle_number"))
+            else:
+                exprs.append(pl.lit(0).alias("cycle_number"))
+        dfs[i] = df.with_columns(exprs)
+
+        if "dQ (mAh)" not in df.columns:
+            dfs[i] = calc_dq(dfs[i])

     df = pl.concat(dfs, how="diagonal")
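
The defaults are added per input frame because `pl.concat(..., how="diagonal")` null-fills any column missing from one of the inputs; filling `loop_number`/`cycle_number` first keeps the later step logic null-free. Toy sketch (invented frames, explicit dtypes so the schemas match):

import polars as pl

a = pl.DataFrame({"uts": [1.0, 2.0], "cycle_number": [1, 1]})
b = pl.DataFrame({"uts": [3.0, 4.0]})  # e.g. an upload with no cycle data

dfs = [a, b]
for i, df in enumerate(dfs):
    exprs = [pl.lit(i, dtype=pl.Int64).alias("job_number")]
    if "loop_number" not in df.columns:
        exprs.append(pl.lit(0, dtype=pl.Int64).alias("loop_number"))
    if "cycle_number" not in df.columns:
        exprs.append(pl.lit(0, dtype=pl.Int64).alias("cycle_number"))
    dfs[i] = df.with_columns(exprs)

merged = pl.concat(dfs, how="diagonal")
print(merged.null_count())  # all zeros - nothing was null-filled
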

@@ -172,13 +210,6 @@

     if not df.is_empty():
         df = df.sort("uts")
-        if "loop_number" not in df.columns:
-            df = df.with_columns(pl.lit(0).alias("loop_number"))
-        else:
-            df = df.with_columns(pl.col("loop_number").fill_null(0))
-
-        if "dQ (mAh)" not in df.columns:
-            df = calc_dq(df)

     # Increment step if any job, cycle, or loop changes
     df = df.with_columns(pl.struct(["job_number", "cycle_number", "loop_number"]).rle_id().add(1).alias("Step"))
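
For reference, `rle_id` gives consecutive runs of identical struct values the same id and bumps it on any change, so `Step` increments whenever the job, cycle, or loop changes. Toy example:

import polars as pl

df = pl.DataFrame({
    "job_number": [0, 0, 0, 1, 1],
    "cycle_number": [1, 1, 2, 2, 2],
    "loop_number": [0, 0, 0, 0, 1],
})
df = df.with_columns(
    pl.struct(["job_number", "cycle_number", "loop_number"]).rle_id().add(1).alias("Step")
)
print(df["Step"].to_list())  # [1, 1, 2, 3, 4]
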
@@ -206,7 +237,7 @@
     )

     # Join back to main dataframe
-    df = df.join(step_stats.select(["Step", "Cycle"]), on="Step", how="left")
+    df = df.drop("Cycle", strict=False).join(step_stats.select(["Step", "Cycle"]), on="Step", how="left")

     # EIS merge - find last non-zero cycle before the EIS
     if eis_df is not None:
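
The `drop("Cycle", strict=False)` discards any stale `Cycle` column an uploaded frame may already carry, so the join cannot leave a duplicate `Cycle_right` column behind; with `strict=False` (recent polars) the drop is a no-op when the column is absent. Sketch with invented data:

import polars as pl

df = pl.DataFrame({"Step": [1, 1, 2], "Cycle": [9, 9, 9]})  # stale Cycle from an upload
step_stats = pl.DataFrame({"Step": [1, 2], "Cycle": [1, 2]})

df = df.drop("Cycle", strict=False).join(step_stats.select(["Step", "Cycle"]), on="Step", how="left")
print(df["Cycle"].to_list())  # [1, 1, 2] - recomputed, no Cycle_right leftover
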
@@ -769,7 +800,7 @@ def analyse_sample(sample_id: str) -> SampleDataBundle:
     df, eis_df = merge_dfs(dfs)

     # Merge metadatas together
-    metadata = merge_metadata(job_files, metadatas)
+    metadata = merge_metadata(job_files, metadatas, sample_id)

     # Get sample and job data
     sample_data = metadata.get("sample_data", {})
@@ -941,9 +972,9 @@ def shrink_all_samples(sampleid_contains: str = "") -> None:
         sampleid_contains (str, optional): only shrink samples with this string in the sampleid

     """
-    for batch_folder in Path(CONFIG["Processed snapshots folder path"]).iterdir():
-        if batch_folder.is_dir():
-            for sample_folder in batch_folder.iterdir():
+    for run_folder in Path(CONFIG["Data folder path"]).iterdir():
+        if run_folder.is_dir():
+            for sample_folder in run_folder.iterdir():
                 sample_id = sample_folder.name
                 if sampleid_contains and sampleid_contains not in sample_id:
                     continue
@@ -983,9 +1014,9 @@ def analyse_all_samples(
     else:
        samples_to_analyse = []

-    for batch_folder in Path(CONFIG["Processed snapshots folder path"]).iterdir():
-        if batch_folder.is_dir():
-            for sample in batch_folder.iterdir():
+    for run_folder in Path(CONFIG["Data folder path"]).iterdir():
+        if run_folder.is_dir():
+            for sample in run_folder.iterdir():
                 if sampleid_contains and sampleid_contains not in sample.name:
                     continue
                 if mode != "always" and sample.name not in samples_to_analyse:

aurora_cycler_manager/battinfo_utils.py

Lines changed: 48 additions & 3 deletions
@@ -13,6 +13,44 @@
 logger = logging.getLogger(__name__)


+blank_coin_cell = coin_cell = {
+    "@context": [
+        "https://w3id.org/emmo/domain/battery/context",
+        {
+            "schema": "https://schema.org/",
+            "emmo": "https://w3id.org/emmo#",
+            "echem": "https://w3id.org/emmo/domain/electrochemistry#",
+            "battery": "https://w3id.org/emmo/domain/battery#",
+            "chemical": "https://w3id.org/emmo/domain/chemical-substance#",
+            "unit": "https://qudt.org/vocab/unit/",
+            "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
+        },
+    ],
+    "@type": "CoinCell",
+    "schema:version": "1.1.16",
+    "hasPositiveElectrode": {
+        "@type": "Electrode",
+        "hasCurrentCollector": {"@type": "CurrentCollector"},
+        "hasCoating": {"@type": "ElectrodeCoating"},
+    },
+    "hasNegativeElectrode": {
+        "@type": "Electrode",
+        "hasCurrentCollector": {"@type": "CurrentCollector"},
+        "hasCoating": {"@type": "ElectrodeCoating"},
+    },
+    "hasCase": {
+        "@type": "R2032",
+        "hasComponent": [
+            {"@type": "CellLid"},
+            {"@type": "CellCan"},
+        ],
+    },
+    "hasComponent": [
+        {"@type": "Spring"},
+    ],
+}
+
+
 def _deep_merge_dicts(target: dict, source: dict) -> dict:
     """Recursively merge source into target."""
     for k, v in source.items():
@@ -109,12 +147,19 @@ def insert_dict_in_jsonld(
         raise TypeError(msg)


-def merge_battinfo_with_db_data(battinfo_jsonld: dict, sample_data: dict) -> dict:
+def merge_battinfo_with_db_data(
+    battinfo_jsonld: dict, sample_data: dict, *, allow_empty_battinfo: bool = False
+) -> dict:
     """Merge info from the database with BattINFO ontology."""
     coin_cell = find_coin_cell(battinfo_jsonld)
     if coin_cell is None:
-        msg = "Could not find CoinCell in JSON-LD"
-        raise ValueError(msg)
+        if allow_empty_battinfo:
+            # Make a default coin cell
+            battinfo_jsonld = blank_coin_cell.copy()
+            coin_cell = battinfo_jsonld
+        else:
+            msg = "Could not find CoinCell in JSON-LD"
+            raise ValueError(msg)

     # Sample ID and CCID (barcode)
     if sample_data.get("Barcode"):

aurora_cycler_manager/bdf_converter.py

Lines changed: 0 additions & 55 deletions
This file was deleted.

aurora_cycler_manager/config.py

Lines changed: 4 additions & 0 deletions
@@ -146,6 +146,10 @@ def _read_config_file() -> dict:

     config["User config path"] = user_config_path

+    # Also accept "Data folder path" - will be preferred in future as it contains more than just snapshots
+    if not config.get("Data folder path"):
+        config["Data folder path"] = config.get("Processed snapshots folder path")
+
     # For SSH connections, paths must be str | None, does not accept Path
     if config.get("SSH private key path"):
         config["SSH private key path"] = str(config["SSH private key path"])
aurora_cycler_manager/data_parse.py

Lines changed: 56 additions & 6 deletions
@@ -12,7 +12,7 @@
 import polars as pl

 from aurora_cycler_manager.config import get_config
-from aurora_cycler_manager.dicts import bdf_to_aurora_map
+from aurora_cycler_manager.dicts import aurora_dtypes, aurora_to_bdf_map, bdf_to_aurora_map
 from aurora_cycler_manager.stdlib_utils import run_from_sample

 CONFIG = get_config()
@@ -24,10 +24,11 @@ def read_cycling(file: str | Path) -> pl.DataFrame:
     if file.suffix == ".parquet":
         df = pl.read_parquet(file)
         if "voltage_volt" in df.columns:  # bdf
-            return df.rename(bdf_to_aurora_map, strict=False)
-        return df
+            return bdf_to_aurora(df)
+        return df.cast({k: v for k, v in aurora_dtypes.items() if k in df.columns}, strict=False)
     if file.suffix == ".h5":
-        return pl.DataFrame(pd.read_hdf(file))
+        df = pl.DataFrame(pd.read_hdf(file))
+        return df.cast({k: v for k, v in aurora_dtypes.items() if k in df.columns}, strict=False)
     msg = f"Unsupported file format {file.suffix}"
     raise ValueError(msg)
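
The cast normalizes only the columns present in the frame, and `strict=False` nulls out values that cannot be cast instead of raising. Sketch with a hypothetical subset of the aurora_dtypes map:

import polars as pl

aurora_dtypes = {"uts": pl.Float64, "dQ (mAh)": pl.Float64}  # illustrative subset

df = pl.DataFrame({"uts": [1, 2], "dQ (mAh)": ["0.1", "0.2"]})  # messy upload dtypes
df = df.cast({k: v for k, v in aurora_dtypes.items() if k in df.columns}, strict=False)
print(df.schema)  # uts and dQ (mAh) are now both Float64
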

@@ -36,7 +37,7 @@ def read_metadata(file: str | Path) -> dict:
     """Read metadata from aurora-style parquet/hdf5 file."""
     file = Path(file)
     if file.suffix == ".parquet":
-        return json.loads(pl.read_parquet_metadata(file)["AURORA:metadata"])
+        return json.loads(pl.read_parquet_metadata(file).get("AURORA:metadata", "{}"))
     if file.suffix == ".h5":
         with h5py.File(file, "r") as f:
             return json.loads(f["metadata"][()])
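
With the `.get(..., "{}")` fallback, a parquet file that lacks the `AURORA:metadata` key (e.g. a bare BDF upload) now yields an empty dict instead of a KeyError. Round-trip sketch (recent polars, which supports file-level key-value parquet metadata):

import json
import polars as pl

df = pl.DataFrame({"uts": [1.0, 2.0]})
df.write_parquet("demo.parquet", metadata={"AURORA:metadata": json.dumps({"sample_data": {}})})

meta = json.loads(pl.read_parquet_metadata("demo.parquet").get("AURORA:metadata", "{}"))
print(meta)  # {'sample_data': {}}
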
@@ -47,7 +48,7 @@
 def get_sample_folder(sample_id: str) -> Path:
     """Get sample data folder."""
     run_id = run_from_sample(sample_id)
-    return CONFIG["Processed snapshots folder path"] / run_id / sample_id
+    return CONFIG["Data folder path"] / run_id / sample_id


 def get_cycling(sample_id: str) -> pl.DataFrame:
@@ -197,3 +198,52 @@ def metadata(self) -> dict | None:
         if self._preloaded["metadata"] is not None:
             return self._preloaded["metadata"]
         return get_metadata(self.sample_id)
+
+
+##### BDF conversion #####
+
+
+def aurora_to_bdf(df: pl.DataFrame) -> pl.DataFrame:
+    """Convert an Aurora dataframe to BDF compliant dataframe."""
+    df = df.select([k for k in aurora_to_bdf_map if k in df.columns])
+    df = df.rename(aurora_to_bdf_map, strict=False)
+    if df.is_empty():
+        return df.with_columns(pl.lit(None).alias("test_time_second"))
+    t0 = df["unix_time_second"][0]
+    return df.with_columns((pl.col("unix_time_second") - t0).alias("test_time_second"))
+
+
+def bdf_to_aurora(df: pl.DataFrame) -> pl.DataFrame:
+    """Convert a BDF compliant dataframe to Aurora."""
+    exprs = []
+    if "test_time_millisecond" in df.columns:
+        exprs += [(pl.col("test_time_millisecond") / 1000).alias("test_time_second")]
+    if "date_time_millisecond" in df.columns:
+        exprs += [(pl.col("date_time_millisecond") / 1000).alias("unix_time_second")]
+    if "cycle_dimensionless" in df.columns:
+        exprs += [(pl.col("cycle_dimensionless")).alias("cycle_count")]
+    df = df.with_columns(exprs)
+    df = df.select([k for k in bdf_to_aurora_map if k in df.columns])
+    df = df.rename(bdf_to_aurora_map, strict=False)
+    if "uts" not in df:
+        msg = "Aurora dataframes must include unix time in seconds."
+        raise ValueError(msg)
+    return df.cast({k: v for k, v in aurora_dtypes.items() if k in df.columns}, strict=False)
+
+
+def aurora_to_bdf_parquet(aurora_full_file: str | Path, bdf_file: str | Path | None = None) -> None:
+    """Convert Aurora full file to BDF parquet file."""
+    aurora_full_file = Path(aurora_full_file)
+    df = read_cycling(aurora_full_file)
+    metadata = read_metadata(aurora_full_file)
+
+    # Convert to BDF style columns
+    df = aurora_to_bdf(df)
+
+    # Save parquet file
+    if not bdf_file:
+        bdf_file = aurora_full_file.with_suffix(".bdf.parquet")
+    else:
+        bdf_file = Path(bdf_file).with_suffix(".bdf.parquet")
+    bdf_file.parent.mkdir(exist_ok=True)
+    df.write_parquet(bdf_file, compression="brotli", metadata={"AURORA:metadata": json.dumps(metadata)})
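
Example round trip through the new converters (module path as reconstructed above, file paths invented):

from aurora_cycler_manager.data_parse import aurora_to_bdf_parquet, read_cycling

# Writes full.sample_1.bdf.parquet next to the input, with metadata embedded
aurora_to_bdf_parquet("data/run_1/sample_1/full.sample_1.parquet")

# read_cycling() detects BDF column names ("voltage_volt") and maps them back to Aurora names
df = read_cycling("data/run_1/sample_1/full.sample_1.bdf.parquet")
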
