
Commit 1e04d5a

BDF upload (#113)
* Allow parquet upload
* Allow adding new samples from zip
* Normalize incoming bdf
* Handle nested metadata in snapshots
* Handle Cycle or cycle_number in snapshots
* Allow app use without servers
* Still add battinfo without xlsx
* Update integration tests
* `Processed snapshots` -> `Data`, cleanup
* Fix sort - don't allow overlapping df
* Make data_parse module, normalize dtypes
1 parent 96ae4c4 commit 1e04d5a

18 files changed: +350 -178 lines

aurora_cycler_manager/analysis.py

Lines changed: 59 additions & 28 deletions
@@ -24,7 +24,7 @@
 from xlsxwriter import Workbook

 from aurora_cycler_manager.config import get_config
-from aurora_cycler_manager.data_bundle import (
+from aurora_cycler_manager.data_parse import (
     SampleDataBundle,
     get_cycles_summary,
     get_cycling,
@@ -83,22 +83,47 @@ def _sort_times(start_times: list | np.ndarray, end_times: list | np.ndarray) ->
     # Sort by reverse end time, then by start time
     sorted_positions = np.lexsort((valid_ends * -1, valid_starts))
     sorted_starts = valid_starts[sorted_positions]
-
-    # Remove duplicate start times, keep only the first element (longest)
-    unique_mask = np.concatenate(([True], sorted_starts[1:] != sorted_starts[:-1]))
+    sorted_ends = valid_ends[sorted_positions]
+
+    # Keep only non-overlapping intervals
+    keep_mask = np.ones(len(sorted_starts), dtype=bool)
+    current_max_end = -np.inf
+    for i in range(len(sorted_starts)):
+        if sorted_starts[i] >= current_max_end:
+            current_max_end = sorted_ends[i]
+        elif sorted_ends[i] <= current_max_end:
+            keep_mask[i] = False
+        else:
+            current_max_end = sorted_ends[i]

     # Map back to original indices
-    return valid_indices[sorted_positions[unique_mask]]
+    return valid_indices[sorted_positions[keep_mask]]


-def merge_metadata(job_files: list[Path], metadatas: list[dict]) -> dict:
+def merge_metadata(job_files: list[Path], metadatas: list[dict], sample_id: str) -> dict:
     """Merge several job metadata, add provenance, replace sample data with latest from db."""
-    sample_id = metadatas[0].get("sample_data", {}).get("Sample ID", "")
+    # Get sample data from database
     sample_data = get_sample_data(sample_id)
-    # Merge glossary dicts
+
+    # Flatten / merge glossary dicts
     glossary = {}
-    for g in [m.get("glossary", {}) for m in metadatas]:
-        glossary.update(g)
+    for m in metadatas:
+        g = m.get("glossary", {})
+        if isinstance(g, list):
+            for item in g:
+                glossary.update(item)
+        elif g:
+            glossary.update(g)
+
+    # Flatten job_data to one list
+    job_data = []
+    for m in metadatas:
+        jd = m.get("job_data", {})
+        if isinstance(jd, list):
+            job_data.extend(jd)
+        elif jd:
+            job_data.append(jd)
+
     return {
         "provenance": {
             "aurora_metadata": {
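
The new logic keeps an interval when it starts at or after the running maximum end, drops it when it is fully contained in an already-kept interval, and otherwise keeps it and extends the running end. A minimal sketch with invented start/end times:

import numpy as np

starts = np.array([0.0, 0.0, 5.0, 8.0, 20.0])
ends = np.array([10.0, 4.0, 9.0, 15.0, 25.0])

# As in _sort_times: sort by start time, ties broken by latest end first
order = np.lexsort((ends * -1, starts))
s, e = starts[order], ends[order]

keep = np.ones(len(s), dtype=bool)
current_max_end = -np.inf
for i in range(len(s)):
    if s[i] >= current_max_end:    # starts after everything kept so far
        current_max_end = e[i]
    elif e[i] <= current_max_end:  # fully contained in a kept interval
        keep[i] = False
    else:                          # partial overlap: keep and extend
        current_max_end = e[i]

print(order[keep])  # [0 3 4] - (0, 4) and (5, 9) lie inside (0, 10) and are dropped
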
@@ -110,10 +135,12 @@ def merge_metadata(job_files: list[Path], metadatas: list[dict]) -> dict:
                     "datetime": datetime.now(timezone.utc).isoformat(),
                 },
             },
-            "original_file_provenance": {str(f): m["provenance"] for f, m in zip(job_files, metadatas, strict=True)},
+            "original_file_provenance": {
+                str(f): m.get("provenance") for f, m in zip(job_files, metadatas, strict=True)
+            },
         },
         "sample_data": sample_data,
-        "job_data": [m.get("job_data", {}) for m in metadatas],
+        "job_data": job_data,
         "glossary": glossary,
     }
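
Snapshots can now nest `glossary` and `job_data` as either a single dict or a list of dicts; both shapes flatten to one glossary dict and one job_data list. Illustration with invented metadata:

metadatas = [
    {"glossary": {"V": "Voltage"}, "job_data": {"job": 1}},
    {"glossary": [{"I": "Current"}, {"Q": "Capacity"}], "job_data": [{"job": 2}, {"job": 3}]},
]

glossary: dict = {}
for m in metadatas:
    g = m.get("glossary", {})
    if isinstance(g, list):
        for item in g:
            glossary.update(item)
    elif g:
        glossary.update(g)

job_data: list = []
for m in metadatas:
    jd = m.get("job_data", {})
    if isinstance(jd, list):
        job_data.extend(jd)
    elif jd:
        job_data.append(jd)

print(glossary)  # {'V': 'Voltage', 'I': 'Current', 'Q': 'Capacity'}
print(job_data)  # [{'job': 1}, {'job': 2}, {'job': 3}]
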

@@ -157,7 +184,18 @@ def calc_dq(df: pl.DataFrame) -> pl.DataFrame:
 def merge_dfs(dfs: list[pl.DataFrame]) -> tuple[pl.DataFrame, pl.DataFrame | None]:
     """Merge cycling dataframes and add cycles. Separate out EIS."""
     for i, df in enumerate(dfs):
-        dfs[i] = df.with_columns(pl.lit(i).alias("job_number"))
+        exprs = [pl.lit(i).alias("job_number")]
+        if "loop_number" not in df.columns:
+            exprs.append(pl.lit(0).alias("loop_number"))
+        if "cycle_number" not in df.columns:
+            if "Cycle" in df.columns:
+                exprs.append(pl.col("Cycle").alias("cycle_number"))
+            else:
+                exprs.append(pl.lit(0).alias("cycle_number"))
+        dfs[i] = df.with_columns(exprs)
+
+        if "dQ (mAh)" not in df.columns:
+            dfs[i] = calc_dq(dfs[i])

     df = pl.concat(dfs, how="diagonal")
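
The defaults are added per input frame because `pl.concat(..., how="diagonal")` null-fills any column missing from one of the inputs; filling `loop_number`/`cycle_number` first keeps the later step logic null-free. Toy sketch (invented frames, explicit dtypes so the schemas match):

import polars as pl

a = pl.DataFrame({"uts": [1.0, 2.0], "cycle_number": [1, 1]})
b = pl.DataFrame({"uts": [3.0, 4.0]})  # e.g. an upload with no cycle data

dfs = [a, b]
for i, df in enumerate(dfs):
    exprs = [pl.lit(i, dtype=pl.Int64).alias("job_number")]
    if "loop_number" not in df.columns:
        exprs.append(pl.lit(0, dtype=pl.Int64).alias("loop_number"))
    if "cycle_number" not in df.columns:
        exprs.append(pl.lit(0, dtype=pl.Int64).alias("cycle_number"))
    dfs[i] = df.with_columns(exprs)

merged = pl.concat(dfs, how="diagonal")
print(merged.null_count())  # all zeros - nothing was null-filled
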

@@ -172,13 +210,6 @@

     if not df.is_empty():
         df = df.sort("uts")
-        if "loop_number" not in df.columns:
-            df = df.with_columns(pl.lit(0).alias("loop_number"))
-        else:
-            df = df.with_columns(pl.col("loop_number").fill_null(0))
-
-        if "dQ (mAh)" not in df.columns:
-            df = calc_dq(df)

     # Increment step if any job, cycle, or loop changes
     df = df.with_columns(pl.struct(["job_number", "cycle_number", "loop_number"]).rle_id().add(1).alias("Step"))
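
For reference, `rle_id` gives consecutive runs of identical struct values the same id and bumps it on any change, so `Step` increments whenever the job, cycle, or loop changes. Toy example:

import polars as pl

df = pl.DataFrame({
    "job_number": [0, 0, 0, 1, 1],
    "cycle_number": [1, 1, 2, 2, 2],
    "loop_number": [0, 0, 0, 0, 1],
})
df = df.with_columns(
    pl.struct(["job_number", "cycle_number", "loop_number"]).rle_id().add(1).alias("Step")
)
print(df["Step"].to_list())  # [1, 1, 2, 3, 4]
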
@@ -206,7 +237,7 @@
     )

     # Join back to main dataframe
-    df = df.join(step_stats.select(["Step", "Cycle"]), on="Step", how="left")
+    df = df.drop("Cycle", strict=False).join(step_stats.select(["Step", "Cycle"]), on="Step", how="left")

     # EIS merge - find last non-zero cycle before the EIS
     if eis_df is not None:
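
The `drop("Cycle", strict=False)` discards any stale `Cycle` column an uploaded frame may already carry, so the join cannot leave a duplicate `Cycle_right` column behind; with `strict=False` (recent polars) the drop is a no-op when the column is absent. Sketch with invented data:

import polars as pl

df = pl.DataFrame({"Step": [1, 1, 2], "Cycle": [9, 9, 9]})  # stale Cycle from an upload
step_stats = pl.DataFrame({"Step": [1, 2], "Cycle": [1, 2]})

df = df.drop("Cycle", strict=False).join(step_stats.select(["Step", "Cycle"]), on="Step", how="left")
print(df["Cycle"].to_list())  # [1, 1, 2] - recomputed, no Cycle_right leftover
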
@@ -769,7 +800,7 @@ def analyse_sample(sample_id: str) -> SampleDataBundle:
     df, eis_df = merge_dfs(dfs)

     # Merge metadatas together
-    metadata = merge_metadata(job_files, metadatas)
+    metadata = merge_metadata(job_files, metadatas, sample_id)

     # Get sample and job data
     sample_data = metadata.get("sample_data", {})
@@ -941,9 +972,9 @@ def shrink_all_samples(sampleid_contains: str = "") -> None:
         sampleid_contains (str, optional): only shrink samples with this string in the sampleid

     """
-    for batch_folder in Path(CONFIG["Processed snapshots folder path"]).iterdir():
-        if batch_folder.is_dir():
-            for sample_folder in batch_folder.iterdir():
+    for run_folder in Path(CONFIG["Data folder path"]).iterdir():
+        if run_folder.is_dir():
+            for sample_folder in run_folder.iterdir():
                 sample_id = sample_folder.name
                 if sampleid_contains and sampleid_contains not in sample_id:
                     continue
@@ -983,9 +1014,9 @@ def analyse_all_samples(
     else:
        samples_to_analyse = []

-    for batch_folder in Path(CONFIG["Processed snapshots folder path"]).iterdir():
-        if batch_folder.is_dir():
-            for sample in batch_folder.iterdir():
+    for run_folder in Path(CONFIG["Data folder path"]).iterdir():
+        if run_folder.is_dir():
+            for sample in run_folder.iterdir():
                 if sampleid_contains and sampleid_contains not in sample.name:
                     continue
                 if mode != "always" and sample.name not in samples_to_analyse:

aurora_cycler_manager/battinfo_utils.py

Lines changed: 48 additions & 3 deletions
@@ -13,6 +13,44 @@
 logger = logging.getLogger(__name__)


+blank_coin_cell = coin_cell = {
+    "@context": [
+        "https://w3id.org/emmo/domain/battery/context",
+        {
+            "schema": "https://schema.org/",
+            "emmo": "https://w3id.org/emmo#",
+            "echem": "https://w3id.org/emmo/domain/electrochemistry#",
+            "battery": "https://w3id.org/emmo/domain/battery#",
+            "chemical": "https://w3id.org/emmo/domain/chemical-substance#",
+            "unit": "https://qudt.org/vocab/unit/",
+            "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
+        },
+    ],
+    "@type": "CoinCell",
+    "schema:version": "1.1.16",
+    "hasPositiveElectrode": {
+        "@type": "Electrode",
+        "hasCurrentCollector": {"@type": "CurrentCollector"},
+        "hasCoating": {"@type": "ElectrodeCoating"},
+    },
+    "hasNegativeElectrode": {
+        "@type": "Electrode",
+        "hasCurrentCollector": {"@type": "CurrentCollector"},
+        "hasCoating": {"@type": "ElectrodeCoating"},
+    },
+    "hasCase": {
+        "@type": "R2032",
+        "hasComponent": [
+            {"@type": "CellLid"},
+            {"@type": "CellCan"},
+        ],
+    },
+    "hasComponent": [
+        {"@type": "Spring"},
+    ],
+}
+
+
 def _deep_merge_dicts(target: dict, source: dict) -> dict:
     """Recursively merge source into target."""
     for k, v in source.items():
@@ -109,12 +147,19 @@ def insert_dict_in_jsonld(
         raise TypeError(msg)


-def merge_battinfo_with_db_data(battinfo_jsonld: dict, sample_data: dict) -> dict:
+def merge_battinfo_with_db_data(
+    battinfo_jsonld: dict, sample_data: dict, *, allow_empty_battinfo: bool = False
+) -> dict:
     """Merge info from the database with BattINFO ontology."""
     coin_cell = find_coin_cell(battinfo_jsonld)
     if coin_cell is None:
-        msg = "Could not find CoinCell in JSON-LD"
-        raise ValueError(msg)
+        if allow_empty_battinfo:
+            # Make a default coin cell
+            battinfo_jsonld = blank_coin_cell.copy()
+            coin_cell = battinfo_jsonld
+        else:
+            msg = "Could not find CoinCell in JSON-LD"
+            raise ValueError(msg)

     # Sample ID and CCID (barcode)
     if sample_data.get("Barcode"):

aurora_cycler_manager/bdf_converter.py

Lines changed: 0 additions & 55 deletions
This file was deleted.

aurora_cycler_manager/config.py

Lines changed: 4 additions & 0 deletions
@@ -146,6 +146,10 @@ def _read_config_file() -> dict:

     config["User config path"] = user_config_path

+    # Also accept "Data folder path" - will be preferred in future as it contains more than just snapshots
+    if not config.get("Data folder path"):
+        config["Data folder path"] = config.get("Processed snapshots folder path")
+
     # For SSH connections, paths must be str | None, does not accept Path
     if config.get("SSH private key path"):
         config["SSH private key path"] = str(config["SSH private key path"])
aurora_cycler_manager/data_parse.py

Lines changed: 56 additions & 6 deletions
@@ -12,7 +12,7 @@
 import polars as pl

 from aurora_cycler_manager.config import get_config
-from aurora_cycler_manager.dicts import bdf_to_aurora_map
+from aurora_cycler_manager.dicts import aurora_dtypes, aurora_to_bdf_map, bdf_to_aurora_map
 from aurora_cycler_manager.stdlib_utils import run_from_sample

 CONFIG = get_config()
@@ -24,10 +24,11 @@ def read_cycling(file: str | Path) -> pl.DataFrame:
     if file.suffix == ".parquet":
         df = pl.read_parquet(file)
         if "voltage_volt" in df.columns:  # bdf
-            return df.rename(bdf_to_aurora_map, strict=False)
-        return df
+            return bdf_to_aurora(df)
+        return df.cast({k: v for k, v in aurora_dtypes.items() if k in df.columns}, strict=False)
     if file.suffix == ".h5":
-        return pl.DataFrame(pd.read_hdf(file))
+        df = pl.DataFrame(pd.read_hdf(file))
+        return df.cast({k: v for k, v in aurora_dtypes.items() if k in df.columns}, strict=False)
     msg = f"Unsupported file format {file.suffix}"
     raise ValueError(msg)
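
The cast normalizes only the columns present in the frame, and `strict=False` nulls out values that cannot be cast instead of raising. Sketch with a hypothetical subset of the aurora_dtypes map:

import polars as pl

aurora_dtypes = {"uts": pl.Float64, "dQ (mAh)": pl.Float64}  # illustrative subset

df = pl.DataFrame({"uts": [1, 2], "dQ (mAh)": ["0.1", "0.2"]})  # messy upload dtypes
df = df.cast({k: v for k, v in aurora_dtypes.items() if k in df.columns}, strict=False)
print(df.schema)  # uts and dQ (mAh) are now both Float64
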

@@ -36,7 +37,7 @@ def read_metadata(file: str | Path) -> dict:
     """Read metadata from aurora-style parquet/hdf5 file."""
     file = Path(file)
     if file.suffix == ".parquet":
-        return json.loads(pl.read_parquet_metadata(file)["AURORA:metadata"])
+        return json.loads(pl.read_parquet_metadata(file).get("AURORA:metadata", "{}"))
     if file.suffix == ".h5":
         with h5py.File(file, "r") as f:
             return json.loads(f["metadata"][()])
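
With the `.get(..., "{}")` fallback, a parquet file that lacks the `AURORA:metadata` key (e.g. a bare BDF upload) now yields an empty dict instead of a KeyError. Round-trip sketch (recent polars, which supports file-level key-value parquet metadata):

import json
import polars as pl

df = pl.DataFrame({"uts": [1.0, 2.0]})
df.write_parquet("demo.parquet", metadata={"AURORA:metadata": json.dumps({"sample_data": {}})})

meta = json.loads(pl.read_parquet_metadata("demo.parquet").get("AURORA:metadata", "{}"))
print(meta)  # {'sample_data': {}}
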
@@ -47,7 +48,7 @@
 def get_sample_folder(sample_id: str) -> Path:
     """Get sample data folder."""
     run_id = run_from_sample(sample_id)
-    return CONFIG["Processed snapshots folder path"] / run_id / sample_id
+    return CONFIG["Data folder path"] / run_id / sample_id


 def get_cycling(sample_id: str) -> pl.DataFrame:
@@ -197,3 +198,52 @@ def metadata(self) -> dict | None:
         if self._preloaded["metadata"] is not None:
             return self._preloaded["metadata"]
         return get_metadata(self.sample_id)
+
+
+##### BDF conversion #####
+
+
+def aurora_to_bdf(df: pl.DataFrame) -> pl.DataFrame:
+    """Convert an Aurora dataframe to BDF compliant dataframe."""
+    df = df.select([k for k in aurora_to_bdf_map if k in df.columns])
+    df = df.rename(aurora_to_bdf_map, strict=False)
+    if df.is_empty():
+        return df.with_columns(pl.lit(None).alias("test_time_second"))
+    t0 = df["unix_time_second"][0]
+    return df.with_columns((pl.col("unix_time_second") - t0).alias("test_time_second"))
+
+
+def bdf_to_aurora(df: pl.DataFrame) -> pl.DataFrame:
+    """Convert a BDF compliant dataframe to Aurora."""
+    exprs = []
+    if "test_time_millisecond" in df.columns:
+        exprs += [(pl.col("test_time_millisecond") / 1000).alias("test_time_second")]
+    if "date_time_millisecond" in df.columns:
+        exprs += [(pl.col("date_time_millisecond") / 1000).alias("unix_time_second")]
+    if "cycle_dimensionless" in df.columns:
+        exprs += [(pl.col("cycle_dimensionless")).alias("cycle_count")]
+    df = df.with_columns(exprs)
+    df = df.select([k for k in bdf_to_aurora_map if k in df.columns])
+    df = df.rename(bdf_to_aurora_map, strict=False)
+    if "uts" not in df:
+        msg = "Aurora dataframes must include unix time in seconds."
+        raise ValueError(msg)
+    return df.cast({k: v for k, v in aurora_dtypes.items() if k in df.columns}, strict=False)
+
+
+def aurora_to_bdf_parquet(aurora_full_file: str | Path, bdf_file: str | Path | None = None) -> None:
+    """Convert Aurora full file to BDF parquet file."""
+    aurora_full_file = Path(aurora_full_file)
+    df = read_cycling(aurora_full_file)
+    metadata = read_metadata(aurora_full_file)
+
+    # Convert to BDF style columns
+    df = aurora_to_bdf(df)
+
+    # Save parquet file
+    if not bdf_file:
+        bdf_file = aurora_full_file.with_suffix(".bdf.parquet")
+    else:
+        bdf_file = Path(bdf_file).with_suffix(".bdf.parquet")
+    bdf_file.parent.mkdir(exist_ok=True)
+    df.write_parquet(bdf_file, compression="brotli", metadata={"AURORA:metadata": json.dumps(metadata)})
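
Example round trip through the new converters (module path as reconstructed above, file paths invented):

from aurora_cycler_manager.data_parse import aurora_to_bdf_parquet, read_cycling

# Writes full.sample_1.bdf.parquet next to the input, with metadata embedded
aurora_to_bdf_parquet("data/run_1/sample_1/full.sample_1.parquet")

# read_cycling() detects BDF column names ("voltage_volt") and maps them back to Aurora names
df = read_cycling("data/run_1/sample_1/full.sample_1.bdf.parquet")
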
