Add refactored data processing module to reporting

flora-hofmann-frequenz · flora-hofmann-frequenz · commit 3625b426e432 · 2025-06-20T15:20:27.000+02:00
Signed-off-by: Flora &lt;flora.hofmann@frequenz.com&gt;
diff --git a/src/frequenz/lib/notebooks/reporting/data_processing.py b/src/frequenz/lib/notebooks/reporting/data_processing.py
@@ -0,0 +1,251 @@
+# License: MIT
+# Copyright © 2025 Frequenz Energy-as-a-Service GmbH
+
+"""Data processing utilities for microgrid energy reporting.
+
+Overview:
+---------
+This module provides reusable data transformation functions for generating
+energy reports from microgrid component data. It supports components such as
+photovoltaic systems (PV), batteries, and grid meters.
+
+Functions in this module follow a modular, composable pipeline and are typically
+called in a sequence within reporting notebooks. Each function accepts structured
+data and configuration, then returns a modified DataFrame or summary statistics.
+
+Core responsibilities:
+- Timezone localization to a unified timezone
+- Enrichment of data with derived metrics (e.g., PV self-consumption)
+- Renaming of technical column names based on configuration
+- Preparation of summary and analysis-ready DataFrames
+
+Assumptions:
+------------
+- The input DataFrame contains at minimum:
+  - A timestamp column (`"timestamp"`)
+  - Grid and consumption data columns (e.g., `"grid"`, `"consumption"`)
+- PV and battery metrics require additional fields
+  (e.g., `"pv_neg"`, `"battery_pos"`)
+- The timestamp column must be timezone-aware, or timezone-naive in UTC
+- A configuration object (`mcfg`) must provide `component_type_ids(...)`
+- `component_types` must be a list containing any of:
+  `"grid"`, `"consumption"`, `"pv"`, `"battery"`, `"chp"`, `"ev"`
+
+Outputs:
+--------
+Each function returns one of:
+- A mutated or extended `pd.DataFrame`
+- A tuple of DataFrames for staged reporting
+- A dictionary of summary statistics
+- A long-format DataFrame for visualization or further analysis
+"""
+
+
+from typing import Any, Dict, List, Tuple
+
+import pandas as pd
+
+# Target timezone plus raw and display (German) column names
+TZ_NAME = "Europe/Berlin"
+COLUMN_TIMESTAMP = "timestamp"
+COLUMN_GRID = "grid"
+COLUMN_GRID_NAMED = "Netzanschluss"
+COLUMN_NET_IMPORT = "Netzbezug"
+COLUMN_CONSUMPTION = "consumption"
+COLUMN_CONSUMPTION_NAMED = "Brutto Gesamtverbrauch"
+COLUMN_BATTERY = "battery"
+COLUMN_BATTERY_NAMED = "Batterie Durchsatz"
+COLUMN_PV_PROD = "PV Produktion"
+COLUMN_PV_FEEDIN = "PV Einspeisung"
+COLUMN_PV_SELF = "PV Eigenverbrauch"
+COLUMN_PV_BAT = "PV in Batterie"
+COLUMN_PV_SHARE = "PV Eigenverbrauchsanteil"
+
+
+def _get_rename_map(component_types: List[str]) -> Dict[str, str]:
+    """Build the raw-to-display column name mapping for the given components.
+
+    The timestamp, grid and consumption columns are always mapped; battery
+    and PV entries are only included when the respective type is present.
+
+    Args:
+        component_types: Component types in use, e.g. 'pv' or 'battery'.
+
+    Returns:
+        Mapping of internal column names to their German display names.
+    """
+    battery_names: Dict[str, str] = {COLUMN_BATTERY: COLUMN_BATTERY_NAMED}
+    pv_names: Dict[str, str] = {
+        "pv": "PV Durchsatz",
+        "pv_prod": COLUMN_PV_PROD,
+        "pv_self": COLUMN_PV_SELF,
+        "pv_bat": COLUMN_PV_BAT,
+        "pv_feedin": COLUMN_PV_FEEDIN,
+        "pv_self_consumption_share": COLUMN_PV_SHARE,
+    }
+
+    # Always-present columns first, then the optional groups in fixed order.
+    mapping: Dict[str, str] = {
+        COLUMN_TIMESTAMP: "Zeitpunkt",
+        COLUMN_GRID: COLUMN_GRID_NAMED,
+        COLUMN_CONSUMPTION: COLUMN_CONSUMPTION_NAMED,
+    }
+    mapping.update(battery_names if "battery" in component_types else {})
+    mapping.update(pv_names if "pv" in component_types else {})
+    return mapping
+
+
+def convert_timezone(df: pd.DataFrame) -> pd.DataFrame:
+    """Convert 'timestamp' column to Europe/Berlin; naive values count as UTC.
+
+    Args:
+        df: DataFrame with a datetime 'timestamp' column.
+
+    Returns:
+        A copy of the DataFrame with the converted timestamp column.
+
+    Raises:
+        KeyError: If `df` has no 'timestamp' column.
+    """
+    stamps = df[COLUMN_TIMESTAMP]  # KeyError if missing; asserts vanish with -O
+    utc = stamps.dt.tz_localize("UTC") if stamps.dt.tz is None else stamps
+    return df.assign(**{COLUMN_TIMESTAMP: utc.dt.tz_convert(TZ_NAME)})
+
+
+def process_grid_data(df: pd.DataFrame) -> pd.DataFrame:
+    """Derive the 'Netzbezug' column holding only the non-negative grid values.
+
+    Args:
+        df: DataFrame that already contains the renamed 'Netzanschluss' column.
+
+    Returns:
+        A copy of the DataFrame extended by the clipped grid import column.
+    """
+    # Values below zero are clipped to zero; everything else is kept as is.
+    grid_import = df[COLUMN_GRID_NAMED].clip(lower=0)
+    return df.assign(**{COLUMN_NET_IMPORT: grid_import})
+
+
+def compute_pv_metrics(df: pd.DataFrame, component_types: List[str]) -> pd.DataFrame:
+    """Compute PV-related metrics and add them to a copy of the DataFrame.
+
+    Args:
+        df: Input DataFrame with 'pv_neg' and 'consumption' columns, plus
+            'battery_pos' when 'battery' is in `component_types`.
+        component_types: List of present component types.
+
+    Returns:
+        Copy with the PV metric columns; the share is NaN where consumption is 0.
+    """
+    df = df.copy()
+    consumption = df[COLUMN_CONSUMPTION]
+    df["pv_prod"] = -df["pv_neg"]
+    df["pv_excess"] = (df["pv_prod"] - consumption).clip(lower=0)
+    has_battery = "battery" in component_types
+    df["pv_bat"] = df[["pv_excess", "battery_pos"]].min(axis=1) if has_battery else 0
+    df["pv_feedin"] = df["pv_excess"] - df["pv_bat"]
+    df["pv_self"] = (df["pv_prod"] - df["pv_excess"]).clip(lower=0)
+    # Mask zero consumption via `where` (NaN) so the share column stays numeric.
+    nonzero_consumption = consumption.where(consumption != 0)
+    df["pv_self_consumption_share"] = df["pv_self"] / nonzero_consumption
+    return df
+
+
+def apply_renaming(
+    df: pd.DataFrame, component_types: List[str], mcfg: Any
+) -> pd.DataFrame:
+    """Apply full renaming: static columns and dynamic component columns.
+
+    Static columns are renamed via `_get_rename_map`. Columns whose name
+    consists only of digits are treated as single-component columns and get
+    a "Batterie #<id>" or "PV #<id>" label when `mcfg` lists that ID for
+    the respective component type.
+
+    Args:
+        df: Input DataFrame; it is not modified.
+        component_types: List of present component types.
+        mcfg: Configuration object providing
+            `component_type_ids(component_type=..., component_category=...)`.
+
+    Returns:
+        New DataFrame with renamed columns.
+    """
+    rename_map = _get_rename_map(component_types)
+    id_columns = [col for col in df.columns if col.isdigit()]
+
+    # PV is applied last so it wins if an ID is listed for both types.
+    for component_type, label in (("battery", "Batterie"), ("pv", "PV")):
+        if component_type not in component_types:
+            continue
+        type_ids = {
+            str(component_id)
+            for component_id in mcfg.component_type_ids(
+                component_type=component_type, component_category="meter"
+            )
+        }
+        rename_map.update(
+            {col: f"{label} #{col}" for col in id_columns if col in type_ids}
+        )
+
+    # `rename` already returns a new frame, so no defensive copy is needed.
+    return df.rename(columns=rename_map)
+
+
+def prepare_reporting_dfs(
+    df: pd.DataFrame, component_types: List[str], mcfg: Any
+) -> Tuple[pd.DataFrame, pd.DataFrame]:
+    """Run the reporting pipeline and return the master and renamed frames.
+
+    Steps: reset the index, convert timestamps, rename columns, add grid import.
+
+    Args:
+        df: Input data frame with raw microgrid data.
+        component_types: List of component types present.
+        mcfg: Configuration object with component metadata.
+
+    Returns:
+        Tuple of (master DataFrame, fully renamed DataFrame).
+    """
+    localized = convert_timezone(df.reset_index(drop=True))
+    renamed = process_grid_data(apply_renaming(localized, component_types, mcfg))
+    master_columns = _get_master_columns(renamed.columns, component_types)
+    # The master frame is the column subset selected for the given types.
+    return renamed[master_columns], renamed
+
+
+def _get_master_columns(
+    columns: "pd.Index[str]", component_types: List[str]
+) -> List[str]:
+    """Determine relevant columns for the master DataFrame based on component types.
+
+    Args:
+        columns: Column labels of the renamed DataFrame.
+        component_types: List of present component types.
+
+    Returns:
+        List of relevant column names for the master DataFrame.
+    """
+    # NOTE: the `columns` annotation is quoted on purpose. It is only meant for
+    # type checkers; evaluating `pd.Index[str]` at definition time can raise
+    # a TypeError when the runtime class is not subscriptable.
+    has_battery = "battery" in component_types
+    cols = [
+        "Zeitpunkt",
+        COLUMN_GRID_NAMED,
+        COLUMN_NET_IMPORT,
+        COLUMN_CONSUMPTION_NAMED,
+    ]
+
+    if has_battery:
+        cols.append(COLUMN_BATTERY_NAMED)
+
+    if "pv" in component_types:
+        cols += ["PV Durchsatz", COLUMN_PV_PROD, COLUMN_PV_SELF, COLUMN_PV_FEEDIN]
+        # NOTE(review): the PV share is only listed together with a battery;
+        # confirm that this is intended.
+        if has_battery:
+            cols += [COLUMN_PV_BAT, COLUMN_PV_SHARE]
+
+    # Add individual component columns like "PV #1", "Batterie #3", etc.
+    cols += [col for col in columns if "#" in col]
+    return cols