Skip to content

Commit 8c02fa6

Browse files
committed
Fix minor issue with processing flow variables
1 parent 6cdc298 commit 8c02fa6

File tree

3 files changed

+89
-33
lines changed

3 files changed

+89
-33
lines changed

src/pownet/core/output.py

Lines changed: 52 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -390,42 +390,76 @@ def get_max_line_usage(
390390
line_locations: pd.DataFrame,
391391
rated_line_capacities: dict[tuple[str, str], int],
392392
) -> pd.DataFrame:
393+
"""Calculates the maximum utilization for each transmission line.
394+
395+
This function takes the flow results from an optimization model,
396+
determines the peak flow on each line over the entire simulation horizon,
397+
and then calculates the utilization of each line as a percentage of its
398+
rated capacity. It also merges location data for the lines.
399+
400+
Args:
401+
flow_variables (pd.DataFrame): DataFrame containing flow values for each
402+
line at each timestep. Expected columns: 'node_a', 'node_b',
403+
'value' (flow magnitude), and 'hour'.
404+
line_locations (pd.DataFrame): DataFrame containing location or other
405+
metadata for each line. Expected to be indexed by a
406+
MultiIndex ('source', 'sink').
407+
rated_line_capacities (dict[tuple[str, str], int]): Dictionary mapping
408+
line tuples (source_node, sink_node) to their rated
409+
power capacity (e.g., in MW).
410+
411+
Returns:
412+
pd.DataFrame: A DataFrame indexed by ('source', 'sink') with columns
413+
including 'max_line_usage' (peak flow / rated capacity),
414+
columns from `line_locations`, and 'rated_capacity'.
415+
"""
393416

394417
# Prevent unintentional modification to the original dataframe
395-
flow_variables = flow_variables.copy()
418+
flow_vars = flow_variables.copy()
396419

397-
flow_variables = flow_variables.rename(
420+
# Standardize column names and remove unnecessary columns
421+
flow_vars = flow_vars.rename(
398422
columns={"node_a": "source", "node_b": "sink"}
399-
).drop("hour", axis=1)
400-
# Flow can be negative due to flow being directional
401-
flow_variables["value"] = flow_variables["value"].abs()
423+
).drop(
424+
"hour", axis=1
425+
) # Assuming 'hour' is not needed for max usage across all time
402426

403427
# Find the max_value for each line segment across the whole time horizon
404-
flow_variables["max_value"] = flow_variables.groupby(["source", "sink"])[
428+
# Flow variables are non-negative, so we can use max() to find the peak flow.
429+
flow_vars["max_value"] = flow_vars.groupby(["source", "sink"])[
405430
"value"
406431
].transform("max")
432+
407433
# Drop duplicates because we are only interested in the maximum flow
408-
# over the whole simulation
409-
flow_variables = flow_variables.drop_duplicates(subset=["source", "sink"])
434+
# over the whole simulation for each unique line
435+
flow_vars = flow_vars.drop_duplicates(subset=["source", "sink"])
410436

411-
# Max utilization rate
412-
flow_variables["max_line_usage"] = flow_variables.apply(
437+
# Calculate maximum utilization rate
438+
# Ensure that the (row["source"], row["sink"]) tuple exactly matches the keys in rated_line_capacities
439+
flow_vars["max_line_usage"] = flow_vars.apply(
413440
lambda row: row["max_value"]
414441
/ rated_line_capacities[(row["source"], row["sink"])],
415442
axis=1,
416443
).round(4)
417444

418-
flow_variables = flow_variables[["source", "sink", "max_line_usage"]].set_index(
419-
["source", "sink"]
420-
)
421-
flow_variables = flow_variables.merge(
445+
# Select and re-index the DataFrame
446+
flow_vars = flow_vars[
447+
["source", "sink", "max_value", "max_line_usage"]
448+
].set_index(["source", "sink"])
449+
450+
# Merge with line location data
451+
# The index of flow_vars is now (source, sink)
452+
# line_locations should also be indexed by (source, sink) for a clean merge
453+
flow_vars = flow_vars.merge(
422454
line_locations, how="left", left_index=True, right_index=True
423455
)
424-
# Append rated capacities
425-
flow_variables["rated_capacity"] = [
426-
rated_line_capacities[idx] for idx in flow_variables.index
456+
457+
# Ensure that the index of flow_vars (which is (source, sink))
458+
# correctly aligns with the keys in rated_line_capacities
459+
flow_vars["rated_capacity"] = [
460+
rated_line_capacities[idx] for idx in flow_vars.index
427461
]
428-
return flow_variables
462+
return flow_vars
429463

430464
def get_fuel_mix(self, hourly_generation: pd.DataFrame) -> pd.DataFrame:
431465
"""Return the fuel mix (%) for the whole simulation period."""

src/pownet/core/visualizer.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,7 @@ def plot_lmp(
256256
def plot_line_usage(
257257
self,
258258
max_line_usage: pd.DataFrame,
259-
output_folder: str,
259+
output_folder: str = None,
260260
) -> None:
261261
"""Flow variables must have the max_line_usage column"""
262262
max_line_usage = create_geoseries_columns(max_line_usage)
@@ -273,7 +273,7 @@ def get_linewidth(capacity):
273273
max_linewidth = 6
274274
# Scale capacity to between 1 and 10 to avoid log(0) errors
275275
scaled_capacity = 1 + 9 * (capacity - min_capacity) / (
276-
max_capacity - min_capacity
276+
max_capacity - min_capacity + 0.0001 # to avoid division by zero
277277
)
278278
log_capacity = np.log10(scaled_capacity)
279279
# Scale the log value to the desired linewidth range.
@@ -324,7 +324,7 @@ def get_linewidth(capacity):
324324
plt.tight_layout()
325325
plt.subplots_adjust(bottom=0.2)
326326

327-
if output_folder is not None:
327+
if output_folder:
328328
figure_name = f"{self.model_id}_line_usage.png"
329329
fig = ax.get_figure()
330330
fig.savefig(

src/pownet/data_utils.py

Lines changed: 34 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -327,27 +327,49 @@ def parse_flow_variables(
327327
solution: pd.DataFrame, sim_horizon: int, step_k: int
328328
) -> pd.DataFrame:
329329
"""
330-
The flow variables are in the (node, node, t) format.
330+
Parses flow variables from the solution DataFrame.
331+
The flow variables are expected in the format:
332+
flow_fwd[node_a,node_b,t] or flow_bwd[node_a,node_b,t].
331333
332334
Args:
333-
solution: The solution DataFrame.
334-
sim_horizon: The length of the simulation horizon.
335-
step_k: The current simulation period.
335+
solution: The solution DataFrame with a 'varname' column.
336+
sim_horizon: The length of the simulation horizon for a single step_k (e.g., 24 hours).
337+
step_k: The current simulation period (1-indexed).
336338
337339
Returns:
338-
pd.DataFrame: The flow variables DataFrame
340+
pd.DataFrame: A DataFrame with parsed flow variables, including
341+
columns for 'node_a', 'node_b', 'type' (fwd/bwd),
342+
'value', 'timestep' (relative to step_k), and 'hour' (absolute).
339343
"""
340-
flow_var_pattern = r"flow\[(.+),(.+),(\d+)\]"
341-
cur_flow_vars = solution[solution["varname"].str.match(flow_var_pattern)].copy()
344+
# Matches flow_fwd[node_a,node_b,t] or flow_bwd[node_a,node_b,t]
345+
# It captures the type (fwd or bwd), node_a, node_b, and t.
346+
flow_var_pattern = r"flow_(fwd|bwd)\[([^,]+),([^,]+),(\d+)\]"
342347

343-
cur_flow_vars[["node_a", "node_b", "timestep"]] = cur_flow_vars[
344-
"varname"
345-
].str.extract(flow_var_pattern, expand=True)
348+
# Filter rows that match the flow variable pattern
349+
flow_vars_mask = solution["varname"].str.contains(
350+
r"flow_(?:fwd|bwd)\[.+,.+,\d+\]", regex=True
351+
)
352+
cur_flow_vars = solution[flow_vars_mask].copy()
346353

354+
if cur_flow_vars.empty:
355+
return pd.DataFrame(
356+
columns=["node_a", "node_b", "type", "value", "timestep", "hour"]
357+
)
358+
359+
# Extract components from varname
360+
extracted_data = cur_flow_vars["varname"].str.extract(flow_var_pattern, expand=True)
361+
cur_flow_vars[["type", "node_a", "node_b", "timestep"]] = extracted_data
362+
363+
# Convert timestep to integer
347364
cur_flow_vars["timestep"] = cur_flow_vars["timestep"].astype(int)
365+
366+
# Calculate absolute hour
367+
# Assuming sim_horizon is the number of timesteps within one step_k
368+
# and step_k is 1-indexed.
348369
cur_flow_vars["hour"] = cur_flow_vars["timestep"] + sim_horizon * (step_k - 1)
349-
cur_flow_vars = cur_flow_vars.drop("varname", axis=1)
350-
return cur_flow_vars
370+
371+
final_columns = ["node_a", "node_b", "value", "type", "timestep", "hour"]
372+
return cur_flow_vars[final_columns]
351373

352374

353375
def parse_syswide_variables(

0 commit comments

Comments (0)