Skip to content

Commit a79d38b

Browse files
feat: export steps as dataframe
1 parent dc4c718 commit a79d38b

File tree

4 files changed

+428
-0
lines changed

4 files changed

+428
-0
lines changed
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from ._dataframe_utilities import convert_steps_to_dataframe
2+
3+
# flake8: noqa
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
from typing import Any, Dict, List
2+
3+
import pandas as pd
4+
from nisystemlink.clients.testmonitor.models import (
5+
Step,
6+
StepProjection,
7+
)
8+
from pandas import DataFrame
9+
10+
11+
def convert_steps_to_dataframe(steps: List[Step]) -> DataFrame:
    """Converts a list of steps into a normalized dataframe.

    - A new column is created for each unique `properties` key across all steps. The property
      columns are named in the format `properties.property_name`.
    - `inputs` and `outputs` are converted from a list of name-value pairs to a dict and then
      normalized - similar to properties.
    - For each `parameter` entry in `data`, a new row is added in the dataframe, with all the
      other values duplicated. A step without any parameters keeps a single row whose
      `data.parameters.*` cells are empty.

    Args:
        steps: A list of steps.

    Returns:
        DataFrame: A Pandas DataFrame containing the steps data, with the columns grouped
        and ordered by category. An empty list of steps yields an empty DataFrame.
    """
    DATA_PARAMETERS = "data.parameters"

    steps_dataframe = pd.json_normalize(__restructure_steps(steps), sep=".")

    # Decide from the normalized columns instead of inspecting only the first step:
    # the column exists exactly when at least one step carries `data.parameters`.
    # This avoids a KeyError when steps have `data` without parameters and still
    # expands parameters when the first step happens to have no `data`.
    if DATA_PARAMETERS in steps_dataframe.columns:
        steps_dataframe = steps_dataframe.explode(DATA_PARAMETERS, ignore_index=True)

        # Steps with missing or empty parameters explode to NaN, which
        # `json_normalize` cannot process; substitute an empty record so those
        # rows are kept with empty parameter cells.
        parameter_records = [
            entry if isinstance(entry, dict) else {}
            for entry in steps_dataframe[DATA_PARAMETERS]
        ]
        parameters_dataframe = pd.json_normalize(parameter_records).add_prefix(
            f"{DATA_PARAMETERS}."
        )

        # Both frames share a 0..n-1 index (explode used `ignore_index=True`),
        # so the column-wise concat lines the rows up one to one.
        steps_dataframe = pd.concat(
            [
                steps_dataframe.drop(columns=[DATA_PARAMETERS]),
                parameters_dataframe,
            ],
            axis=1,
        )

    grouped_columns = __group_step_columns(steps_dataframe.columns)

    return steps_dataframe.reindex(columns=grouped_columns)
59+
60+
61+
def __restructure_steps(steps: List[Step]) -> List[Dict[str, Any]]:
    """Turns each step into a plain dict whose inputs and outputs are name-to-value mappings.

    The step models expose inputs and outputs as lists of name-value pairs. Left as lists,
    they would each end up in a single dataframe cell; as dictionaries they can be
    normalized into one column per name.

    Args:
        steps: A list of step responses retrieved from the API.

    Returns:
        List[Dict[str, Any]]: One dictionary per step (fields that are `None` are omitted),
        with the inputs and outputs entries replaced by dictionaries. A step without
        inputs or outputs gets an empty dictionary for that entry.
    """

    def pairs_to_mapping(pairs: Any) -> Dict[str, Any]:
        # A missing or empty list of pairs becomes an empty mapping.
        if not pairs:
            return {}
        return {pair.name: pair.value for pair in pairs}

    def to_record(step: Step) -> Dict[str, Any]:
        record = step.dict(exclude_none=True)
        record[StepProjection.INPUTS.lower()] = pairs_to_mapping(step.inputs)
        record[StepProjection.OUTPUTS.lower()] = pairs_to_mapping(step.outputs)
        return record

    return [to_record(step) for step in steps]
89+
90+
91+
def __group_step_columns(df_columns: List[str]) -> List[str]:
    """Groups and orders dataframe columns into predefined categories to maintain a consistent structure.

    When normalizing steps into a dataframe, new input, output, or property fields may be added at the end,
    disrupting the expected column order. This function ensures columns are grouped properly.

    A column belongs to a nested category when its first dot-separated segment is that
    category's name (for example `inputs.voltage` or `data.parameters.name`). Every other
    column, including `data_model`, is treated as a general column. Matching on the leading
    segment keeps a column such as `inputs.data_rate` or `properties.metadata` in its own
    category instead of being pulled into `data` by a substring match.

    Args:
        df_columns: The list of all columns from the normalized dataframe.

    Returns:
        List[str]: A list containing grouped and ordered columns: general columns first,
        then inputs, outputs, data and properties. The original relative order is kept
        within each group.
    """
    GENERAL_CATEGORIES = "general"
    NESTED_CATEGORIES = [
        StepProjection.INPUTS,
        StepProjection.OUTPUTS,
        StepProjection.DATA,
        StepProjection.PROPERTIES,
    ]
    CATEGORY_KEYS = [GENERAL_CATEGORIES, *NESTED_CATEGORIES]

    # Lookup from the lower-cased leading column segment to its category key.
    category_by_root = {category.lower(): category for category in NESTED_CATEGORIES}

    grouped_columns: Dict[str, List[str]] = {category: [] for category in CATEGORY_KEYS}

    for column in df_columns:
        root = column.lower().split(".", 1)[0]
        category = category_by_root.get(root, GENERAL_CATEGORIES)
        grouped_columns[category].append(column)

    return [
        column for category in CATEGORY_KEYS for column in grouped_columns[category]
    ]

tests/testmonitor/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# flake8: noqa

0 commit comments

Comments
 (0)