1+ import logging
2+ import pickle
3+ from datetime import datetime
4+ from typing import Any , List , Optional
5+
6+ import pandas as pd
7+ from fastapi import HTTPException
8+ from pydantic import BaseModel
9+
10+ from src .service .data .storage import get_storage_interface
11+
12+ logger = logging .getLogger (__name__ )
13+
14+
class RowMatcher(BaseModel):
    """A single row-filtering predicate applied to one dataframe column.

    Consumed by apply_matcher / apply_equals_matcher / apply_between_matcher
    below; field names are camelCase to match the wire format of the request
    payload.
    """

    # Name of the dataframe column (feature, output, or trustyai.* metadata
    # column) that this predicate filters on.
    columnName: str
    # Matching operation; apply_matcher accepts only "EQUALS" or "BETWEEN".
    operation: str
    # Operands: for EQUALS, the set of values to match; for BETWEEN, exactly
    # two values giving the [lower, upper) bounds of the desired range.
    values: List[Any]
19+
20+
class DataRequestPayload(BaseModel):
    """Request body for a model-data download: a model id plus row filters.

    NOTE(review): the three matcher lists use a mutable `[]` default;
    pydantic copies field defaults per instance, so this is safe here,
    though `Field(default_factory=list)` is the usual convention — confirm
    if this model is ever instantiated outside pydantic validation.
    """

    # Identifier of the model whose inference data is requested.
    modelId: str
    # Row is kept if it satisfies AT LEAST ONE of these matchers (OR).
    matchAny: Optional[List[RowMatcher]] = []
    # Row is kept only if it satisfies ALL of these matchers (AND).
    matchAll: Optional[List[RowMatcher]] = []
    # Row is dropped if it satisfies ANY of these matchers (NOT).
    matchNone: Optional[List[RowMatcher]] = []
26+
27+
class DataResponsePayload(BaseModel):
    """Response body: the filtered model data serialized as a CSV string."""

    # CSV-rendered dataframe returned to the caller.
    dataCSV: str
30+
31+
def get_storage() -> Any:
    """Return the configured storage backend.

    Thin indirection over the storage factory so tests can patch a single
    seam instead of the factory module itself.
    """
    storage_instance = get_storage_interface()
    return storage_instance
35+
36+
def apply_matcher(df: pd.DataFrame, matcher: RowMatcher, negate: bool = False) -> pd.DataFrame:
    """Dispatch a single row matcher to its operation-specific handler.

    Raises HTTPException(400) for any operation other than EQUALS/BETWEEN.
    When `negate` is True the handler inverts its selection.
    """
    handlers = {
        "EQUALS": apply_equals_matcher,
        "BETWEEN": apply_between_matcher,
    }
    handler = handlers.get(matcher.operation)
    if handler is None:
        raise HTTPException(
            status_code=400,
            detail="RowMatch operation must be one of [BETWEEN, EQUALS]",
        )
    return handler(df, matcher, negate)
48+
49+
def apply_equals_matcher(df: pd.DataFrame, matcher: RowMatcher, negate: bool = False) -> pd.DataFrame:
    """Filter rows whose column value is (or, if negated, is not) in matcher.values.

    Raises HTTPException(400) when the named column is absent from the frame.
    """
    target = matcher.columnName
    if target not in df.columns:
        raise HTTPException(
            status_code=400,
            detail=f"No feature or output found with name={target}",
        )
    selected = df[target].isin(matcher.values)
    return df[~selected] if negate else df[selected]
63+
64+
def apply_between_matcher(df: pd.DataFrame, matcher: RowMatcher, negate: bool = False) -> pd.DataFrame:
    """Filter rows whose column value lies in the half-open range [lower, upper).

    matcher.values must hold exactly two bounds. Three column families are
    supported:
      * "trustyai.TIMESTAMP" — bounds parsed as datetimes;
      * "trustyai.INDEX"     — bounds coerced to int and sorted;
      * any other column     — bounds must be numeric; column values are
        coerced with pd.to_numeric (unparseable entries become NaN and are
        therefore excluded from the match).

    Args:
        df: dataframe to filter.
        matcher: BETWEEN matcher carrying the column name and two bounds.
        negate: when True, invert the selection.

    Raises:
        HTTPException(400): unknown column, wrong number of bounds,
            non-numeric bounds for an ordinary column, or unparseable
            timestamp bounds.
    """
    column_name = matcher.columnName
    values = matcher.values

    if column_name not in df.columns:
        raise HTTPException(
            status_code=400,
            detail=f"No feature or output found with name={column_name}",
        )
    errors = []
    if len(values) != 2:
        errors.append(
            f"BETWEEN operation must contain exactly two values, describing the lower and upper bounds of the desired range. Received {len(values)} values"
        )
    if column_name == "trustyai.TIMESTAMP":
        if errors:
            raise HTTPException(status_code=400, detail=", ".join(errors))
        try:
            start_time = pd.to_datetime(str(values[0]))
            end_time = pd.to_datetime(str(values[1]))
            df_times = pd.to_datetime(df[column_name])
            # half-open interval: >= start, strictly < end
            mask = (df_times >= start_time) & (df_times < end_time)
        except Exception as e:
            # chain the cause so the original parse error stays visible in logs
            raise HTTPException(
                status_code=400,
                detail=f"Timestamp value is unparseable as an ISO_LOCAL_DATE_TIME: {str(e)}",
            ) from e
    elif column_name == "trustyai.INDEX":
        if errors:
            raise HTTPException(status_code=400, detail=", ".join(errors))
        # sort so callers may pass the bounds in either order
        min_val, max_val = sorted(int(v) for v in values)
        mask = (df[column_name] >= min_val) & (df[column_name] < max_val)
    else:
        if not all(isinstance(v, (int, float)) for v in values):
            errors.append(
                "BETWEEN operation must only contain numbers, describing the lower and upper bounds of the desired range. Received non-numeric values"
            )
        if errors:
            raise HTTPException(status_code=400, detail=", ".join(errors))
        min_val, max_val = sorted(values)
        try:
            # hoisted: compute the coerced series once instead of twice
            numeric_col = pd.to_numeric(df[column_name], errors="coerce")
            mask = (numeric_col >= min_val) & (numeric_col < max_val)
        except (TypeError, ValueError):
            # was a bare `except:` — narrowed to the coercion failures this
            # fallback is meant for; compares lexicographically on strings
            col_as_str = df[column_name].astype(str)
            mask = (col_as_str >= str(min_val)) & (col_as_str < str(max_val))
    if negate:
        mask = ~mask
    return df[mask]
118+
119+
async def load_model_dataframe(model_id: str) -> pd.DataFrame:
    """Assemble one dataframe from a model's stored inputs, outputs and metadata.

    Reads three datasets ("{model_id}_inputs", "_outputs", "_metadata") from
    the storage backend, widens multi-column arrays that share a single
    logical column name into "{name}_{j}" columns, maps well-known metadata
    columns onto their "trustyai.*" names, and appends a synthetic
    "trustyai.INDEX" row-number column.

    Raises:
        HTTPException(404): model datasets are missing.
        HTTPException(500): any other failure while loading.
    """
    storage = get_storage()
    try:
        # each read returns (array-like data, list of column names) —
        # presumably aligned row-wise across the three datasets; confirm
        # against the storage interface
        input_data, input_cols = await storage.read_data(f"{model_id}_inputs")
        output_data, output_cols = await storage.read_data(f"{model_id}_outputs")
        metadata_data, metadata_cols = await storage.read_data(f"{model_id}_metadata")
        if input_data is None or output_data is None or metadata_data is None:
            raise HTTPException(status_code=404, detail=f"Model {model_id} not found")
        df = pd.DataFrame()
        if len(input_data) > 0:
            # one logical input name spanning several array columns:
            # expand into suffixed columns name_0, name_1, ...
            if input_data.ndim == 2 and len(input_cols) == 1 and input_data.shape[1] > 1:
                col_name = input_cols[0]
                for j in range(input_data.shape[1]):
                    df[f"{col_name}_{j}"] = input_data[:, j]
            else:
                input_df = pd.DataFrame(input_data, columns=input_cols)
                for col in input_cols:
                    df[col] = input_df[col]
        if len(output_data) > 0:
            # same widening rule as for inputs
            if output_data.ndim == 2 and len(output_cols) == 1 and output_data.shape[1] > 1:
                col_name = output_cols[0]
                for j in range(output_data.shape[1]):
                    df[f"{col_name}_{j}"] = output_data[:, j]
            else:
                # collapse a 2-D single-output array to 1-D before framing
                if output_data.ndim == 2:
                    output_data = output_data.flatten()
                output_df = pd.DataFrame({output_cols[0]: output_data})
                for col in output_cols:
                    df[col] = output_df[col]
        # metadata rows may be stored as pickled bytes; first element is used
        # as a proxy for the whole dataset's encoding.
        # NOTE(review): pickle.loads is unsafe on untrusted bytes — confirm
        # this storage is written only by this service.
        if len(metadata_data) > 0 and isinstance(metadata_data[0], bytes):
            deserialized_metadata = []
            for row in metadata_data:
                deserialized_row = pickle.loads(row)
                deserialized_metadata.append(deserialized_row)
            metadata_df = pd.DataFrame(deserialized_metadata, columns=metadata_cols)
        else:
            metadata_df = pd.DataFrame(metadata_data, columns=metadata_cols)
        # canonical trustyai.* names for well-known metadata columns;
        # unknown columns keep their original name
        trusty_mapping = {
            "id": "trustyai.ID",
            "model_id": "trustyai.MODEL_ID",
            "timestamp": "trustyai.TIMESTAMP",
            "tag": "trustyai.TAG",
        }
        for orig_col in metadata_cols:
            trusty_col = trusty_mapping.get(orig_col.lower(), orig_col)
            df[trusty_col] = metadata_df[orig_col]
        # synthetic row index used by the trustyai.INDEX BETWEEN matcher
        df["trustyai.INDEX"] = range(len(df))
        return df
    except Exception as e:
        # NOTE(review): the 404 raised above is re-routed here by matching
        # "not found" in the exception text — fragile string matching;
        # consider re-raising HTTPException untouched before this check
        if "not found" in str(e).lower() or "MissingH5PYDataException" in str(type(e).__name__):
            raise HTTPException(status_code=404, detail=f"Model {model_id} not found")
        raise HTTPException(status_code=500, detail=f"Error loading model data: {str(e)}")
0 commit comments