Skip to content

Commit 00beb19

Browse files
authored
Merge pull request #126 from dataiku/release/2.0.1
Release/2.0.1
2 parents 36b3f4f + 12ae84b commit 00beb19

24 files changed

+378
-257
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,11 @@
11
# Changelog
22

3+
## [Version 2.0.1] - Patch Release - 2025-11
4+
5+
* Small UI improvements
6+
* Bugfix on explicit Train/Test
7+
* Performance improvement
8+
39
## [Version 2.0.0] - New Feature Release - 2025-10
410

511
* Visual Webapp to train GLMs and assess their fit

plugin.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
{
22
"id": "generalized-linear-models",
3-
"version": "2.0.0",
3+
"version": "2.0.1",
44
"meta": {
55
"label": "Generalized Linear Models",
66
"description": "Train and deploy Generalized Linear Models",

python-lib/backend/api_utils.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -24,8 +24,12 @@ def format_models(global_dku_mltask):
2424
if is_conform:
2525
model_name = model_details.get_user_meta()['name']
2626
matches = re.findall(model_id_pattern, model_name)
27-
date = [v['value'] for v in model_details.get_user_meta()['labels'] if v['key'] == 'model:date'][0]
28-
models.append({"id": ml_id, "name": matches[0], "date": date, "project_key": project_key, "ml_task_id": ml_task_id, "analysis_id": analysis_id})
27+
found_date = [v['value'] for v in model_details.get_user_meta()['labels'] if v['key'] == 'model:date']
28+
if (len(found_date) > 0) and (len(matches) > 0):
29+
date = found_date[0]
30+
models.append({"id": ml_id, "name": matches[0], "date": date, "project_key": project_key, "ml_task_id": ml_task_id, "analysis_id": analysis_id})
31+
else:
32+
current_app.logger.info(f"model {ml_id} missing date or name info")
2933
else:
3034
current_app.logger.info(f"model {ml_id} is not conform")
3135
return models

python-lib/backend/fetch_api.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,13 @@
33

44
fetch_api = Blueprint("fetch_api", __name__, url_prefix="/api")
55

6+
@fetch_api.errorhandler(Exception)
7+
def handle_fetch_api_exception(e):
8+
current_app.logger.error(f"Error in fetch_api: {str(e)}")
9+
response = jsonify({"error": str(e)})
10+
response.status_code = 400
11+
return response
12+
613
@fetch_api.route("/send_webapp_id", methods=["POST"])
714
def update_config():
815
data_service = current_app.data_service

python-lib/backend/services.py

Lines changed: 1 addition & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@ class MockDataService:
2323
def train_model(self, request_json: dict):
2424
current_app.logger.info("Local set up: No model training completed")
2525
time.sleep(2)
26+
raise ValueError("Model training error: Simulated training error for testing purposes.")
2627
return {'message': 'Model training initiated successfully.'}
2728

2829
def deploy_model(self, request_json: dict):
@@ -102,16 +103,6 @@ def export_one_way(self, request_json: dict):
102103
csv_data = variable_level_stats_df.to_csv(index=False).encode('utf-8')
103104
return csv_data
104105

105-
# def get_excluded_columns(self):
106-
# exposure_column = "Exposure"
107-
# target_column = "ClaimAmount"
108-
109-
# cols_json = {
110-
# "target_column": target_column,
111-
# "exposure_column": exposure_column
112-
# }
113-
# return cols_json
114-
115106
def get_dataset_columns(self, request_json: dict):
116107
dataset_name = "claim_train"
117108
exposure_column = "exposure"

python-lib/dku_visual_ml/dku_base.py

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import dataiku
2+
import pandas as pd
23
from logging_assist.logging import logger
34

45
class DataikuClientProject:
@@ -20,8 +21,8 @@ def format_ml_task(self, ml_task_config):
2021
if split_params['ttPolicy'] == 'SPLIT_SINGLE_DATASET':
2122
test_set = ""
2223
split_policy = "random"
23-
elif split_params['ttPolicy'] == 'EXPLICIT_TEST_SET':
24-
test_set = "REPLACE_ME"
24+
elif split_params['ttPolicy'] == 'EXPLICIT_FILTERING_TWO_DATASETS':
25+
test_set = split_params['eftdTest']['datasetSmartName']
2526
split_policy = "explicit"
2627
else:
2728
test_set = ""
@@ -77,5 +78,11 @@ def get_datasets(self):
7778
def get_variables_for_dataset(self, dataset_name):
7879
dataset = dataiku.Dataset(dataset_name)
7980
columns = dataset.get_config()['schema']['columns']
80-
column_names = [{'name': column['name']} for column in columns]
81-
return column_names
81+
df = dataset.get_dataframe(limit=100)
82+
numeric_columns = []
83+
for column in columns:
84+
col_name = column['name']
85+
if col_name in df.columns:
86+
if pd.api.types.is_numeric_dtype(df[col_name]):
87+
numeric_columns.append({'name': col_name})
88+
return numeric_columns

python-lib/dku_visual_ml/dku_model_trainer.py

Lines changed: 3 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -55,20 +55,19 @@ def setup_using_existing_ml_task(self, mltask_id, analysis_id):
5555
logger.debug(f"Updating the ml task with analysis id {analysis_id} and mltask_id {mltask_id}")
5656

5757
self.mltask = self.project.get_ml_task(mltask_id=mltask_id, analysis_id=analysis_id)
58-
self.remove_failed_trainings()
5958

6059
logger.info(f"Successfully update the existing ML task")
6160

6261

6362
def assign_train_test_policy(self):
64-
logger.info(f"Assigning train test policy")
65-
63+
logger.info(f"Assigning train test policy")
6664
if hasattr(self.visual_ml_config, "policy"):
67-
if self.visual_ml_config.policy == "explicit_test_set":
65+
if self.visual_ml_config.policy == "Explicit":
6866
logger.info(f"Configuration specifies test set, assigning")
6967
settings = self.mltask.get_settings()
7068
settings.split_params.set_split_explicit(
7169
dku_dataset_selection_params,
70+
dku_dataset_selection_params,
7271
dataset_name=self.visual_ml_config.input_dataset,
7372
test_dataset_name=self.visual_ml_config.test_dataset_string)
7473
settings.save()
@@ -236,14 +235,6 @@ def set_code_env_settings(self,code_env_string):
236235
settings.mltask_settings['envSelection']['envName'] = code_env_string
237236
settings.save()
238237
logger.info(f"set code env settings to {self.mltask.get_settings().mltask_settings.get('envSelection')} ")
239-
240-
def remove_failed_trainings(self):
241-
242-
ids = self.mltask.get_trained_models_ids()
243-
for model_id in ids:
244-
state = self.mltask.get_trained_model_details(model_id).details.get('trainInfo').get('state')
245-
if state == "FAILED":
246-
self.mltask.delete_trained_model(model_id)
247238

248239

249240
def get_latest_model(self):
@@ -316,7 +307,6 @@ def train_model(self, code_env_string, session_name=None):
316307
if status == "FAILED":
317308
if error_message == "Failed to train : <class 'numpy.linalg.LinAlgError'> : Matrix is singular.":
318309
error_message = error_message + "Check colinearity of variables added to the model"
319-
self.remove_failed_trainings()
320310
return None, error_message
321311
else:
322312
return None, error_message

python-lib/glm_handler/dku_relativites_calculator.py

Lines changed: 67 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,12 @@ def __init__(self, data_handler, model_retriever, prepared_train_set=None, prepa
4848
logger.error(f"Error initializing RelativitiesCalculator: {e}")
4949
self.train_set = None
5050
self.test_set = None
51-
51+
52+
def _predict_from_df(self, df):
53+
preprocessed_data = self.model_retriever.predictor.preprocess(df)
54+
predictions_array = self.model_retriever.predictor._clf.predict(preprocessed_data[0])
55+
return predictions_array
56+
5257
def compute_base_values(self):
5358
logger.info("Computing base values on initiation.")
5459
params = self.model_retriever.predictor.params
@@ -102,7 +107,7 @@ def initialize_baseline(self):
102107

103108
def calculate_baseline_prediction(self, sample_train_row):
104109
logger.info("Calculating baseline prediction")
105-
return self.model_retriever.predictor.predict(sample_train_row).iloc[0][0]
110+
return self._predict_from_df(sample_train_row)[0]
106111

107112
def construct_relativities_df(self):
108113
logger.info("constructing relativites DF")
@@ -129,6 +134,7 @@ def construct_relativities_interaction_df(self):
129134
def get_relativities_df(self):
130135
"""
131136
Computes and returns the relativities DataFrame for the model.
137+
(Optimized with batch prediction)
132138
Returns:
133139
pd.DataFrame: The relativities DataFrame.
134140
"""
@@ -139,11 +145,13 @@ def get_relativities_df(self):
139145
self.relativities = {'base': {'base': baseline_prediction}}
140146
used_features = self.model_retriever.get_used_features()
141147

148+
dfs_to_predict = []
149+
features_and_values = [] # To map results back
150+
142151
for feature in used_features:
143152
feature_type = self.model_retriever.features[feature]['type']
144153
base_value = self.base_values[feature]
145-
self.relativities[feature] = {base_value: 1.0}
146-
train_row_copy = sample_train_row.copy()
154+
self.relativities[feature] = {}
147155

148156
exposure_col = self.model_retriever.exposure_columns
149157
exposure_per_modality = self.train_set.groupby(feature)[exposure_col].sum()
@@ -153,8 +161,22 @@ def get_relativities_df(self):
153161
values_to_process.append(base_value)
154162

155163
for value in values_to_process:
164+
if value == base_value:
165+
self.relativities[feature][value] = 1.0
166+
continue
167+
168+
train_row_copy = sample_train_row.copy()
156169
train_row_copy[feature] = value
157-
prediction = self.model_retriever.predictor.predict(train_row_copy).iloc[0][0]
170+
dfs_to_predict.append(train_row_copy)
171+
features_and_values.append((feature, value))
172+
173+
if dfs_to_predict:
174+
logger.info(f"Predicting batch of {len(dfs_to_predict)} rows for relativities...")
175+
batch_df = pd.concat(dfs_to_predict, ignore_index=True)
176+
batch_predictions = self._predict_from_df(batch_df)
177+
178+
for i, (feature, value) in enumerate(features_and_values):
179+
prediction = batch_predictions[i]
158180
relativity = prediction / baseline_prediction
159181
self.relativities[feature][value] = relativity
160182

@@ -165,6 +187,7 @@ def get_relativities_df(self):
165187
def get_relativities_interactions_df(self):
166188
"""
167189
Computes and returns the relativities DataFrame for the model.
190+
(Optimized with batch prediction)
168191
Returns:
169192
pd.DataFrame: The relativities DataFrame.
170193
"""
@@ -174,43 +197,58 @@ def get_relativities_interactions_df(self):
174197

175198
self.relativities_interaction = {}
176199
interactions = self.model_retriever.get_interactions()
200+
201+
dfs_to_predict = []
202+
features_and_values_list = [] # To map results back
177203

178204
for interaction in interactions:
179205
interaction_first = interaction[0]
180206
interaction_second = interaction[1]
181207

182208
base_value_first = self.base_values[interaction_first]
183209
base_value_second = self.base_values[interaction_second]
184-
try:
185-
self.relativities_interaction[interaction_first][interaction_second] = {base_value_first: {base_value_second: 1.0}}
186-
except KeyError:
187-
self.relativities_interaction[interaction_first] = {interaction_second: {base_value_first: {base_value_second: 1.0}}}
188-
train_row_copy = sample_train_row.copy()
210+
211+
# Initialize the nested dictionary structure
212+
if interaction_first not in self.relativities_interaction:
213+
self.relativities_interaction[interaction_first] = {}
214+
if interaction_second not in self.relativities_interaction[interaction_first]:
215+
self.relativities_interaction[interaction_first][interaction_second] = {}
216+
if base_value_first not in self.relativities_interaction[interaction_first][interaction_second]:
217+
self.relativities_interaction[interaction_first][interaction_second][base_value_first] = {}
218+
219+
# Set base relativity
220+
self.relativities_interaction[interaction_first][interaction_second][base_value_first][base_value_second] = 1.0
189221

190222
type_first = self.variable_types.get(interaction_first)
191223
type_second = self.variable_types.get(interaction_second)
192224

193-
if type_first == 'CATEGORICAL':
194-
values_to_process_first = self.modalities[interaction_first]
195-
else:
196-
values_to_process_first = [base_value_first]
197-
198-
if type_second == 'CATEGORICAL':
199-
values_to_process_second = self.modalities[interaction_second]
200-
else:
201-
values_to_process_second = [base_value_second]
202-
225+
values_to_process_first = self.modalities[interaction_first] if type_first == 'CATEGORICAL' else [base_value_first]
226+
values_to_process_second = self.modalities[interaction_second] if type_second == 'CATEGORICAL' else [base_value_second]
203227

204228
for value_first in values_to_process_first:
205229
for value_second in values_to_process_second:
230+
if value_first == base_value_first and value_second == base_value_second:
231+
continue # Skip base case, already set to 1.0
232+
233+
train_row_copy = sample_train_row.copy()
206234
train_row_copy[interaction_first] = value_first
207235
train_row_copy[interaction_second] = value_second
208-
prediction = self.model_retriever.predictor.predict(train_row_copy).iloc[0][0]
209-
relativity = prediction / baseline_prediction
210-
try:
211-
self.relativities_interaction[interaction_first][interaction_second][value_first][value_second] = relativity
212-
except KeyError:
213-
self.relativities_interaction[interaction_first][interaction_second][value_first] = {value_second: relativity}
236+
dfs_to_predict.append(train_row_copy)
237+
features_and_values_list.append((interaction_first, interaction_second, value_first, value_second))
238+
239+
# Predict on the entire batch at once
240+
if dfs_to_predict:
241+
logger.info(f"Predicting batch of {len(dfs_to_predict)} rows for interactions...")
242+
batch_df = pd.concat(dfs_to_predict, ignore_index=True)
243+
batch_predictions = self._predict_from_df(batch_df)
244+
245+
# Map results back
246+
for i, (f1, f2, v1, v2) in enumerate(features_and_values_list):
247+
prediction = batch_predictions[i]
248+
relativity = prediction / baseline_prediction
249+
if v1 not in self.relativities_interaction[f1][f2]:
250+
self.relativities_interaction[f1][f2][v1] = {}
251+
self.relativities_interaction[f1][f2][v1][v2] = relativity
214252

215253
relativities_interaction_df = self.construct_relativities_interaction_df()
216254
logger.info("Relativities DataFrame computed")
@@ -245,7 +283,7 @@ def prepare_dataset(self, dataset_type='train'):
245283
else:
246284
raise ValueError("dataset_type must be either 'train' or 'test'")
247285

248-
predicted = self.model_retriever.predictor.predict(dataset)
286+
predicted = self._predict_from_df(dataset)
249287
dataset['predicted'] = predicted
250288
dataset['weight'] = 1 if self.model_retriever.exposure_columns is None else dataset[self.model_retriever.exposure_columns]
251289

@@ -311,12 +349,9 @@ def weighted_mean(x):
311349
if other_feature != feature:
312350
feature_df[other_feature] = self.base_values[other_feature]
313351

314-
logger.debug("predictions")
315-
logger.debug(feature_df)
316-
predictions = self.model_retriever.predictor.predict(feature_df)
317-
logger.debug(predictions)
352+
predictions = self._predict_from_df(feature_df)
318353
base_data[feature] = pd.DataFrame({
319-
f'base_{feature}': predictions['prediction'],
354+
f'base_{feature}': predictions,
320355
feature: feature_df[feature]
321356
})
322357

resource/dist/assets/index.css

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

0 commit comments

Comments (0)