Skip to content

Commit d00525e

Browse files
committed
Develop
1 parent e382b79 commit d00525e

File tree

5 files changed

+244
-83
lines changed

5 files changed

+244
-83
lines changed

data_science/src_nyc_taxi/evaluation/__init__.py

Lines changed: 176 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -12,48 +12,199 @@
1212
from helperFunctions.helperFunction import *
1313
from pyspark.sql.functions import *
1414
from pyspark.sql.types import FloatType, IntegerType, StringType
15+
import mlflow
16+
from mlflow.tracking import MlflowClient
17+
18+
def wait_until_ready(model_name, model_version, client):
    """Poll the model registry until a model version reports READY.

    The status of ``model_version`` of ``model_name`` is checked up to ten
    times, one second apart, and printed on every poll.

    Args:
        model_name: Name of the registered model.
        model_version: Version of the registered model to check.
        client: Object exposing ``get_model_version(name=..., version=...)``
            (an ``MlflowClient`` at the call sites in this file).

    Returns:
        True if the version reached READY within the polling budget,
        False if every poll finished without seeing READY.
    """
    # Imported locally so the function does not rely on star-imports to
    # bring these names into scope.
    import time
    from mlflow.entities.model_registry.model_version_status import ModelVersionStatus

    max_polls = 10
    for attempt in range(max_polls):
        details = client.get_model_version(
            name=model_name,
            version=model_version,
        )
        status = ModelVersionStatus.from_string(details.status)
        print("Model status: %s" % ModelVersionStatus.to_string(status))
        if status == ModelVersionStatus.READY:
            return True
        # No point sleeping after the final poll; nothing is checked afterwards.
        if attempt < max_polls - 1:
            time.sleep(1)
    return False
29+
1530

16-
def evaluation(fs, taxi_data, model_name):
31+
32+
def evaluation(fs, taxi_data, model_name, model, training_set, run_id, client):
1733
taxi_data = rounded_taxi_data(taxi_data)
1834

1935
cols = ['fare_amount', 'trip_distance', 'pickup_zip', 'dropoff_zip', 'rounded_pickup_datetime', 'rounded_dropoff_datetime']
2036
taxi_data_reordered = taxi_data.select(cols)
2137
display(taxi_data_reordered)
2238

2339

40+
# If no model currently exists in the production stage, simply register the model and promote it to the production stage
41+
model_stage = "production"
42+
model_uri = "models:/{model_name}/{model_stage}".format(model_name=model_name, model_stage=model_stage)
43+
44+
if not model_uri:
45+
46+
# MOVE TO REGISTRATION #############################################################################################
47+
48+
#artifact_path = "model"
49+
#model_uri = "runs:/{run_id}/{artifact_path}".format(run_id=run_id, artifact_path=artifact_path)
50+
51+
latest_model_version = get_latest_model_version(model_name)
52+
model_uri = f"models:/taxi_example_fare_packaged/{latest_model_version}"
53+
54+
model_details = mlflow.register_model(model_uri=model_uri, name=model_name)
55+
56+
# wait until the registered model is ready
57+
wait_until_ready(model_details.name, model_details.version, client)
58+
59+
client.update_registered_model(
60+
name=model_details.name,
61+
description="Insert"
62+
)
63+
64+
client.update_model_version(
65+
name=model_details.name,
66+
version=model_details.version,
67+
description="Insert"
68+
)
69+
#############################################################################################################################
70+
71+
else:
72+
73+
# Score Production - MOVING TO SCORE - #############################################################################################
74+
model_stage = "production"
75+
model_uri = "models:/{model_name}/{model_stage}".format(model_name=model_name, model_stage=model_stage)
76+
with_predictions = fs.score_batch(model_uri, taxi_data)
77+
78+
# Get Latest Version: this is the model you have just trained
79+
latest_model_version = get_latest_model_version(model_name)
80+
model_uri = "models:/{model_name}/{latest_model_version}".format(model_name=model_name, latest_model_version=latest_model_version)
81+
with_predictions = fs.score_batch(model_uri, taxi_data)
82+
83+
84+
import pyspark.sql.functions as func
85+
cols = ['prediction', 'fare_amount', 'trip_distance', 'pickup_zip', 'dropoff_zip',
86+
'rounded_pickup_datetime', 'rounded_dropoff_datetime', 'mean_fare_window_1h_pickup_zip',
87+
'count_trips_window_1h_pickup_zip', 'count_trips_window_30m_dropoff_zip', 'dropoff_is_weekend']
88+
89+
with_predictions_reordered = (
90+
with_predictions.select(
91+
cols,
92+
)
93+
.withColumnRenamed(
94+
"prediction",
95+
"predicted_fare_amount",
96+
)
97+
.withColumn(
98+
"predicted_fare_amount",
99+
func.round("predicted_fare_amount", 2),
100+
)
101+
)
102+
display(with_predictions_reordered)
103+
# Get the R2 etc. ####################################################################################################################
104+
105+
106+
107+
108+
109+
# CREATE LOGIC DEFINING WHEN TO PROMOTE MODEL (EVALUATION)
110+
is_improvement = True
111+
##########################################################
112+
113+
if is_improvement:
114+
115+
# MOVE TO "REGISTRATION" SCRIPTS - CALL FUNCTION FROM HERE
116+
model_details = mlflow.register_model(model_uri=model_uri, name=model_name)
117+
118+
# wait until the registered model is ready
119+
wait_until_ready(model_details.name, model_details.version, client)
120+
121+
client.update_registered_model(
122+
name=model_details.name,
123+
description="Insert"
124+
)
125+
126+
client.update_model_version(
127+
name=model_details.name,
128+
version=model_details.version,
129+
description="Insert"
130+
)
131+
132+
# Demote Staging to None
133+
staging_stage = 'staging'
134+
no_stage = None
135+
# Get the latest model version in the staging stage
136+
latest_production_version = mlflow.get_latest_versions(
137+
name=model_name,
138+
stages=[staging_stage],
139+
order_by=['creation_time desc'],
140+
max_results=1
141+
)[0].version
142+
143+
mlflow.transition_model_version_stage(
144+
name=model_name,
145+
version=latest_production_version,
146+
stage=no_stage
147+
)
148+
149+
150+
# Demote Production To Staging (Keeps Incumbent Model As A BackStop)
151+
production_stage = 'production'
152+
staging_stage = 'staging'
153+
# Get the latest model version in the production stage
154+
latest_production_version = mlflow.get_latest_versions(
155+
name=model_name,
156+
stages=[production_stage],
157+
order_by=['creation_time desc'],
158+
max_results=1
159+
)[0].version
160+
161+
# Demote the latest model version from production to staging
162+
mlflow.transition_model_version_stage(
163+
name=model_name,
164+
version=latest_production_version,
165+
stage=staging_stage
166+
)
167+
168+
169+
# Get latest registered model. This is the challenger that will be promoted to Production
170+
latest_registered_version = mlflow.get_latest_versions(
171+
name=model_name,
172+
order_by=['creation_time desc'],
173+
max_results=1
174+
)[0].version
175+
176+
mlflow.transition_model_version_stage(
177+
name=model_name,
178+
version=latest_registered_version,
179+
stage=production_stage
180+
)
181+
182+
183+
24184
# Get the model URI
25185
latest_model_version = get_latest_model_version(model_name)
26186
model_uri = f"models:/taxi_example_fare_packaged/{latest_model_version}"
27-
187+
with_predictions = fs.score_batch(model_uri, taxi_data)
28188
#If there is no model registered with this name, then register it, and promote it to production.
29189

30190
# If there is a model that is registered and in the production stage, then 1. load it, 2. score it.
31191
# 3. Load model that you've just logged. compare the results.
32192
# 4. If better, then promote the most recent version of the model to the production stage, and demote the current production model to staging
33193

34194

35-
36-
37-
38-
with_predictions = fs.score_batch(model_uri, taxi_data)
39-
40-
print()
41-
42-
43195
# COMMAND ----------
44-
latest_pyfunc_version = get_latest_model_version("pyfunc_taxi_fare_packaged")
45-
pyfunc_model_uri = f"models:/pyfunc_taxi_fare_packaged/{latest_pyfunc_version}"
46-
pyfunc_predictions = fs.score_batch(pyfunc_model_uri,
47-
taxi_data,
48-
result_type='string')
196+
#latest_pyfunc_version = get_latest_model_version("pyfunc_taxi_fare_packaged")
197+
#pyfunc_model_uri = f"models:/pyfunc_taxi_fare_packaged/{latest_pyfunc_version}"
198+
#pyfunc_predictions = fs.score_batch(pyfunc_model_uri,
199+
# taxi_data,
200+
# result_type='string')
49201

50202

51203
# COMMAND ----------
52204
import pyspark.sql.functions as func
53205
cols = ['prediction', 'fare_amount', 'trip_distance', 'pickup_zip', 'dropoff_zip',
54206
'rounded_pickup_datetime', 'rounded_dropoff_datetime', 'mean_fare_window_1h_pickup_zip',
55207
'count_trips_window_1h_pickup_zip', 'count_trips_window_30m_dropoff_zip', 'dropoff_is_weekend']
56-
57208
with_predictions_reordered = (
58209
with_predictions.select(
59210
cols,
@@ -67,19 +218,25 @@ def evaluation(fs, taxi_data, model_name):
67218
func.round("predicted_fare_amount", 2),
68219
)
69220
)
70-
71221
display(with_predictions_reordered)
72222

73223
# COMMAND ----------
74-
display(pyfunc_predictions.select('fare_amount', 'prediction'))
224+
#display(pyfunc_predictions.select('fare_amount', 'prediction'))
75225

76226
# COMMAND ----------
77227

78228
if __name__ == "__main__":
    # Script entry point: build the clients and inputs, then run evaluation().
    fs = feature_store.FeatureStoreClient()
    model_name = "taxi_example_fare_packaged"
    taxi_data = spark.read.table("feature_store_taxi_example.nyc_yellow_taxi_with_zips")

    # mlflow.active_run() returns None when no run is active, so guard the
    # attribute access instead of failing with an AttributeError.
    active_run = mlflow.active_run()
    run_id = active_run.info.run_id if active_run is not None else None

    # Do not log

    # training_set will be returned from another function.
    training_set = []
    # evaluation() declares `model` as a required parameter; pass a placeholder
    # until the trained model is handed over from the training step.
    # TODO: replace with the model returned by training.
    model = None
    client = MlflowClient()
    evaluation(
        fs=fs,
        taxi_data=taxi_data,
        model_name=model_name,
        model=model,
        training_set=training_set,
        run_id=run_id,
        client=client,
    )
83240

84241

85242

data_science/src_nyc_taxi/training/__init__.py

Lines changed: 58 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -284,83 +284,78 @@ def train_model_lgbm(
284284
y_test = test.fare_amount
285285

286286

287-
#mlflow.end_run()
288-
#mlflow.autolog(exclusive=False)
289-
#with mlflow.start_run():
290-
#mlflow.lightgbm.autolog()
291-
292-
293-
train_lgb_dataset = lgb.Dataset(
294-
X_train,
295-
label=y_train.values
296-
)
297-
298-
test_lgb_dataset = lgb.Dataset(
299-
X_test,
300-
label=y_test.values
287+
mlflow.end_run()
288+
mlflow.autolog(exclusive=False)
289+
with mlflow.start_run():
290+
train_lgb_dataset = lgb.Dataset(
291+
X_train,
292+
label=y_train.values
293+
)
294+
295+
test_lgb_dataset = lgb.Dataset(
296+
X_test,
297+
label=y_test.values
298+
)
299+
300+
mlflow.log_param("num_leaves", "32")
301+
mlflow.log_param("objective", "regression")
302+
mlflow.log_param( "metric", "rmse")
303+
mlflow.log_param("learn_rate", "100")
304+
305+
param = {
306+
"num_leaves": 32,
307+
"objective": "regression",
308+
"metric": "rmse"
309+
}
310+
num_rounds = 100
311+
312+
# Train a lightGBM model
313+
model = lgb.train(
314+
param,
315+
train_lgb_dataset,
316+
num_rounds
301317
)
302-
303-
mlflow.log_param("num_leaves", "32")
304-
mlflow.log_param("objective", "regression")
305-
mlflow.log_param( "metric", "rmse")
306-
mlflow.log_param("learn_rate", "100")
307-
308-
param = {
309-
"num_leaves": 32,
310-
"objective": "regression",
311-
"metric": "rmse"
312-
}
313-
num_rounds = 100
314-
315-
# Train a lightGBM model
316-
model = lgb.train(
317-
param,
318-
train_lgb_dataset,
319-
num_rounds
320-
)
321318

322319

323320

324-
# Below Should be In Predict
321+
# Below Should be In Predict
325322

326323

327324

328-
#Save The Model
325+
#Save The Model
329326

330-
self.create_model_folder()
327+
self.create_model_folder()
331328

332-
model_file_path = self.get_model_file_path("taxi_example_fare_packaged")
333-
print(f"ModelFilePath: {model_file_path}")
334-
joblib.dump(
335-
model,
336-
open(model_file_path,'wb')
337-
)
338-
mlflow.log_param("local_model_file_path", model_file_path)
339-
340-
expected_y = y_test
341-
predicted_y = model.predict(X_test)
342-
343-
r2 = metrics.r2_score(
344-
expected_y,
345-
predicted_y
329+
model_file_path = self.get_model_file_path("taxi_example_fare_packaged")
330+
print(f"ModelFilePath: {model_file_path}")
331+
joblib.dump(
332+
model,
333+
open(model_file_path,'wb')
346334
)
335+
mlflow.log_param("local_model_file_path", model_file_path)
347336

348-
mlflow.log_metric(
349-
"r2",
350-
r2)
351-
337+
expected_y = y_test
338+
predicted_y = model.predict(X_test)
352339

353-
# log the model
340+
r2 = metrics.r2_score(
341+
expected_y,
342+
predicted_y
343+
)
344+
345+
mlflow.log_metric(
346+
"r2",
347+
r2)
354348

355-
fs.log_model(
356-
model,
357-
artifact_path="model_packaged",
358-
flavor=mlflow.lightgbm,
359-
training_set=training_set,
360-
registered_model_name=model_name
361-
)
349+
350+
fs.log_model(
351+
model,
352+
artifact_path="model_packaged",
353+
flavor=mlflow.lightgbm,
354+
training_set=training_set,
355+
registered_model_name=model_name
356+
)
362357

363-
return model
358+
return model, model_file_path
364359

365360

366361
# COMMAND ----------

0 commit comments

Comments
 (0)