
Commit 03b1e76

wjayesh and strickvl authored
Fix some projects (#162)
* rm custom huggingface code
* fix role, syntax, and more
* upgrade deps
* use langchain community
* download required nltk data
* use langchain openai
* add faiss materializer
* failed attempt at agent executor materializer
* return agent and tools tuple
* update zenml version
* Update dependencies and README run command
* bump zenml version to the latest
* formatting
* Fix NLTK data download to use a writable directory
* Retrieve OpenAI API key from ZenML secrets in index generator
* Reorder imports in web_url_loader.py
* Add environment variable fallback for OpenAI API key in index generator
* Add OpenAI API key retrieval in FAISS materializer
* Add OpenAI API key retrieval in agent creator
* update zenml syntax
* update code with right syntax and requirements
* update workspace
* update requirements
* update makefile and requirements
* make it use uv
* mount local cache dir to docker orchestrator
* add instructions for local_docker orch

---------

Co-authored-by: Alex Strick van Linschoten <[email protected]>
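Several of these bullets implement one recurring pattern: instead of hard-coding the OpenAI API key, each component reads it from a ZenML secret and falls back to an environment variable. The affected files are not part of this excerpt, so the following is only a minimal sketch of that pattern; the secret name ("llm_complete") and key ("openai_api_key") are illustrative placeholders, not the project's actual values:

    import os

    from zenml.client import Client


    def get_openai_api_key() -> str:
        """Read the OpenAI API key from a ZenML secret, else the environment.

        The secret name and key below are placeholders, not the repo's values.
        """
        try:
            secret = Client().get_secret("llm_complete")
            return secret.secret_values["openai_api_key"]
        except KeyError:
            # No such secret (or key): fall back to the environment variable.
            return os.environ["OPENAI_API_KEY"]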
1 parent 8b8bb8d commit 03b1e76

File tree

121 files changed: +947, -2215 lines


airflow-cloud-composer-etl-feature-train/steps/etl/transform.py
Lines changed: 1 addition & 1 deletion

@@ -18,10 +18,10 @@
 import os
 from datetime import datetime, timezone
 from typing import Optional
-from typing_extensions import Annotated
 
 import pandas as pd
 from materializers import BigQueryDataset, CSVDataset
+from typing_extensions import Annotated
 from zenml import step
 from zenml.logger import get_logger
 
airflow-cloud-composer-etl-feature-train/steps/feature_engineering/augment.py
Lines changed: 1 addition & 1 deletion

@@ -18,9 +18,9 @@
 import os
 from datetime import datetime, timezone
 from typing import Optional
-from typing_extensions import Annotated
 
 from materializers import BigQueryDataset, CSVDataset
+from typing_extensions import Annotated
 from zenml import step
 from zenml.logger import get_logger
 
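In both files the fix is the same: typing_extensions moves out of the standard-library group and into its alphabetical slot among the third-party imports, per isort/ruff import ordering. Assembled from the diff above, the resulting header of transform.py reads:

    # Standard-library imports come first...
    import os
    from datetime import datetime, timezone
    from typing import Optional

    # ...followed by third-party and project imports in alphabetical order,
    # which is where typing_extensions now sorts.
    import pandas as pd
    from materializers import BigQueryDataset, CSVDataset
    from typing_extensions import Annotated
    from zenml import step
    from zenml.logger import get_logger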
classifier-e2e/run.py
Lines changed: 6 additions & 6 deletions

@@ -196,12 +196,12 @@ def main(
             test_dataset_name, test_dataset_version_name
         )
         # Use versioned artifacts
-        run_args_train[
-            "train_dataset_id"
-        ] = train_dataset_artifact_version.id
-        run_args_train[
-            "test_dataset_id"
-        ] = test_dataset_artifact_version.id
+        run_args_train["train_dataset_id"] = (
+            train_dataset_artifact_version.id
+        )
+        run_args_train["test_dataset_id"] = (
+            test_dataset_artifact_version.id
+        )
 
         run_args_train["random_state"] = random.randint(0, 1000)
classifier-e2e/run_full.ipynb
Lines changed: 79 additions & 49 deletions

@@ -41,6 +41,7 @@
 "! zenml login https://1cf18d95-zenml.cloudinfra.zenml.io \n",
 "\n",
 "import IPython\n",
+"\n",
 "IPython.Application.instance().kernel.do_shutdown(restart=True)"
 ]
 },
@@ -73,22 +74,16 @@
 "outputs": [],
 "source": [
 "# Do the imports at the top\n",
-"from typing_extensions import Annotated\n",
-"from sklearn.datasets import load_breast_cancer\n",
+"from uuid import UUID\n",
 "\n",
 "import pandas as pd\n",
-"from zenml import step, pipeline, Model, get_step_context\n",
+"from pipelines import feature_engineering, training\n",
+"from sklearn.datasets import load_breast_cancer\n",
+"from steps import data_loader, inference_preprocessor\n",
+"from typing_extensions import Annotated\n",
+"from zenml import Model, get_step_context, pipeline, step\n",
 "from zenml.client import Client\n",
 "from zenml.logger import get_logger\n",
-"from uuid import UUID\n",
-"\n",
-"from zenml import pipeline\n",
-"\n",
-"from steps import (\n",
-"    data_loader,\n",
-"    inference_preprocessor\n",
-")\n",
-"from pipelines import feature_engineering, training\n",
 "\n",
 "logger = get_logger(__name__)\n",
 "\n",
@@ -126,20 +121,22 @@
 "@step\n",
 "def data_loader_simplified(\n",
 "    random_state: int, is_inference: bool = False, target: str = \"target\"\n",
-") -> Annotated[pd.DataFrame, \"dataset\"]: # We name the dataset \n",
+") -> Annotated[pd.DataFrame, \"dataset\"]:  # We name the dataset\n",
 "    \"\"\"Dataset reader step.\"\"\"\n",
 "    dataset = load_breast_cancer(as_frame=True)\n",
 "    inference_size = int(len(dataset.target) * 0.05)\n",
 "    dataset: pd.DataFrame = dataset.frame\n",
-"    inference_subset = dataset.sample(inference_size, random_state=random_state)\n",
+"    inference_subset = dataset.sample(\n",
+"        inference_size, random_state=random_state\n",
+"    )\n",
 "    if is_inference:\n",
 "        dataset = inference_subset\n",
 "        dataset.drop(columns=target, inplace=True)\n",
 "    else:\n",
 "        dataset.drop(inference_subset.index, inplace=True)\n",
 "    dataset.reset_index(drop=True, inplace=True)\n",
 "    logger.info(f\"Dataset with {len(dataset)} records loaded!\")\n",
-"    return dataset\n"
+"    return dataset"
 ]
 },
 {
@@ -243,7 +240,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"feature_engineering(random_state=42,test_size=0.25)"
+"feature_engineering(random_state=42, test_size=0.25)"
 ]
 },
 {
@@ -340,7 +337,9 @@
 "outputs": [],
 "source": [
 "# Get artifact version from our run\n",
-"dataset_trn_artifact_version_via_run = run.steps[\"data_preprocessor\"].outputs[\"dataset_trn\"] \n",
+"dataset_trn_artifact_version_via_run = run.steps[\"data_preprocessor\"].outputs[\n",
+"    \"dataset_trn\"\n",
+"]\n",
 "\n",
 "# Get latest version from client directly\n",
 "dataset_trn_artifact_version = client.get_artifact_version(\"dataset_trn\")\n",
@@ -359,7 +358,9 @@
 "source": [
 "# Fetch the rest of the artifacts\n",
 "dataset_tst_artifact_version = client.get_artifact_version(\"dataset_tst\")\n",
-"preprocessing_pipeline_artifact_version = client.get_artifact_version(\"preprocess_pipeline\")"
+"preprocessing_pipeline_artifact_version = client.get_artifact_version(\n",
+"    \"preprocess_pipeline\"\n",
+")"
 ]
 },
 {
@@ -480,7 +481,7 @@
 "training.with_options(enable_cache=False)(\n",
 "    model_type=\"xgboost\",\n",
 "    train_dataset_id=dataset_trn_artifact_version.id,\n",
-"    test_dataset_id=dataset_tst_artifact_version.id\n",
+"    test_dataset_id=dataset_tst_artifact_version.id,\n",
 ")\n",
 "\n",
 "xgboost_run = client.get_pipeline(\"training\").last_run"
@@ -497,7 +498,7 @@
 "sgd_run = training.with_options(enable_cache=False)(\n",
 "    model_type=\"sgd\",\n",
 "    train_dataset_id=dataset_trn_artifact_version.id,\n",
-"    test_dataset_id=dataset_tst_artifact_version.id\n",
+"    test_dataset_id=dataset_tst_artifact_version.id,\n",
 ")\n",
 "\n",
 "sgd_run = client.get_pipeline(\"training\").last_run"
@@ -521,7 +522,9 @@
 "outputs": [],
 "source": [
 "# The evaluator returns a float value with the accuracy\n",
-"xgboost_run.steps[\"model_evaluator\"].output.load() >= sgd_run.steps[\"model_evaluator\"].output.load()"
+"xgboost_run.steps[\"model_evaluator\"].output.load() >= sgd_run.steps[\n",
+"    \"model_evaluator\"\n",
+"].output.load()"
 ]
 },
 {
@@ -579,7 +582,7 @@
 "training_configured(\n",
 "    model_type=\"sgd\",\n",
 "    train_dataset_id=dataset_trn_artifact_version.id,\n",
-"    test_dataset_id=dataset_tst_artifact_version.id\n",
+"    test_dataset_id=dataset_tst_artifact_version.id,\n",
 ")"
 ]
 },
@@ -601,7 +604,7 @@
 "training_configured(\n",
 "    model_type=\"xgboost\",\n",
 "    train_dataset_id=dataset_trn_artifact_version.id,\n",
-"    test_dataset_id=dataset_tst_artifact_version.id\n",
+"    test_dataset_id=dataset_tst_artifact_version.id,\n",
 ")"
 ]
 },
@@ -650,10 +653,14 @@
 "outputs": [],
 "source": [
 "# Let's load the XGBoost version\n",
-"xgboost_zenml_model_version = client.list_model_versions(\"breast_cancer_classifier\", tag=\"xgboost\")[-1]\n",
+"xgboost_zenml_model_version = client.list_model_versions(\n",
+"    \"breast_cancer_classifier\", tag=\"xgboost\"\n",
+")[-1]\n",
 "\n",
 "# We can now load our classifier directly as well\n",
-"xgboost_classifier = xgboost_zenml_model_version.get_artifact(\"breast_cancer_classifier\").load()\n",
+"xgboost_classifier = xgboost_zenml_model_version.get_artifact(\n",
+"    \"breast_cancer_classifier\"\n",
+").load()\n",
 "\n",
 "xgboost_classifier"
 ]
@@ -750,7 +757,9 @@
 "outputs": [],
 "source": [
 "@step\n",
-"def inference_predict(dataset_inf: pd.DataFrame) -> Annotated[pd.Series, \"predictions\"]:\n",
+"def inference_predict(\n",
+"    dataset_inf: pd.DataFrame,\n",
+") -> Annotated[pd.Series, \"predictions\"]:\n",
 "    \"\"\"Predictions step\"\"\"\n",
 "    # Get the model\n",
 "    model = get_step_context().model\n",
@@ -761,7 +770,7 @@
 "\n",
 "    predictions = pd.Series(predictions, name=\"predicted\")\n",
 "\n",
-"    return predictions\n"
+"    return predictions"
 ]
 },
 {
@@ -788,18 +797,18 @@
 "    random_state = 42\n",
 "    target = \"target\"\n",
 "\n",
-"    df_inference = data_loader(\n",
-"        random_state=random_state, is_inference=True\n",
-"    )\n",
+"    df_inference = data_loader(random_state=random_state, is_inference=True)\n",
 "    df_inference = inference_preprocessor(\n",
 "        dataset_inf=df_inference,\n",
 "        # We use the preprocess pipeline from the feature engineering pipeline\n",
-"        preprocess_pipeline=client.get_artifact_version(name_id_or_prefix=preprocess_pipeline_id),\n",
+"        preprocess_pipeline=client.get_artifact_version(\n",
+"            name_id_or_prefix=preprocess_pipeline_id\n",
+"        ),\n",
 "        target=target,\n",
 "    )\n",
 "    inference_predict(\n",
 "        dataset_inf=df_inference,\n",
-"    )\n"
+"    )"
 ]
 },
 {
@@ -823,7 +832,7 @@
 "# Lets add some metadata to the model to make it identifiable\n",
 "pipeline_settings[\"model\"] = Model(\n",
 "    name=\"breast_cancer_classifier\",\n",
-"    version=\"production\", # We can pass in the stage name here!\n",
+"    version=\"production\",  # We can pass in the stage name here!\n",
 ")"
 ]
 },
@@ -863,7 +872,9 @@
 "outputs": [],
 "source": [
 "# Fetch production model\n",
-"production_model_version = client.get_model_version(\"breast_cancer_classifier\", \"production\")\n",
+"production_model_version = client.get_model_version(\n",
+"    \"breast_cancer_classifier\", \"production\"\n",
+")\n",
 "\n",
 "# Get the predictions artifact\n",
 "production_model_version.get_artifact(\"predictions\").load()"
@@ -895,6 +906,7 @@
 "outputs": [],
 "source": [
 "from zenml.client import Client\n",
+"\n",
 "client = Client()"
 ]
 },
@@ -905,8 +917,12 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"sgd_model_version = client.list_model_versions(\"breast_cancer_classifier\",tag=\"sgd\")[-1]\n",
-"xgboost_model_version = client.list_model_versions(\"breast_cancer_classifier\",tag=\"xgboost\")[-1]\n",
+"sgd_model_version = client.list_model_versions(\n",
+"    \"breast_cancer_classifier\", tag=\"sgd\"\n",
+")[-1]\n",
+"xgboost_model_version = client.list_model_versions(\n",
+"    \"breast_cancer_classifier\", tag=\"xgboost\"\n",
+")[-1]\n",
 "print(f\"SGD version is staged as `{sgd_model_version.stage}`\")\n",
 "print(f\"XGBoost version is staged as `{xgboost_model_version.stage}`\")"
 ]
@@ -975,10 +991,18 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"sgd_clf_metadata = sgd_model_version.get_artifact(\"breast_cancer_classifier\").run_metadata\n",
-"xgboost_clf_metadata = xgboost_model_version.get_artifact(\"breast_cancer_classifier\").run_metadata\n",
-"print(f\"SGD{' (production)' if sgd_model_version.stage == 'production' else ''} metrics: train={sgd_clf_metadata['train_accuracy'].value*100:.2f}% test={sgd_clf_metadata['test_accuracy'].value*100:.2f}%\")\n",
-"print(f\"XGBoost{' (production)' if xgboost_model_version.stage == 'production' else ''} metrics: train={xgboost_clf_metadata['train_accuracy'].value*100:.2f}% test={xgboost_clf_metadata['test_accuracy'].value*100:.2f}%\")"
+"sgd_clf_metadata = sgd_model_version.get_artifact(\n",
+"    \"breast_cancer_classifier\"\n",
+").run_metadata\n",
+"xgboost_clf_metadata = xgboost_model_version.get_artifact(\n",
+"    \"breast_cancer_classifier\"\n",
+").run_metadata\n",
+"print(\n",
+"    f\"SGD{' (production)' if sgd_model_version.stage == 'production' else ''} metrics: train={sgd_clf_metadata['train_accuracy'].value*100:.2f}% test={sgd_clf_metadata['test_accuracy'].value*100:.2f}%\"\n",
+")\n",
+"print(\n",
+"    f\"XGBoost{' (production)' if xgboost_model_version.stage == 'production' else ''} metrics: train={xgboost_clf_metadata['train_accuracy'].value*100:.2f}% test={xgboost_clf_metadata['test_accuracy'].value*100:.2f}%\"\n",
+")"
 ]
 },
 {
@@ -996,21 +1020,27 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"import seaborn as sns\n",
-"import numpy as np\n",
 "import matplotlib.pyplot as plt\n",
+"import numpy as np\n",
+"import seaborn as sns\n",
 "\n",
-"def plot_confusion_matrix(metadata_pointer, tp: str,ax):\n",
-"    confusion_matrix = np.array(metadata_pointer[\"confusion_matrix\"].value, dtype=float).reshape((2,2))\n",
+"\n",
+"def plot_confusion_matrix(metadata_pointer, tp: str, ax):\n",
+"    confusion_matrix = np.array(\n",
+"        metadata_pointer[\"confusion_matrix\"].value, dtype=float\n",
+"    ).reshape((2, 2))\n",
 "    confusion_matrix /= np.sum(confusion_matrix)\n",
-"    sns.heatmap(confusion_matrix, annot=True,fmt='.2%',cmap=\"coolwarm\",ax=ax)\n",
+"    sns.heatmap(\n",
+"        confusion_matrix, annot=True, fmt=\".2%\", cmap=\"coolwarm\", ax=ax\n",
+"    )\n",
 "    ax.set_title(f\"{tp} confusion matrix\")\n",
 "    ax.set_ylabel(\"Ground Label\")\n",
 "    ax.set_xlabel(\"Predicted Label\")\n",
 "\n",
-"fig, ax = plt.subplots(1,2,figsize=(15,4))\n",
-"plot_confusion_matrix(sgd_clf_metadata, \"SGD\",ax[0])\n",
-"plot_confusion_matrix(xgboost_clf_metadata, \"RF\",ax[1])"
+"\n",
+"fig, ax = plt.subplots(1, 2, figsize=(15, 4))\n",
+"plot_confusion_matrix(sgd_clf_metadata, \"SGD\", ax[0])\n",
+"plot_confusion_matrix(xgboost_clf_metadata, \"RF\", ax[1])"
 ]
 },
 {
@@ -1052,7 +1082,7 @@
 "for artifact_name, versions in sgd_model_version.data_artifacts.items():\n",
 "    if versions:\n",
 "        print(f\"Existing version of `{artifact_name}`:\")\n",
-"    for version_name, artifact_ in versions.items():\n",
+"        for version_name, artifact_ in versions.items():\n",
 "            print(version_name, artifact_.data_type.attribute)"
 ]
 },
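The metrics cells in this notebook lean on ZenML's run metadata: get_artifact(...) returns an artifact version whose run_metadata maps names to metadata objects, and .value holds the raw number. A minimal standalone sketch of the same access pattern, assuming the model and metadata keys from the notebook exist on your ZenML server:

    from zenml.client import Client

    client = Client()
    # Same calls as the notebook's metrics cell, reduced to a single metric.
    sgd_version = client.list_model_versions(
        "breast_cancer_classifier", tag="sgd"
    )[-1]
    metadata = sgd_version.get_artifact("breast_cancer_classifier").run_metadata
    print(f"test accuracy: {metadata['test_accuracy'].value * 100:.2f}%")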
