diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml new file mode 100644 index 0000000000..2c95aff3b9 --- /dev/null +++ b/.github/workflows/python-app.yml @@ -0,0 +1,33 @@ +name: Python application + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.8 + uses: actions/setup-python@v2 + with: + python-version: 3.8 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest diff --git a/README.md b/README.md index 6c090c8179..9620c89ce3 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ Working in a command line environment is recommended for ease of use with git and dvc. If on Windows, WSL1 or 2 is recommended. 
+Link: https://github.com/chavelei/Deploying-a-Scalable-ML-Pipeline-with-FastAPI + # Environment Set up (pip or conda) * Option 1: use the supplied file `environment.yml` to create a new environment with conda * Option 2: use the supplied file `requirements.txt` to create a new environment with pip diff --git a/local_api.py b/local_api.py index a3bff2f988..065d19ad63 100644 --- a/local_api.py +++ b/local_api.py @@ -3,12 +3,12 @@ import requests # TODO: send a GET using the URL http://127.0.0.1:8000 -r = None # Your code here +r = requests.get("http://127.0.0.1:8000") # TODO: print the status code -# print() +print(r.status_code) # TODO: print the welcome message -# print() +print(r.text) @@ -30,9 +30,9 @@ } # TODO: send a POST using the data above -r = None # Your code here +r = requests.post("http://127.0.0.1:8000/data/", json=data) # TODO: print the status code -# print() +print(r.status_code) # TODO: print the result -# print() +print(r.text) diff --git a/main.py b/main.py index 638e2414de..ce5e806f3c 100644 --- a/main.py +++ b/main.py @@ -26,21 +26,21 @@ class Data(BaseModel): hours_per_week: int = Field(..., example=40, alias="hours-per-week") native_country: str = Field(..., example="United-States", alias="native-country") -path = None # TODO: enter the path for the saved encoder +path = "../Deploying-a-Scalable-ML-Pipeline-with-FastAPI/model/encoder.pkl" encoder = load_model(path) -path = None # TODO: enter the path for the saved model +path = "../Deploying-a-Scalable-ML-Pipeline-with-FastAPI/model/model.pkl" model = load_model(path) # TODO: create a RESTful API using FastAPI -app = None # your code here +app = FastAPI() # TODO: create a GET on the root giving a welcome message @app.get("/") async def get_root(): """ Say hello!""" # your code here - pass + return {"message": "Welcome to the ML model API!"} # TODO: create a POST on a different path that does model inference @@ -69,6 +69,11 @@ async def post_inference(data: Data): # use data as data input # 
use training = False # do not need to pass lb as input - ) - _inference = None # your code here to predict the result using data_processed + data, + categorical_features=cat_features, + encoder=encoder, + training=False, + label=None + ) + _inference = inference(model, data_processed) return {"result": apply_label(_inference)} diff --git a/ml/model.py b/ml/model.py index f361110f18..5c60bf129f 100644 --- a/ml/model.py +++ b/ml/model.py @@ -1,6 +1,8 @@ import pickle from sklearn.metrics import fbeta_score, precision_score, recall_score from ml.data import process_data +from sklearn.ensemble import RandomForestClassifier # Example model, can be replaced with any other model +import pandas as pd # TODO: add necessary import # Optional: implement hyperparameter tuning. @@ -20,8 +22,9 @@ def train_model(X_train, y_train): Trained machine learning model. """ # TODO: implement the function - pass - + model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42) + model.fit(X_train, y_train) + return model def compute_model_metrics(y, preds): """ @@ -60,7 +63,7 @@ def inference(model, X): Predictions from the model. """ # TODO: implement the function - pass + return model.predict(X) def save_model(model, path): """ Serializes model to a file. @@ -73,12 +76,14 @@ def save_model(model, path): Path to save pickle file. 
""" # TODO: implement the function - pass + with open(path, 'wb') as f: + pickle.dump(model, f) def load_model(path): """ Loads pickle file from `path` and returns it.""" # TODO: implement the function - pass + with open(path, 'rb') as f: + return pickle.load(f) def performance_on_categorical_slice( @@ -118,11 +123,18 @@ def performance_on_categorical_slice( """ # TODO: implement the function + data_slice = data[data[column_name] == slice_value] X_slice, y_slice, _, _ = process_data( # your code here # for input data, use data in column given as "column_name", with the slice_value # use training = False + data_slice, + categorical_features=categorical_features, + label=label, + training=False, + encoder=encoder, + lb=lb ) - preds = None # your code here to get prediction on X_slice using the inference function + preds = inference(model, X_slice) precision, recall, fbeta = compute_model_metrics(y_slice, preds) return precision, recall, fbeta diff --git a/model/encoder.pkl b/model/encoder.pkl new file mode 100644 index 0000000000..82f96fb9b2 Binary files /dev/null and b/model/encoder.pkl differ diff --git a/model/model.pkl b/model/model.pkl new file mode 100644 index 0000000000..31daf008a6 Binary files /dev/null and b/model/model.pkl differ diff --git a/model_card_template.md b/model_card_template.md index 0392f3b9eb..a4b82fd2e3 100644 --- a/model_card_template.md +++ b/model_card_template.md @@ -3,16 +3,40 @@ For additional information see the Model Card paper: https://arxiv.org/pdf/1810.03993.pdf ## Model Details +This classification model was trained using the 1994 Census Bureau dataset from the UCI Machine Learning Repository (https://archive.ics.uci.edu/dataset/20/census+income). 
The model is a scikit-learn RandomForestClassifier (n_estimators=100, max_depth=10, random_state=42). The goal is to predict whether an individual’s annual income exceeds $50,000 based on a set of demographic and socio-economic features, including: +* Sex +* Race +* Marital status +* Age +* Native country +* Education +* Relationship status +* Occupation +* Hours worked per week +* Work class +* Capital gain +* Capital loss ## Intended Use +Primary use: Predicting income category for individuals based on demographic and economic data. +Not intended for: Making real-world financial, hiring, or legal decisions without thorough fairness and bias evaluation. ## Training Data +Source: 1994 Census Bureau dataset (UCI Machine Learning Repository). +Size: about 32,560 records in data/census.csv; a random 75% (about 24,420 records) is used for training and the remaining 25% (8,141 records) is held out for evaluation. ## Evaluation Data ## Metrics -_Please include the metrics used and your model's performance on those metrics._ +Precision: 0.7807 | Recall: 0.5379 | F1: 0.6369 (measured on the held-out 25% test split) ## Ethical Considerations +Dataset reflects social and economic patterns from 1994, which may not represent current demographics or job markets. +Potential bias in predictions related to sensitive attributes such as race, sex, or marital status. +Misuse could perpetuate existing inequalities if deployed in sensitive decision-making contexts. ## Caveats and Recommendations +Model performance may degrade on modern census or employment datasets without retraining. +Bias analysis should be conducted before deployment. +Should not be the sole decision-making tool in critical domains such as hiring or lending. 
+ diff --git a/screenshots/continuous_integration.png b/screenshots/continuous_integration.png new file mode 100644 index 0000000000..9f5d33b634 Binary files /dev/null and b/screenshots/continuous_integration.png differ diff --git a/screenshots/local_api.png b/screenshots/local_api.png new file mode 100644 index 0000000000..e0052c2528 Binary files /dev/null and b/screenshots/local_api.png differ diff --git a/screenshots/unit_test.png b/screenshots/unit_test.png new file mode 100644 index 0000000000..99b48a2611 Binary files /dev/null and b/screenshots/unit_test.png differ diff --git a/slice_output.txt b/slice_output.txt new file mode 100644 index 0000000000..2f7cc9ce34 --- /dev/null +++ b/slice_output.txt @@ -0,0 +1,202 @@ +workclass: ?, Count: 435 +Precision: 0.6923 | Recall: 0.4286 | F1: 0.5294 +workclass: Federal-gov, Count: 237 +Precision: 0.7800 | Recall: 0.4062 | F1: 0.5342 +workclass: Local-gov, Count: 548 +Precision: 0.7664 | Recall: 0.6325 | F1: 0.6931 +workclass: Never-worked, Count: 2 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +workclass: Private, Count: 5,664 +Precision: 0.8034 | Recall: 0.5082 | F1: 0.6226 +workclass: Self-emp-inc, Count: 272 +Precision: 0.8209 | Recall: 0.7237 | F1: 0.7692 +workclass: Self-emp-not-inc, Count: 651 +Precision: 0.6331 | Recall: 0.5087 | F1: 0.5641 +workclass: State-gov, Count: 327 +Precision: 0.8049 | Recall: 0.6875 | F1: 0.7416 +workclass: Without-pay, Count: 5 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +education: 10th, Count: 236 +Precision: 1.0000 | Recall: 0.1364 | F1: 0.2400 +education: 11th, Count: 298 +Precision: 1.0000 | Recall: 0.2778 | F1: 0.4348 +education: 12th, Count: 110 +Precision: 1.0000 | Recall: 0.2857 | F1: 0.4444 +education: 1st-4th, Count: 44 +Precision: 1.0000 | Recall: 0.0000 | F1: 0.0000 +education: 5th-6th, Count: 81 +Precision: 1.0000 | Recall: 0.2500 | F1: 0.4000 +education: 7th-8th, Count: 164 +Precision: 1.0000 | Recall: 0.4286 | F1: 0.6000 +education: 9th, Count: 128 +Precision: 
1.0000 | Recall: 0.0000 | F1: 0.0000 +education: Assoc-acdm, Count: 280 +Precision: 0.9286 | Recall: 0.3133 | F1: 0.4685 +education: Assoc-voc, Count: 330 +Precision: 1.0000 | Recall: 0.2386 | F1: 0.3853 +education: Bachelors, Count: 1,327 +Precision: 0.7136 | Recall: 0.8412 | F1: 0.7722 +education: Doctorate, Count: 107 +Precision: 0.8052 | Recall: 0.8267 | F1: 0.8158 +education: HS-grad, Count: 2,594 +Precision: 0.9737 | Recall: 0.1878 | F1: 0.3149 +education: Masters, Count: 444 +Precision: 0.7669 | Recall: 0.8608 | F1: 0.8111 +education: Preschool, Count: 10 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +education: Prof-school, Count: 151 +Precision: 0.8378 | Recall: 0.8942 | F1: 0.8651 +education: Some-college, Count: 1,837 +Precision: 0.9072 | Recall: 0.2543 | F1: 0.3973 +marital-status: Divorced, Count: 1,089 +Precision: 1.0000 | Recall: 0.2593 | F1: 0.4118 +marital-status: Married-AF-spouse, Count: 5 +Precision: 1.0000 | Recall: 0.5000 | F1: 0.6667 +marital-status: Married-civ-spouse, Count: 3,730 +Precision: 0.7671 | Recall: 0.5837 | F1: 0.6630 +marital-status: Married-spouse-absent, Count: 122 +Precision: 0.6667 | Recall: 0.2857 | F1: 0.4000 +marital-status: Never-married, Count: 2,698 +Precision: 1.0000 | Recall: 0.2727 | F1: 0.4286 +marital-status: Separated, Count: 235 +Precision: 1.0000 | Recall: 0.2857 | F1: 0.4444 +marital-status: Widowed, Count: 262 +Precision: 1.0000 | Recall: 0.3571 | F1: 0.5263 +occupation: ?, Count: 437 +Precision: 0.6923 | Recall: 0.4286 | F1: 0.5294 +occupation: Adm-clerical, Count: 933 +Precision: 0.7792 | Recall: 0.4286 | F1: 0.5530 +occupation: Armed-Forces, Count: 2 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +occupation: Craft-repair, Count: 1,040 +Precision: 0.8088 | Recall: 0.2455 | F1: 0.3767 +occupation: Exec-managerial, Count: 1,029 +Precision: 0.8138 | Recall: 0.6632 | F1: 0.7308 +occupation: Farming-fishing, Count: 245 +Precision: 0.5789 | Recall: 0.4400 | F1: 0.5000 +occupation: Handlers-cleaners, Count: 
333 +Precision: 1.0000 | Recall: 0.3200 | F1: 0.4848 +occupation: Machine-op-inspct, Count: 513 +Precision: 0.7222 | Recall: 0.2031 | F1: 0.3171 +occupation: Other-service, Count: 817 +Precision: 1.0000 | Recall: 0.0000 | F1: 0.0000 +occupation: Priv-house-serv, Count: 33 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +occupation: Prof-specialty, Count: 1,044 +Precision: 0.7742 | Recall: 0.7742 | F1: 0.7742 +occupation: Protective-serv, Count: 179 +Precision: 0.7143 | Recall: 0.3448 | F1: 0.4651 +occupation: Sales, Count: 885 +Precision: 0.7429 | Recall: 0.5556 | F1: 0.6357 +occupation: Tech-support, Count: 244 +Precision: 0.8421 | Recall: 0.4444 | F1: 0.5818 +occupation: Transport-moving, Count: 407 +Precision: 0.7727 | Recall: 0.2125 | F1: 0.3333 +relationship: Husband, Count: 3,281 +Precision: 0.7648 | Recall: 0.5909 | F1: 0.6667 +relationship: Not-in-family, Count: 2,084 +Precision: 1.0000 | Recall: 0.2778 | F1: 0.4348 +relationship: Other-relative, Count: 251 +Precision: 1.0000 | Recall: 0.1111 | F1: 0.2000 +relationship: Own-child, Count: 1,228 +Precision: 1.0000 | Recall: 0.3333 | F1: 0.5000 +relationship: Unmarried, Count: 895 +Precision: 0.9375 | Recall: 0.2778 | F1: 0.4286 +relationship: Wife, Count: 402 +Precision: 0.7836 | Recall: 0.5469 | F1: 0.6442 +race: Amer-Indian-Eskimo, Count: 79 +Precision: 0.8333 | Recall: 0.5000 | F1: 0.6250 +race: Asian-Pac-Islander, Count: 269 +Precision: 0.6207 | Recall: 0.6545 | F1: 0.6372 +race: Black, Count: 784 +Precision: 0.7347 | Recall: 0.3871 | F1: 0.5070 +race: Other, Count: 78 +Precision: 0.5000 | Recall: 0.3750 | F1: 0.4286 +race: White, Count: 6,931 +Precision: 0.7913 | Recall: 0.5431 | F1: 0.6441 +sex: Female, Count: 2,752 +Precision: 0.8293 | Recall: 0.4518 | F1: 0.5849 +sex: Male, Count: 5,389 +Precision: 0.7739 | Recall: 0.5537 | F1: 0.6456 +native-country: ?, Count: 138 +Precision: 0.6774 | Recall: 0.6774 | F1: 0.6774 +native-country: Cambodia, Count: 1 +Precision: 1.0000 | Recall: 0.0000 | F1: 0.0000 
+native-country: Canada, Count: 27 +Precision: 0.7500 | Recall: 0.6000 | F1: 0.6667 +native-country: China, Count: 25 +Precision: 0.6667 | Recall: 1.0000 | F1: 0.8000 +native-country: Columbia, Count: 16 +Precision: 0.0000 | Recall: 1.0000 | F1: 0.0000 +native-country: Cuba, Count: 18 +Precision: 1.0000 | Recall: 0.7143 | F1: 0.8333 +native-country: Dominican-Republic, Count: 16 +Precision: 1.0000 | Recall: 0.0000 | F1: 0.0000 +native-country: Ecuador, Count: 7 +Precision: 1.0000 | Recall: 0.0000 | F1: 0.0000 +native-country: El-Salvador, Count: 35 +Precision: 1.0000 | Recall: 0.6667 | F1: 0.8000 +native-country: England, Count: 22 +Precision: 0.7500 | Recall: 0.6667 | F1: 0.7059 +native-country: France, Count: 7 +Precision: 0.6667 | Recall: 0.5000 | F1: 0.5714 +native-country: Germany, Count: 33 +Precision: 0.8750 | Recall: 0.5833 | F1: 0.7000 +native-country: Greece, Count: 7 +Precision: 0.0000 | Recall: 0.0000 | F1: 0.0000 +native-country: Guatemala, Count: 23 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +native-country: Haiti, Count: 14 +Precision: 1.0000 | Recall: 0.0000 | F1: 0.0000 +native-country: Honduras, Count: 5 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +native-country: Hong, Count: 4 +Precision: 1.0000 | Recall: 0.5000 | F1: 0.6667 +native-country: Hungary, Count: 3 +Precision: 1.0000 | Recall: 0.5000 | F1: 0.6667 +native-country: India, Count: 25 +Precision: 0.5556 | Recall: 0.5556 | F1: 0.5556 +native-country: Iran, Count: 13 +Precision: 0.6667 | Recall: 0.8000 | F1: 0.7273 +native-country: Ireland, Count: 2 +Precision: 1.0000 | Recall: 0.0000 | F1: 0.0000 +native-country: Italy, Count: 18 +Precision: 0.6000 | Recall: 0.4286 | F1: 0.5000 +native-country: Jamaica, Count: 24 +Precision: 0.0000 | Recall: 0.0000 | F1: 0.0000 +native-country: Japan, Count: 17 +Precision: 1.0000 | Recall: 0.4000 | F1: 0.5714 +native-country: Laos, Count: 8 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +native-country: Mexico, Count: 161 +Precision: 1.0000 | 
Recall: 0.4286 | F1: 0.6000 +native-country: Nicaragua, Count: 11 +Precision: 1.0000 | Recall: 0.0000 | F1: 0.0000 +native-country: Outlying-US(Guam-USVI-etc), Count: 4 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +native-country: Peru, Count: 8 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +native-country: Philippines, Count: 50 +Precision: 0.7500 | Recall: 0.6000 | F1: 0.6667 +native-country: Poland, Count: 14 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +native-country: Portugal, Count: 11 +Precision: 1.0000 | Recall: 0.0000 | F1: 0.0000 +native-country: Puerto-Rico, Count: 23 +Precision: 0.5000 | Recall: 0.5000 | F1: 0.5000 +native-country: Scotland, Count: 5 +Precision: 1.0000 | Recall: 0.5000 | F1: 0.6667 +native-country: South, Count: 22 +Precision: 0.1667 | Recall: 1.0000 | F1: 0.2857 +native-country: Taiwan, Count: 9 +Precision: 0.5000 | Recall: 1.0000 | F1: 0.6667 +native-country: Thailand, Count: 6 +Precision: 1.0000 | Recall: 0.5000 | F1: 0.6667 +native-country: Trinadad&Tobago, Count: 6 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +native-country: United-States, Count: 7,280 +Precision: 0.7899 | Recall: 0.5343 | F1: 0.6374 +native-country: Vietnam, Count: 19 +Precision: 0.0000 | Recall: 0.0000 | F1: 0.0000 +native-country: Yugoslavia, Count: 4 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 diff --git a/test_ml.py b/test_ml.py index 5f8306f14c..3f09f8a91a 100644 --- a/test_ml.py +++ b/test_ml.py @@ -1,28 +1,30 @@ +import os +import pandas as pd import pytest -# TODO: add necessary import -# TODO: implement the first test. 
Change the function name and input as needed -def test_one(): +# Load dataset once for all tests +PROJECT_PATH = "../Deploying-a-Scalable-ML-Pipeline-with-FastAPI" +DATA_PATH = os.path.join(PROJECT_PATH, "data", "census.csv") +data = pd.read_csv(DATA_PATH) +MODEL_PATH = os.path.join(PROJECT_PATH, "model", "model.pkl") +ENCODER_PATH = os.path.join(PROJECT_PATH, "model", "encoder.pkl") + +def test_pickles_exist(): """ - # add description for the first test + Test that encoder.pkl and model.pkl files were created. """ - # Your code here - pass - + assert os.path.exists(MODEL_PATH), "model.pkl does not exist" + assert os.path.exists(ENCODER_PATH), "encoder.pkl does not exist" -# TODO: implement the second test. Change the function name and input as needed -def test_two(): +def test_age_range(): """ - # add description for the second test + Test that all ages are between 0 and 100. """ - # Your code here - pass + assert data["age"].between(0, 100).all() - -# TODO: implement the third test. Change the function name and input as needed -def test_three(): +def test_hours_per_week_range(): """ - # add description for the third test + Test that hours worked per week are between 1 and 99. """ - # Your code here - pass + assert data["hours-per-week"].between(1, 99).all() + diff --git a/train_model.py b/train_model.py index ae783ed5b9..db7198b5c0 100644 --- a/train_model.py +++ b/train_model.py @@ -13,14 +13,14 @@ train_model, ) # TODO: load the cencus.csv data -project_path = "Your path here" +project_path = "../Deploying-a-Scalable-ML-Pipeline-with-FastAPI" data_path = os.path.join(project_path, "data", "census.csv") print(data_path) -data = None # your code here +data = pd.read_csv(data_path) # TODO: split the provided data to have a train dataset and a test dataset # Optional enhancement, use K-fold cross validation instead of a train-test split. 
-train, test = None, None# Your code here +train, test = train_test_split(data, test_size=0.25) # DO NOT MODIFY cat_features = [ @@ -40,6 +40,10 @@ # use the train dataset # use training=True # do not need to pass encoder and lb as input + train, + categorical_features=cat_features, + label="salary", + training=True ) X_test, y_test, _, _ = process_data( @@ -52,7 +56,7 @@ ) # TODO: use the train_model function to train the model on the training dataset -model = None # your code here +model = train_model(X_train, y_train) # save the model and the encoder model_path = os.path.join(project_path, "model", "model.pkl") @@ -66,7 +70,7 @@ ) # TODO: use the inference function to run the model inferences on the test dataset. -preds = None # your code here +preds = inference(model, X_test) # Calculate and print the metrics p, r, fb = compute_model_metrics(y_test, preds) @@ -81,6 +85,14 @@ p, r, fb = performance_on_categorical_slice( # your code here # use test, col and slicevalue as part of the input + test, + column_name=col, + slice_value=slicevalue, + categorical_features=cat_features, + label="salary", + encoder=encoder, + lb=lb, + model=model ) with open("slice_output.txt", "a") as f: print(f"{col}: {slicevalue}, Count: {count:,}", file=f)