Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# flake8 configuration for the project (read automatically by `python -m flake8`).
[flake8]
# Allow lines up to 100 characters instead of flake8's default 79.
max-line-length = 100
# E203 (whitespace before ':') and W503 (line break before binary operator)
# conflict with Black-style formatting, so they are ignored.
extend-ignore = E203, W503
# Paths that should never be linted: VCS metadata, caches, virtualenvs,
# build output, notebooks checkpoints, and non-code artifact directories.
exclude =
    .git,
    __pycache__,
    .pytest_cache,
    .mypy_cache,
    .venv,
    venv,
    env,
    build,
    dist,
    .ipynb_checkpoints,
    screenshots,
    data,
    model
# Test files may contain long fixture/payload lines; don't enforce E501 there.
per-file-ignores =
    tests/*:E501
31 changes: 0 additions & 31 deletions .github/workflows/manual.yml

This file was deleted.

36 changes: 36 additions & 0 deletions .github/workflows/python-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
name: CI

# Run on every push and on pull requests.
on:
  push:
  pull_request:

# Without this, a PR update triggers both a `push` and a `pull_request` run
# and superseded runs keep executing; cancel stale runs of the same ref.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.10'
          cache: 'pip'
      - name: Install
        run: |
          python -m pip install --upgrade pip
          # First install pinned runtime/dev dependencies to avoid building
          # heavy wheels during the editable install step.
          pip install -r requirements.txt
          # Now install the project in editable mode so repository packages
          # (like `ml`) are importable in CI.
          pip install -e .

      - name: Debug install
        run: |
          echo "Python:" $(python --version)
          echo "pip:" $(pip --version)
          pip list --format=columns
          python -c "import ml, sys; print('ml import ok =>', ml.__file__)"
      # Lint before training: lint failures are cheap and should fail fast,
      # so don't spend time training a model first.
      - name: Lint
        run: python -m flake8 .
      - name: Train tiny model
        run: python scripts/train_tiny_model.py
      - name: Tests
        run: pytest -q
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -177,3 +177,4 @@ pyrightconfig.json
fastapi/

# End of https://www.toptal.com/developers/gitignore/api/python
model/*.pkl
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,25 @@ Working in a command line environment is recommended for ease of use with git an
# Environment Set up (pip or conda)
* Option 1: use the supplied file `environment.yml` to create a new environment with conda
* Option 2: use the supplied file `requirements.txt` to create a new environment with pip

Quick start (venv + editable install)
-----------------------------------
If you prefer a lightweight virtualenv workflow, create and activate a venv, install the project in editable mode, and run the tiny trainer used in CI:

```bash
# create & activate venv (macOS / Linux)
python -m venv .venv
source .venv/bin/activate

# install the project and dependencies in editable mode
pip install --upgrade pip
pip install -e .

# run the small training helper (writes artifacts to ./model)
python scripts/train_tiny_model.py
```

This mirrors how the CI installs the repository and makes the local `ml` package importable without modifying PYTHONPATH.

## Repositories
* Create a directory for the project and initialize git.
Expand Down
58 changes: 41 additions & 17 deletions local_api.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,49 @@
import json

import requests

# TODO: send a GET using the URL http://127.0.0.1:8000
r = None # Your code here
URL = "http://127.0.0.1:8000"


# TODO: print the status code
# print()
# TODO: print the welcome message
# print()
def safe_request(method, url, **kwargs):
    """Send an HTTP request, print the outcome, and return the response.

    Parameters
    ----------
    method : str
        HTTP verb, e.g. "GET" or "POST".
    url : str
        Target URL.
    **kwargs
        Extra keyword arguments forwarded to ``requests.request``
        (e.g. ``json=payload``).

    Returns
    -------
    requests.Response or None
        The response object on success so callers can inspect it, or
        ``None`` when the request failed (connection error, timeout, or
        any other requests-level failure).
    """
    try:
        # 5-second timeout so a hung server doesn't block the script forever.
        r = requests.request(method, url, timeout=5, **kwargs)
    except requests.exceptions.ConnectionError:
        print(f"❌ Could not connect to {url}. Is the server running?")
        return None
    except requests.exceptions.Timeout:
        print(f"⏱️ Request to {url} timed out.")
        return None
    except requests.exceptions.RequestException as e:
        # Narrowed from a bare `except Exception`: only requests-level
        # failures are expected here; programming errors should propagate.
        print(f"⚠️ Unexpected error calling {url}: {e}")
        return None

    try:
        # Try to parse as JSON
        print(f"{method} {url}:", r.status_code, r.json())
    except ValueError:
        # Fallback to raw text if not JSON (e.g. error pages, 500s)
        print(f"{method} {url}:", r.status_code, r.text)
    return r


# GET request
# Exercise the root endpoint; prints the status code and welcome message.
safe_request("GET", URL)

# First POST payload
# Example census record sent to the /data/ inference endpoint; keys use the
# dataset's hyphenated column names (matched via field aliases server-side).
payload1 = {
    "age": 52,
    "workclass": "Private",
    "fnlgt": 209642,
    "education": "Masters",
    "education-num": 14,
    "marital-status": "Married-civ-spouse",
    "occupation": "Exec-managerial",
    "relationship": "Husband",
    "race": "White",
    "sex": "Male",
    "capital-gain": 0,
    "capital-loss": 0,
    "hours-per-week": 45,
    "native-country": "United-States",
}
safe_request("POST", f"{URL}/data/", json=payload1)

data = {
# Second POST payload
payload2 = {
"age": 37,
"workclass": "Private",
"fnlgt": 178356,
Expand All @@ -28,11 +59,4 @@
"hours-per-week": 40,
"native-country": "United-States",
}

# TODO: send a POST using the data above
r = None # Your code here

# TODO: print the status code
# print()
# TODO: print the result
# print()
safe_request("POST", f"{URL}/data/", json=payload2)
143 changes: 88 additions & 55 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,74 +1,107 @@
import os

import pandas as pd
from fastapi import FastAPI
from pydantic import BaseModel, Field
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, ConfigDict, Field

from ml.data import apply_label, process_data
from ml.model import inference, load_model

# DO NOT MODIFY

# ---------- Request schema (Pydantic v2 style) ----------
class Data(BaseModel):
    """Single census record accepted by the /data/ inference endpoint.

    The census dataset uses hyphenated column names (e.g. "education-num"),
    which are not valid Python identifiers; field aliases let the JSON
    payload use the hyphenated names directly.
    """

    # allow aliases (hyphenated names) and show a full example in /docs
    model_config = ConfigDict(
        populate_by_name=True,
        json_schema_extra={
            "example": {
                "age": 37,
                "workclass": "Private",
                "fnlgt": 178356,
                "education": "HS-grad",
                "education-num": 10,
                "marital-status": "Married-civ-spouse",
                "occupation": "Prof-specialty",
                "relationship": "Husband",
                "race": "White",
                "sex": "Male",
                "capital-gain": 0,
                "capital-loss": 0,
                "hours-per-week": 40,
                "native-country": "United-States",
            }
        },
    )

    age: int
    workclass: str
    fnlgt: int
    education: str
    education_num: int = Field(alias="education-num")
    marital_status: str = Field(alias="marital-status")
    occupation: str
    relationship: str
    race: str
    sex: str
    capital_gain: int = Field(alias="capital-gain")
    capital_loss: int = Field(alias="capital-loss")
    hours_per_week: int = Field(alias="hours-per-week")
    native_country: str = Field(alias="native-country")


# ---------- Load artifacts ----------
# Resolve artifact paths relative to this file so the app works no matter
# what the current working directory is when uvicorn starts.
_PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
ENCODER_PATH = os.path.join(_PROJECT_ROOT, "model", "encoder.pkl")
MODEL_PATH = os.path.join(_PROJECT_ROOT, "model", "model.pkl")

# Fail fast at import time with an actionable message if artifacts are missing.
if not os.path.exists(ENCODER_PATH):
    raise RuntimeError(
        f"Encoder not found at {ENCODER_PATH}. Did you run train_model.py?"
    )
if not os.path.exists(MODEL_PATH):
    raise RuntimeError(f"Model not found at {MODEL_PATH}. Did you run train_model.py?")

encoder = load_model(ENCODER_PATH)
model = load_model(MODEL_PATH)

# ---------- FastAPI app ----------
app = FastAPI(title="Census Income Inference API")

@app.get("/")
async def get_root():
    """Return the welcome message for the API root."""
    return {"message": "Welcome to the Census Income Inference API"}


@app.post("/data/")
async def post_inference(data: Data):
    """Run model inference on a single census record.

    Accepts a `Data` payload (hyphenated aliases allowed), one-hot encodes
    the categorical features with the fitted encoder, and returns the
    predicted income label as ``{"result": "<=50K" | ">50K"}``.

    Raises
    ------
    HTTPException
        500 with a descriptive detail string if any step of the pipeline
        fails.
    """
    try:
        # Use aliases and normalize keys to hyphenated for the pipeline
        data_dict = data.model_dump(by_alias=True)
        df = pd.DataFrame([{k.replace("_", "-"): v for k, v in data_dict.items()}])

        cat_features = [
            "workclass",
            "education",
            "marital-status",
            "occupation",
            "relationship",
            "race",
            "sex",
            "native-country",
        ]

        X, _, _, _ = process_data(
            df,
            categorical_features=cat_features,
            label=None,
            training=False,
            encoder=encoder,
            lb=None,
        )
        preds = inference(model, X)
        return {"result": apply_label(preds)}
    except Exception as e:
        # Keep stacktrace in server logs but surface a clear client error;
        # `from e` preserves the original cause for the traceback.
        raise HTTPException(status_code=500, detail=f"Inference failed: {e}") from e
2 changes: 1 addition & 1 deletion ml/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@

"""ML package."""
5 changes: 3 additions & 2 deletions ml/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
def process_data(
X, categorical_features=[], label=None, training=True, encoder=None, lb=None
):
""" Process the data used in the machine learning pipeline.
"""Process the data used in the machine learning pipeline.

Processes the data using one hot encoding for the categorical features and a
label binarizer for the labels. This can be used in either training or
Expand Down Expand Up @@ -69,8 +69,9 @@ def process_data(
X = np.concatenate([X_continuous, X_categorical], axis=1)
return X, y, encoder, lb


def apply_label(inference):
""" Convert the binary label in a single inference sample into string output."""
"""Convert the binary label in a single inference sample into string output."""
if inference[0] == 1:
return ">50K"
elif inference[0] == 0:
Expand Down
Loading