Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added .gitconfig
Empty file.
10 changes: 10 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,16 @@ services:
restart: always
networks:
- app-network
# MLflow tracking server, built from mlflow.dockerfile in this repo.
mlflow:
build:
context: .
dockerfile: mlflow.dockerfile
ports:
# Tracking UI/API on host port 5000 (matches EXPOSE in mlflow.dockerfile).
- "5000:5000"
volumes:
# Persist the server's backend store: ./mlflow on the host maps to
# /home/mlflow/ in the container, where the SQLite DB lives.
- "${PWD}/mlflow:/home/mlflow/"
networks:
- app-network
networks:
app-network:
driver: bridge
Binary file added finalized_model.lib
Binary file not shown.
12 changes: 12 additions & 0 deletions mlflow.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Minimal image that runs an MLflow tracking server.
FROM python:3.10-slim

# Pin the MLflow version for reproducible builds.
RUN pip install mlflow==2.12.1

# Tracking UI/API port (published by docker-compose as "5000:5000").
EXPOSE 5000

# Start the server with a SQLite backend store under /home/mlflow/, which
# docker-compose bind-mounts from ./mlflow on the host so runs persist
# across container restarts. Bind to 0.0.0.0 so the port mapping works.
CMD [ \
"mlflow", "server", \
"--backend-store-uri", "sqlite:///home/mlflow/mlflow.db", \
"--host", "0.0.0.0", \
"--port", "5000" \
]
Binary file added mlflow_artifacts/dv_artifact.pkl
Binary file not shown.
14 changes: 14 additions & 0 deletions mlops/homework_03/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
.DS_Store
.file_versions
.gitkeep
.log
.logs/
.mage_temp_profiles
.preferences.yaml
.variables/
__pycache__/
docker-compose.override.yml
logs/
mage-ai.db
mage_data/
secrets/
Empty file added mlops/homework_03/__init__.py
Empty file.
Empty file.
Empty file.
Empty file.
67 changes: 67 additions & 0 deletions mlops/homework_03/data_exporters/build.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import os

import joblib
import mlflow
import mlflow.sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

if 'data_exporter' not in globals():
    from mage_ai.data_preparation.decorators import data_exporter

# Experiment / tracking configuration, surfaced as constants so they are easy
# to find and override. They are applied inside the exporter rather than at
# module level: previously `mlflow.set_experiment` ran as an import-time side
# effect, touching the tracking backend whenever this module was merely loaded.
EXPERIMENT_NAME = 'mage_lr_experiment'
# NOTE(review): when the docker-compose mlflow service is up, this should
# point at it (e.g. 'http://mlflow:5000' from inside the network, or
# 'http://localhost:5000' from the host) — TODO confirm. `None` keeps the
# original behavior (local default tracking URI).
TRACKING_URI = None


@data_exporter
def export_data(data, *args, **kwargs):
    """
    Train a linear regression on pickup/drop-off location features and log
    the model plus the fitted DictVectorizer to MLflow.

    Args:
        data: The output from the upstream parent block — expected to have
            'PULocationID', 'DOLocationID' and 'duration' columns
            (presumably the cleaned trip data; verify against the upstream
            block).
        args: The output from any additional upstream blocks (if applicable).

    Returns:
        Tuple of (fitted LinearRegression model, fitted DictVectorizer).
    """
    if TRACKING_URI:
        mlflow.set_tracking_uri(TRACKING_URI)
    mlflow.set_experiment(EXPERIMENT_NAME)

    df_train = data

    # One-hot encode the two categorical location columns.
    dv = DictVectorizer()
    train_dicts = df_train[['PULocationID', 'DOLocationID']].to_dict(orient='records')

    # Feature matrix
    X_train = dv.fit_transform(train_dicts)

    # Target vector
    target = 'duration'
    y_train = df_train[target].values

    # Fit the model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Persist the DictVectorizer locally so it can be logged as an artifact.
    artifact_directory = 'mlflow_artifacts'
    os.makedirs(artifact_directory, exist_ok=True)  # create if it doesn't exist
    artifact_path = os.path.join(artifact_directory, "dv_artifact.pkl")
    with open(artifact_path, 'wb') as f:
        joblib.dump(dv, f)

    # Log the model, its intercept, and the vectorizer in a single run.
    with mlflow.start_run():
        mlflow.sklearn.log_model(model, "linear_regression_model")
        mlflow.log_param("intercept", model.intercept_)
        mlflow.log_artifact(artifact_path)

    return model, dv
16 changes: 16 additions & 0 deletions mlops/homework_03/data_exporters/export_titanic_clean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from mage_ai.io.file import FileIO
from pandas import DataFrame

if 'data_exporter' not in globals():
    from mage_ai.data_preparation.decorators import data_exporter


@data_exporter
def export_data_to_file(df: DataFrame, **kwargs) -> None:
    """
    Write the upstream DataFrame to a local CSV file via Mage's FileIO.

    Docs: https://docs.mage.ai/design/data-loading#example-loading-data-from-a-file
    """
    destination = 'titanic_clean.csv'
    FileIO().export(df, destination)
Empty file.
28 changes: 28 additions & 0 deletions mlops/homework_03/data_loaders/ingest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import requests
from io import BytesIO
from typing import List

import pandas as pd


if 'data_loader' not in globals():
    from mage_ai.data_preparation.decorators import data_loader

# March 2023 NYC yellow-taxi trip records (parquet) from the TLC CDN.
DATA_URL = (
    "https://d37ci6vzurychx.cloudfront.net/trip-data/"
    "yellow_tripdata_2023-03.parquet"
)


@data_loader
def load_data(*args, **kwargs):
    """
    Download the March 2023 yellow-taxi parquet file and parse it.

    Returns:
        pandas.DataFrame with the raw trip records.

    Raises:
        Exception: if the HTTP response status is not 200.
        requests.exceptions.Timeout: if the server stalls past the timeout.
    """
    # (connect, read) timeout so the pipeline cannot hang forever on a
    # stalled connection — the original call had no timeout at all.
    response = requests.get(DATA_URL, timeout=(10, 300))

    if response.status_code != 200:
        raise Exception(response.text)

    df = pd.read_parquet(BytesIO(response.content))

    return df
27 changes: 27 additions & 0 deletions mlops/homework_03/data_loaders/load_titanic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import io
import pandas as pd
import requests
from pandas import DataFrame

if 'data_loader' not in globals():
    from mage_ai.data_preparation.decorators import data_loader
if 'test' not in globals():
    from mage_ai.data_preparation.decorators import test


@data_loader
def load_data_from_api(**kwargs) -> DataFrame:
    """Load the Titanic passenger dataset from its public CSV endpoint."""
    url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv?raw=True'
    df = pd.read_csv(url)
    return df


@test
def test_output(df) -> None:
    """Sanity-check the loader's output: it must return something."""
    assert df is not None, 'The output is undefined'
9 changes: 9 additions & 0 deletions mlops/homework_03/dbt/profiles.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# https://docs.getdbt.com/docs/core/connect-data-platform/profiles.yml

base:
outputs:

dev:
type: duckdb

target: dev
Empty file.
Empty file.
134 changes: 134 additions & 0 deletions mlops/homework_03/io_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
version: 0.1.1
default:
# Default profile created for data IO access.
# Add your credentials for the source you use, and delete the rest.
# AWS
AWS_ACCESS_KEY_ID: "{{ env_var('AWS_ACCESS_KEY_ID') }}"
AWS_SECRET_ACCESS_KEY: "{{ env_var('AWS_SECRET_ACCESS_KEY') }}"
AWS_SESSION_TOKEN: session_token (Used to generate Redshift credentials)
AWS_REGION: region
# Algolia
ALGOLIA_APP_ID: app_id
ALGOLIA_API_KEY: api_key
ALGOLIA_INDEX_NAME: index_name
# Azure
AZURE_CLIENT_ID: "{{ env_var('AZURE_CLIENT_ID') }}"
AZURE_CLIENT_SECRET: "{{ env_var('AZURE_CLIENT_SECRET') }}"
AZURE_STORAGE_ACCOUNT_NAME: "{{ env_var('AZURE_STORAGE_ACCOUNT_NAME') }}"
AZURE_TENANT_ID: "{{ env_var('AZURE_TENANT_ID') }}"
# Chroma
CHROMA_COLLECTION: collection_name
CHROMA_PATH: path
# Clickhouse
CLICKHOUSE_DATABASE: default
CLICKHOUSE_HOST: host.docker.internal
CLICKHOUSE_INTERFACE: http
CLICKHOUSE_PASSWORD: null
CLICKHOUSE_PORT: 8123
CLICKHOUSE_USERNAME: null
# Druid
DRUID_HOST: hostname
DRUID_PASSWORD: password
DRUID_PATH: /druid/v2/sql/
DRUID_PORT: 8082
DRUID_SCHEME: http
DRUID_USER: user
# DuckDB
DUCKDB_DATABASE: database
DUCKDB_SCHEMA: main
# Google
GOOGLE_SERVICE_ACC_KEY:
type: service_account
project_id: project-id
private_key_id: key-id
private_key: "-----BEGIN PRIVATE KEY-----\nyour_private_key\n-----END PRIVATE KEY-----"
client_email: your_service_account_email
auth_uri: "https://accounts.google.com/o/oauth2/auth"
token_uri: "https://accounts.google.com/o/oauth2/token"
auth_provider_x509_cert_url: "https://www.googleapis.com/oauth2/v1/certs"
client_x509_cert_url: "https://www.googleapis.com/robot/v1/metadata/x509/your_service_account_email"
GOOGLE_SERVICE_ACC_KEY_FILEPATH: "/path/to/your/service/account/key.json"
GOOGLE_LOCATION: US # Optional
# MongoDB
# Specify either the connection string or the (host, password, user, port) to connect to MongoDB.
MONGODB_CONNECTION_STRING: "mongodb://{username}:{password}@{host}:{port}/"
MONGODB_HOST: host
MONGODB_PORT: 27017
MONGODB_USER: user
MONGODB_PASSWORD: password
MONGODB_DATABASE: database
MONGODB_COLLECTION: collection
# MSSQL
MSSQL_DATABASE: database
MSSQL_SCHEMA: schema
MSSQL_DRIVER: "ODBC Driver 18 for SQL Server"
MSSQL_HOST: host
MSSQL_PASSWORD: password
MSSQL_PORT: 1433
MSSQL_USER: SA
# MySQL
MYSQL_DATABASE: database
MYSQL_HOST: host
MYSQL_PASSWORD: password
MYSQL_PORT: 3306
MYSQL_USER: root
# Pinot
PINOT_HOST: hostname
PINOT_PASSWORD: password
PINOT_PATH: /query/sql
PINOT_PORT: 8000
PINOT_SCHEME: http
PINOT_USER: user
# PostgreSQL
POSTGRES_CONNECT_TIMEOUT: 10
POSTGRES_DBNAME: postgres
POSTGRES_SCHEMA: public # Optional
POSTGRES_USER: username
POSTGRES_PASSWORD: password
POSTGRES_HOST: hostname
POSTGRES_PORT: 5432
# Qdrant
QDRANT_COLLECTION: collection
QDRANT_PATH: path
# Redshift
REDSHIFT_SCHEMA: public # Optional
REDSHIFT_DBNAME: redshift_db_name
REDSHIFT_HOST: redshift_cluster_id.identifier.region.redshift.amazonaws.com
REDSHIFT_PORT: 5439
REDSHIFT_TEMP_CRED_USER: temp_username
REDSHIFT_TEMP_CRED_PASSWORD: temp_password
REDSHIFT_DBUSER: redshift_db_user
REDSHIFT_CLUSTER_ID: redshift_cluster_id
REDSHIFT_IAM_PROFILE: default
# Snowflake
SNOWFLAKE_USER: username
SNOWFLAKE_PASSWORD: password
SNOWFLAKE_ACCOUNT: account_id.region
SNOWFLAKE_DEFAULT_WH: null # Optional default warehouse
SNOWFLAKE_DEFAULT_DB: null # Optional default database
SNOWFLAKE_DEFAULT_SCHEMA: null # Optional default schema
SNOWFLAKE_PRIVATE_KEY_PASSPHRASE: null # Optional private key passphrase
SNOWFLAKE_PRIVATE_KEY_PATH: null # Optional private key path
SNOWFLAKE_ROLE: null # Optional role name
SNOWFLAKE_TIMEOUT: null # Optional timeout in seconds
# Trino
trino:
catalog: postgresql # Change this to the catalog of your choice
host: 127.0.0.1
http_headers:
X-Something: 'mage=power'
http_scheme: http
password: mage1337 # Optional
port: 8080
schema: core_data
session_properties: # Optional
acc01.optimize_locality_enabled: false
optimize_hash_generation: true
source: trino-cli # Optional
user: admin
verify: /path/to/your/ca.crt # Optional
# Weaviate
WEAVIATE_ENDPOINT: https://some-endpoint.weaviate.network
WEAVIATE_INSTANCE_API_KEY: YOUR-WEAVIATE-API-KEY
WEAVIATE_INFERENCE_API_KEY: YOUR-OPENAI-API-KEY
WEAVIATE_COLLECTION: collection_name
55 changes: 55 additions & 0 deletions mlops/homework_03/metadata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
project_type: standalone

variables_dir: ~/.mage_data
# remote_variables_dir: s3://bucket/path_prefix

variables_retention_period: '90d'

emr_config:
# You can customize the EMR cluster instance size with the two parameters
master_instance_type: 'r5.4xlarge'
slave_instance_type: 'r5.4xlarge'

# Configure security groups for EMR cluster instances.
# The default managed security groups are ElasticMapReduce-master and ElasticMapReduce-slave
# master_security_group: 'sg-xxxxxxxxxxxx'
# slave_security_group: 'sg-yyyyyyyyyyyy'

# If you want to ssh tunnel into EMR cluster, ec2_key_name must be configured.
# You can create a key pair in page https://console.aws.amazon.com/ec2#KeyPairs and download the key file.
# ec2_key_name: '[ec2_key_pair_name]'

spark_config:
# Application name
app_name: 'my spark app'
# Master URL to connect to
# e.g., spark_master: 'spark://host:port', or spark_master: 'yarn'
spark_master: 'local'
# Executor environment variables
# e.g., executor_env: {'PYTHONPATH': '/home/path'}
executor_env: {}
# Jar files to be uploaded to the cluster and added to the classpath
# e.g., spark_jars: ['/home/path/example1.jar']
spark_jars: []
# Path where Spark is installed on worker nodes
# e.g. spark_home: '/usr/lib/spark'
spark_home:
# List of key-value pairs to be set in SparkConf
# e.g., others: {'spark.executor.memory': '4g', 'spark.executor.cores': '2'}
others: {}
# Whether to create custom SparkSession via code and set in kwargs['context']
use_custom_session: false
# The variable name to set in kwargs['context'],
# e.g. kwargs['context']['spark'] = spark_session
custom_session_var_name: 'spark'

help_improve_mage: true
notification_config:
alert_on:
- trigger_failure
- trigger_passed_sla
slack_config:
webhook_url: "{{ env_var('MAGE_SLACK_WEBHOOK_URL') }}"
teams_config:
webhook_url: "{{ env_var('MAGE_TEAMS_WEBHOOK_URL') }}"
project_uuid: homework_03
Empty file.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
blocks: {}
layout: []
Loading