Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/components/custom_registry_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,6 @@
env_prefix="[PROD]"
)
"""
* WHAT: Inspect boto3.upload_file usage
* WHY: Durable storage pattern in AWS
* Migration Planning: Replace with GCS client logic inside a custom registry component.
26 changes: 2 additions & 24 deletions src/components/custom_training_components.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,2 @@
"""
Custom Training Components
=========================
Custom-built components for model training.

Author: MLOps Team
Maintained by: ML Engineering Team

Components:
- train_model_op: Train logistic regression model from BigQuery data

Usage:
from components.custom_training_components import train_model_op

train_task = train_model_op(
train_data=bq_train_task.outputs["destination_table"],
feature_columns=['Age', 'BMI', 'PlasmaGlucose'], # <-- Passed as parameter
target_column='Diabetic',
reg_rate=0.05,
project_id=project_id,
bq_location=region,
env_prefix="[DEV]"
)
"""
#* WHAT: Identify boto3/joblib/ds_utils imports
#* WHY: These libraries enable artifact persistence in S3
39 changes: 39 additions & 0 deletions src/components/model_input_ingest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#* WHAT: Inspect select_sql_from_dict or pd.read_sql usage
#* WHY: Redshift or S3 → DataFrame conversion
#* Migration Planning: Equivalent logic would move into
# prebuilt_bigquery_components.py using BigQuery query components.


# Imports

# Component Decorators
# Consider bigquery_query_job_op ((Google Managed Prebuilt Component that does not require the @component decorator))
@component(
    base_image="python:<placeholder>",
    packages_to_install=["placeholder for packages"]
)
def _read_from_redshift(sql_client, sql: str, params: dict = None, chunksize: Optional[int] = None) -> pd.DataFrame:
    """
    Exploration helper to read data from Redshift using available SQL access object.

    WHERE: _read_from_redshift ((Placeholder for reading from BigQuery))
    WHAT: example patterns using sql_client.select_sql_from_dict or pandas.read_sql
    WHY: Redshift is columnar and can be expensive to pull; record trade-offs and auth considerations

    Args:
        sql_client: Project SQL access object. Either exposes a
            ``select_sql_from_dict(dict)`` helper, or carries a raw DBAPI
            connection on ``.conn`` usable by ``pandas.read_sql``.
        sql: SQL query text to execute.
        params: Optional bind parameters forwarded to the query.
        chunksize: Optional rows-per-fetch. Previously declared but ignored;
            now honored on the ``pandas.read_sql`` path. Default ``None``
            preserves the original single-shot read behavior.

    Returns:
        pd.DataFrame with the query results, or an empty DataFrame on any
        read failure (deliberate best-effort lab fallback — see except block).
    """
    try:
        if hasattr(sql_client, "select_sql_from_dict"):
            # Project helper takes a dict spec; it has no chunked-read mode.
            q = {"sql": sql, "params": params or {}}
            df = sql_client.select_sql_from_dict(q)
        elif chunksize:
            # BUG FIX: `chunksize` was accepted but never used. read_sql with
            # chunksize yields an iterator of frames, so concatenate to keep
            # the declared DataFrame return type. ignore_index matches the
            # default RangeIndex a single-shot read would produce.
            chunks = pd.read_sql(sql, sql_client.conn, params=params, chunksize=chunksize)
            df = pd.concat(chunks, ignore_index=True)
        else:
            df = pd.read_sql(sql, sql_client.conn, params=params)
    except Exception:
        # Broad catch is deliberate: this is an exploration/lab helper that
        # must degrade to an empty frame rather than fail the pipeline run.
        LOG.exception("Redshift read failed; returning empty DataFrame for lab fallback")
        df = pd.DataFrame()
    return df

# Intermediate solution. Considering Prebuilt or Custom components
#* WHERE: stage_table_to_s3() in ingest_model.py
#* WHAT: Inspect UNLOAD vs client-side upload patterns
#* WHY: Efficiency vs cost trade-offs in Redshift
#* Migration Planning: Replace with BigQuery export jobs inside
# prebuilt_bigquery_components.py or custom_data_quality_components.py.
Loading