Skip to content
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
cf6c084
Update Dockerfile
ValentineDragan Oct 16, 2025
0201f0c
Update circleci config to login to chainguard
ValentineDragan Oct 16, 2025
8ece2a5
fix typo in circleci config
ValentineDragan Oct 24, 2025
c3ef4e5
Add code to debug circleci errors
ValentineDragan Oct 24, 2025
93f579a
Debug missing chainguard token
ValentineDragan Oct 24, 2025
3853517
Debug failing oidc token swap
ValentineDragan Oct 24, 2025
9dac2cd
Update config
ValentineDragan Oct 24, 2025
8104ff1
Retry OIDC token swap with updated chainguard identity
ValentineDragan Oct 27, 2025
4ddee82
Update audience for token exchange request
ValentineDragan Oct 27, 2025
e5642e8
Simplify chainguard authentication with chainctl
ValentineDragan Oct 27, 2025
44b59ed
Specify audience cgr.dev in auth login
ValentineDragan Oct 27, 2025
82fe322
Update system packages in Dockerfile
ValentineDragan Oct 28, 2025
209a34a
Update Dockerfile packages for chainguard compatibility
ValentineDragan Oct 28, 2025
6f79179
update Dockerfile
ValentineDragan Oct 28, 2025
ae1bb4e
Revert circleci python version to 3.10.14
ValentineDragan Oct 28, 2025
a465e51
Update hardcoded model-engine image tag used in integration tests
ValentineDragan Oct 28, 2025
0eca9cb
Fix CircleCI config trying to use hardcoded model-engine image tag fo…
ValentineDragan Oct 28, 2025
fe8d764
Mount service_config_circleci.yaml in batch job pods
ValentineDragan Oct 28, 2025
b53b7c9
Fix broken helm template
ValentineDragan Oct 28, 2025
b054e67
Add missing infra config and service template config to batch job pods
ValentineDragan Oct 28, 2025
c142b27
remove redundant config for batch job pods
ValentineDragan Oct 28, 2025
f312dfe
enable SHA256 checksums for Celery S3 backend to avoid MD5 decoding i…
ValentineDragan Oct 28, 2025
f742965
Fix failing md5 monkey patch
ValentineDragan Oct 28, 2025
c8a2c66
bump sqlalchemy to 2.0.21 to address md5 FIPS compliance
ValentineDragan Oct 28, 2025
5e4fcf2
Fix black linting errors
ValentineDragan Oct 28, 2025
506b0bf
wrap Dockerfile layers between root and nonroot user
ValentineDragan Oct 28, 2025
66cbd33
Remove the federal/ directory since Dockerfile is now FIPS compliant …
ValentineDragan Oct 28, 2025
e27a32d
set celery_enable_sha256 to true in all configs for FIPS compliance
ValentineDragan Oct 28, 2025
fb479f6
make changes backwards compatible by having separate Dockerfiles
ValentineDragan Nov 2, 2025
9b11f29
formatting
ValentineDragan Nov 2, 2025
aec1fe0
Merge branch 'main' into fix/fix-vulnerabilities-in-model-engine-image
ValentineDragan Nov 2, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ jobs:
executor: ubuntu-large
steps:
- checkout
- chainguard_login
- run:
name: Build Docker Image
command: |
Expand All @@ -116,6 +117,7 @@ jobs:
- aws-cli/setup:
role-arn: ${CIRCLECI_ROLE_ARN}
aws-region: AWS_REGION
- chainguard_login
- run:
name: Build Docker Image
command: |
Expand Down Expand Up @@ -156,7 +158,11 @@ jobs:
- run:
name: Pre-load model-engine image to minikube
command: |
# Load the base image for gateway/init containers
minikube --logtostderr -v 1 image load model-engine:$CIRCLE_SHA1
# Tag and load with ECR prefix for batch job containers
docker tag model-engine:$CIRCLE_SHA1 $CIRCLECI_AWS_ACCOUNT_ID.dkr.ecr.us-west-2.amazonaws.com/model-engine:$CIRCLE_SHA1
minikube --logtostderr -v 1 image load $CIRCLECI_AWS_ACCOUNT_ID.dkr.ecr.us-west-2.amazonaws.com/model-engine:$CIRCLE_SHA1
- run:
name: Pre-load integration test images to minikube
command: |
Expand Down Expand Up @@ -209,6 +215,20 @@ executors:
resource_class: 2xlarge

commands:
chainguard_login:
description: Authenticate to Chainguard Registry via OIDC
steps:
- run:
name: Install chainctl
command: |
curl -o chainctl "https://dl.enforce.dev/chainctl/latest/chainctl_$(uname -s | tr '[:upper:]' '[:lower:]')_$(uname -m | sed 's/aarch64/arm64/')"
sudo install -o $UID -g $(id -g) -m 0755 chainctl /usr/local/bin/
- run:
name: Login to Chainguard Registry
command: |
chainctl auth login --identity-token "${CIRCLE_OIDC_TOKEN}" --identity "${CHAINGUARD_IDENTITY_ID}" --audience cgr.dev
CHAINGUARD_TOKEN=$(chainctl auth token --audience cgr.dev)
echo "${CHAINGUARD_TOKEN}" | docker login cgr.dev -u "oauth2accesstoken" --password-stdin
environment_setup:
description: Basic Environment setup
steps:
Expand Down
44 changes: 40 additions & 4 deletions charts/model-engine/templates/service_template_config_map.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1025,12 +1025,26 @@ data:
{{- toYaml . | nindent 12 }}
{{- end }}
serviceAccountName: {{ $launch_name }}
{{- if $require_aws_config }}
volumes:
{{- if $require_aws_config }}
- name: config-volume
configMap:
name: {{ $aws_config_map_name }}
{{- end }}
{{- end }}
{{- if $config_values }}
- name: service-config-volume
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Curious whether you needed to add this for a specific reason. Do you actually use batch-job-orchestration-job?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. Without this change, the integration tests that run batch jobs fail because they can't find the service configs. See the explanation below:

This is part of fixing the integration tests bug in the file below (rest_api_utils.py). Context (I debugged all this by SSH-ing into the instance running the CircleCI workflows and inspecting the kubernetes logs):

  • Some of the integration tests were running with a hardcoded model engine image tag (830c81ecba2a147022e504917c6ce18b00c2af44); see CREATE_DOCKER_IMAGE_BATCH_JOB_BUNDLE_REQUEST and CREATE_FINE_TUNE_DI_BATCH_JOB_BUNDLE_REQUEST.
  • The integration tests would spin up Kubernetes pods for some batch jobs, and those pods were created from the hardcoded model engine image. That meant new changes to the model engine server might not actually be reflected in the integration tests, so I updated rest_api_utils.py to take the image tag from the GIT_TAG environment variable instead.

Fixing this bug caused the integration tests to fail because the old hardcoded image had the service configs copied inside the container image, whereas new images need to mount them instead. To confirm this is an isolated issue, I reran these tests on another branch where the only change was the model engine image tag (no other changes, e.g. to the Dockerfile):

configMap:
name: {{ $launch_name }}-service-config
items:
- key: launch_service_config
path: service_config.yaml
- name: infra-service-config-volume
configMap:
name: {{ $launch_name }}-service-config
items:
- key: infra_service_config
path: config.yaml
{{- end }}
containers:
- name: main
image: {{ $gateway_repository }}:${GIT_TAG}
Expand Down Expand Up @@ -1077,12 +1091,18 @@ data:
cpu: 4
memory: 32Gi
ephemeral-storage: 30Gi
{{- if $require_aws_config }}
volumeMounts:
{{- if $require_aws_config }}
- name: config-volume
mountPath: /opt/.aws/config
subPath: config
{{- end }}
{{- end }}
{{- if $config_values }}
- name: service-config-volume
mountPath: /workspace/model-engine/service_configs
- name: infra-service-config-volume
mountPath: /workspace/model-engine/model_engine_server/core/configs
{{- end }}
{{- range $device := tuple "cpu" "gpu" }}
docker-image-batch-job-{{- $device }}.yaml: |-
apiVersion: batch/v1
Expand Down Expand Up @@ -1134,6 +1154,14 @@ data:
configMap:
name: {{ $aws_config_map_name }}
{{- end }}
{{- if $config_values }}
- name: service-config-volume
configMap:
name: {{ $launch_name }}-service-config
items:
- key: launch_service_config
path: service_config.yaml
{{- end }}
- name: workdir
emptyDir: {}
- name: dshm
Expand Down Expand Up @@ -1178,6 +1206,10 @@ data:
mountPath: /opt/.aws/config
subPath: config
{{- end }}
{{- if $config_values }}
- name: service-config-volume
mountPath: /workspace/model-engine/service_configs
{{- end }}
- name: workdir
mountPath: ${MOUNT_PATH}
- mountPath: /dev/shm
Expand Down Expand Up @@ -1212,6 +1244,10 @@ data:
mountPath: /opt/.aws/config
subPath: config
{{- end }}
{{- if $config_values }}
- name: service-config-volume
mountPath: /workspace/model-engine/service_configs
{{- end }}
- name: workdir
mountPath: ${MOUNT_PATH}
{{- end }}
Expand Down
2 changes: 1 addition & 1 deletion charts/model-engine/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ spellbook:
enabled: false

# celery_enable_sha256 [optional] uses SHA256 hashes for federal compliance mode (FIPS, enhanced security)
celery_enable_sha256: null
celery_enable_sha256: true

# debug_mode [optional] enables detailed debug logging for infrastructure components
debug_mode: null
Expand Down
3 changes: 2 additions & 1 deletion charts/model-engine/values_circleci.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# This is a YAML-formatted file.

celery_enable_sha256: null
celery_enable_sha256: true
celery_broker_type_redis: null
debug_mode: null

Expand Down Expand Up @@ -39,6 +39,7 @@ balloons:
# commit from which the image was built.
# tag:
context: circleci

image:
gatewayRepository: model-engine
builderRepository: model-engine
Expand Down
2 changes: 1 addition & 1 deletion charts/model-engine/values_sample.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# This is a YAML-formatted file.

# celery_enable_sha256 [optional] uses SHA256 hashes for federal compliance mode (FIPS, enhanced security)
celery_enable_sha256: null
celery_enable_sha256: true

# debug_mode [optional] enables detailed debug logging for infrastructure components
debug_mode: null
Expand Down
47 changes: 0 additions & 47 deletions federal/Dockerfile.chainguard

This file was deleted.

17 changes: 0 additions & 17 deletions federal/sitecustomize.py

This file was deleted.

6 changes: 3 additions & 3 deletions integration_tests/rest_api_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def my_model(**keyword_args):
"flavor": {
"flavor": "streaming_enhanced_runnable_image",
"repository": "model-engine",
"tag": "830c81ecba2a147022e504917c6ce18b00c2af44",
"tag": os.environ.get("GIT_TAG"),
"command": [
"dumb-init",
"--",
Expand Down Expand Up @@ -269,7 +269,7 @@ def my_model(**keyword_args):
CREATE_DOCKER_IMAGE_BATCH_JOB_BUNDLE_REQUEST: Dict[str, Any] = {
"name": format_name("di_batch_job_bundle_1"),
"image_repository": "model-engine",
"image_tag": "830c81ecba2a147022e504917c6ce18b00c2af44",
"image_tag": os.environ.get("GIT_TAG"),
"command": ["jq", ".", "/launch_mount_location/file"],
"env": {"ENV1": "VAL1"},
"mount_location": "/launch_mount_location/file",
Expand All @@ -289,7 +289,7 @@ def my_model(**keyword_args):
CREATE_FINE_TUNE_DI_BATCH_JOB_BUNDLE_REQUEST: Dict[str, Any] = {
"name": format_name("fine_tune_di_batch_job_bundle_1"),
"image_repository": "model-engine",
"image_tag": "830c81ecba2a147022e504917c6ce18b00c2af44",
"image_tag": os.environ.get("GIT_TAG"),
"command": ["cat", "/launch_mount_location/file"],
"env": {"ENV1": "VAL1"},
"mount_location": "/launch_mount_location/file",
Expand Down
59 changes: 28 additions & 31 deletions model-engine/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,42 +1,37 @@
# syntax = docker/dockerfile:experimental

FROM python:3.10.15-slim as model-engine
FROM cgr.dev/scale.com/python-fips:3.10.19-dev
WORKDIR /workspace

RUN apt-get update && apt-get install -y \
apt-utils \
dumb-init \
git \
ssh \
htop \
iftop \
vim \
curl \
procps \
libcurl4-openssl-dev \
libssl-dev \
python3-dev \
gcc \
build-essential \
telnet \
&& rm -rf /var/lib/apt/lists/*
USER root

RUN apk update && apk add \
htop \
dumb-init \
libssh \
openssh-client \
iftop \
curl \
curl-dev \
procps \
libcurl-openssl4 \
vim \
kubectl \
jq \
gcc \
glibc-dev \
python-3.10-dev \
libffi-dev \
openssl-dev \
build-base \
postgresql-dev \
libpq-16

RUN curl -Lo /bin/aws-iam-authenticator https://github.com/kubernetes-sigs/aws-iam-authenticator/releases/download/v0.5.9/aws-iam-authenticator_0.5.9_linux_amd64
RUN chmod +x /bin/aws-iam-authenticator

# Install kubectl
RUN curl -LO "https://dl.k8s.io/release/v1.23.13/bin/linux/amd64/kubectl" \
&& chmod +x kubectl \
&& mv kubectl /usr/local/bin/kubectl

# Pin pip version
RUN pip install pip==24.2
RUN chmod -R 777 /workspace

# Install AWS CLI
RUN pip install awscli==1.34.28 --no-cache-dir

## grab model_engine_server project (w/ requirements install layer caching)
WORKDIR /workspace/model-engine/
COPY model-engine/requirements-test.txt /workspace/model-engine/requirements-test.txt
COPY model-engine/requirements.txt /workspace/model-engine/requirements.txt
Expand All @@ -51,7 +46,9 @@ RUN pip install -e .
COPY integration_tests /workspace/integration_tests

WORKDIR /workspace
ENV PYTHONPATH /workspace
ENV WORKSPACE /workspace
ENV PYTHONPATH=/workspace
ENV WORKSPACE=/workspace

USER nonroot

EXPOSE 5000
5 changes: 4 additions & 1 deletion model-engine/model_engine_server/common/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,10 @@ def generate_destination(user_id: str, endpoint_name: str, endpoint_type: str) -


def _generate_deployment_name_parts(user_id: str, endpoint_name: str) -> List[str]:
user_endpoint_hash = hashlib.md5((user_id + endpoint_name).encode("utf-8")).hexdigest()
# Use MD5 for deployment name hashing (non-security purpose) - FIPS compliant
user_endpoint_hash = hashlib.new(
"md5", (user_id + endpoint_name).encode("utf-8"), usedforsecurity=False
).hexdigest()
return [
DEPLOYMENT_PREFIX,
user_id[:24],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,10 @@ class CeleryAutoscalerParams:


def _hash_any_to_int(data: Any):
return int(hashlib.md5(str(data).encode()).hexdigest(), 16) # nosemgrep
# Use MD5 for hashing (non-security purpose) - FIPS compliant with usedforsecurity=False
return int(
hashlib.new("md5", str(data).encode(), usedforsecurity=False).hexdigest(), 16
) # nosemgrep


async def list_deployments(core_api, apps_api) -> Dict[Tuple[str, str], CeleryAutoscalerParams]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,10 @@ def _get_inject_bundle_image_params(
bundle_id = model_bundle.id
service_image_str = "-".join([base_image_params.image_tag, GIT_TAG, bundle_id])
# nosemgrep
service_image_hash = hashlib.md5(str(service_image_str).encode("utf-8")).hexdigest()
# Use MD5 for image tag hashing (non-security purpose, required for Docker compatibility)
service_image_hash = hashlib.new(
"md5", str(service_image_str).encode("utf-8"), usedforsecurity=False
).hexdigest()
service_image_tag = f"inject-bundle-image-{service_image_hash}"
ecr_repo = base_image_params.repo

Expand Down Expand Up @@ -812,7 +815,12 @@ def _get_restricted_env_vars(env_vars: Dict[str, str]) -> Set[str]:
def _get_requirements_hash(requirements: List[str]) -> str:
"""Identifying hash for endpoint's Python requirements."""
# nosemgrep
return hashlib.md5("\n".join(sorted(requirements)).encode("utf-8")).hexdigest()[:6]
# Use MD5 for requirements hashing (non-security purpose)
return hashlib.new(
"md5",
"\n".join(sorted(requirements)).encode("utf-8"),
usedforsecurity=False,
).hexdigest()[:6]

@staticmethod
def _get_image_tag(base_image_tag: str, git_tag: str, requirements_hash: str) -> str:
Expand Down
Loading