# Smoke Tests for SageMaker #1
# NOTE: This file contains hidden or bidirectional Unicode text that may be
# interpreted or compiled differently than what appears below. To review, open
# the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters.
# Workflow identity and trigger: this workflow only runs when dispatched
# manually (no push / pull_request triggers are declared).
name: Smoke Tests for SageMaker

on:
  workflow_dispatch:
    inputs:
      # Optional: selects which commit's build artifact to test. When left
      # empty, the job falls back to the SHA the workflow was dispatched on.
      commit_sha:
        description: "Commit SHA to use for artifacts"
        required: false
        type: string
jobs:
  smoke-tests-sagemaker:
    name: Run Smoke Tests for SageMaker
    runs-on: ubuntu-latest
    environment: sagemaker-e2e-tests-workflow-env
    # Once a `permissions` block is present, every scope that is not listed is
    # set to `none` for the job's GITHUB_TOKEN, so each scope the steps rely on
    # must be spelled out here.
    permissions:
      # Required to request the OIDC token used for `role-to-assume`.
      id-token: write
      # Required by actions/checkout.
      contents: read
      # Required by `gh run download`, which lists and downloads workflow
      # artifacts using GH_TOKEN.
      actions: read
    env:
      # Commit whose build artifact is tested: explicit input, else the
      # dispatching commit.
      COMMIT_SHA: ${{ inputs.commit_sha || github.sha }}
      GH_REF_NAME: ${{ github.ref_name }}
      SAGEMAKER_ARTIFACT_PREFIX: "code-editor-sagemaker-server"
      # Token picked up by the `gh` CLI.
      GH_TOKEN: ${{ github.token }}
      AWS_REGION: us-east-2
      AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
      ECR_REPOSITORY: ${{ secrets.ECR_REPOSITORY }}
      PROJECT_NAME: ${{ secrets.PROJECT_NAME }}
      DATAZONE_DOMAIN_ID: ${{ secrets.DATAZONE_DOMAIN_ID }}
      TEST_SAGEMAKER_ROLE: ${{ secrets.TEST_SAGEMAKER_ROLE }}
    steps:
# Check out the repository; later steps read the Dockerfile from it.
- name: Checkout code
  uses: actions/checkout@v4
  with:
    # 0 = fetch full history instead of a shallow single-commit clone.
    fetch-depth: 0
# Install the system packages used by this job (jq is used later to edit
# the SageMaker domain's custom image list).
- name: Setup environment
  run: |
    echo "Installing required dependencies"
    sudo apt-get update
    sudo apt-get install -y \
      quilt \
      libxml2-utils \
      jq \
      libx11-dev \
      libxkbfile-dev
# Provision Node.js on the runner.
- name: Setup Node.js
  uses: actions/setup-node@v4
  with:
    # Quoted so the version is always treated as a string.
    node-version: "22"
# Fetch the build artifact named after the commit under test. No run id is
# passed, so `gh` selects the artifact by name only.
- name: Download sagemaker build artifact
  run: |
    gh run download --name "${COMMIT_SHA}-code-editor-sagemaker-server-build"
# Fail fast if the download did not produce the expected tarball.
- name: Check build artifacts exist
  run: |
    ls -la

    # Files that must be present in the working directory after download.
    REQUIRED_ARTIFACTS=(
      "${SAGEMAKER_ARTIFACT_PREFIX}-build.tar.gz"
    )

    for file in "${REQUIRED_ARTIFACTS[@]}"; do
      if [[ -f "$file" ]]; then
        continue
      fi
      echo "Error: $file not found for commit $COMMIT_SHA"
      exit 1
    done
# Assume the role used for the ECR login and image push that follow.
- name: Configure ECR role AWS credentials
  uses: aws-actions/configure-aws-credentials@v4
  with:
    aws-region: ${{ env.AWS_REGION }}
    role-to-assume: ${{ secrets.TEST_ECR_ROLE }}
# Unpack the downloaded build tarball into the working directory.
- name: Extract artifacts
  run: |
    tar --extract --gzip --file="${SAGEMAKER_ARTIFACT_PREFIX}-build.tar.gz"
# Build the SageMaker image, push it to ECR, and export its digest-pinned URI
# as ECR_IMAGE_URI for later steps.
- name: Build and push Docker image
  run: |
    # With no explicit `shell`, the script runs under `bash -e` without
    # pipefail. Enable it so a failure on the left side of a pipeline
    # (ECR login, digest lookup) aborts the step instead of being masked.
    set -o pipefail

    # Login to ECR
    ECR_REGISTRY="$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com"
    aws ecr get-login-password --region "$AWS_REGION" \
      | docker login --username AWS --password-stdin "$ECR_REGISTRY"

    # Create image tag with branch-commit format; any character outside
    # [a-zA-Z0-9-] in the ref name is replaced with '-'.
    BRANCH_NAME=$(echo "$GH_REF_NAME" | sed 's/[^a-zA-Z0-9-]/-/g')
    IMAGE_TAG="$BRANCH_NAME-$COMMIT_SHA"
    LOCAL_IMAGE="$ECR_REPOSITORY:$IMAGE_TAG"
    REMOTE_IMAGE="$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"

    # Build image
    docker build -f .github/workflows/dockerfiles/Dockerfile.sagemaker -t "$LOCAL_IMAGE" .

    # Tag image
    docker tag "$LOCAL_IMAGE" "$REMOTE_IMAGE"

    # Push image (push output is intentionally discarded)
    docker push "$REMOTE_IMAGE" > /dev/null
    echo "Docker image pushed successfully"

    # Get and store the image SHA digest
    IMAGE_SHA=$(docker inspect --format='{{index .RepoDigests 0}}' "$REMOTE_IMAGE" | cut -d'@' -f2)
    if [ -z "$IMAGE_SHA" ]; then
      # Never export a URI ending in a bare '@'; later steps would fail
      # far from the real cause.
      echo "Error: could not determine digest of pushed image"
      exit 1
    fi
    IMAGE_URI="$ECR_REGISTRY/$ECR_REPOSITORY@$IMAGE_SHA"
    echo "ECR_IMAGE_URI=$IMAGE_URI" >> "$GITHUB_ENV"
    echo "Image pushed successfully with SHA: ${IMAGE_SHA:0:12}..."

    # Clean up local Docker images and build artifacts to free disk space
    docker rmi "$LOCAL_IMAGE" "$REMOTE_IMAGE"
    echo "Local Docker images cleaned up"
    rm -rf vscode-reh-web-linux-x64
    rm -rf "$SAGEMAKER_ARTIFACT_PREFIX-build"
    echo "Local build artifacts cleaned up"
# Switch credentials to the role used for the DataZone / SageMaker calls
# in the remaining steps.
- name: Configure SageMaker role AWS credentials
  uses: aws-actions/configure-aws-credentials@v4
  with:
    aws-region: ${{ env.AWS_REGION }}
    role-to-assume: ${{ secrets.TEST_SAGEMAKER_ROLE }}
# Resolve the DataZone project and its SageMaker domain, create a SageMaker
# image named after branch+commit, wait for it, then create an image version
# from the pushed ECR image. Exports SAGEMAKER_DOMAIN_ID, PROJECT_ID and
# SAGEMAKER_IMAGE_NAME for later steps.
- name: Create SageMaker code editor image
  run: |
    # Fetch DataZone project ID
    PROJECT_ID=$(aws datazone list-projects --domain-identifier "$DATAZONE_DOMAIN_ID" --name "$PROJECT_NAME" --query 'items[0].id' --output text)
    if [ -z "$PROJECT_ID" ] || [ "$PROJECT_ID" = "None" ]; then
      echo "Error: DataZone project not found"
      exit 1
    fi
    echo "DataZone project found successfully"
    echo "::add-mask::$PROJECT_ID"

    # Find SageMaker domain by project ID (domain name contains project ID).
    # `| [0]` keeps only the first match: with text output, several matches
    # would otherwise come back tab-separated and be exported as a single
    # bogus domain id. No match yields the literal text "None".
    DOMAIN_ID=$(aws sagemaker list-domains --query "Domains[?contains(DomainName, '$PROJECT_ID')].DomainId | [0]" --output text)
    if [ -z "$DOMAIN_ID" ] || [ "$DOMAIN_ID" = "None" ]; then
      echo "Error: SageMaker domain not found for project ID"
      exit 1
    fi
    echo "SageMaker domain found successfully"
    echo "::add-mask::$DOMAIN_ID"
    echo "SAGEMAKER_DOMAIN_ID=$DOMAIN_ID" >> "$GITHUB_ENV"
    echo "PROJECT_ID=$PROJECT_ID" >> "$GITHUB_ENV"

    # Create SageMaker image
    # NOTE(review): the name is <sanitized-branch>-<full SHA>; confirm it
    # always satisfies SageMaker's image-name length/character rules.
    BRANCH_NAME=$(echo "$GH_REF_NAME" | sed 's/[^a-zA-Z0-9-]/-/g')
    IMAGE_NAME="$BRANCH_NAME-${COMMIT_SHA}"
    aws sagemaker create-image \
      --image-name "$IMAGE_NAME" \
      --role-arn "$TEST_SAGEMAKER_ROLE"

    # Wait for image to be ready (max 30 retries = 5 minutes)
    echo "Waiting for SageMaker image to be ready..."
    RETRY_COUNT=0
    MAX_RETRIES=30
    while [ "$RETRY_COUNT" -lt "$MAX_RETRIES" ]; do
      STATUS=$(aws sagemaker describe-image --image-name "$IMAGE_NAME" --query 'ImageStatus' --output text)
      echo "Image status: $STATUS (attempt $((RETRY_COUNT + 1))/$MAX_RETRIES)"
      if [ "$STATUS" = "CREATED" ]; then
        break
      elif [ "$STATUS" = "CREATE_FAILED" ]; then
        echo "Image creation failed"
        exit 1
      fi
      RETRY_COUNT=$((RETRY_COUNT + 1))
      sleep 10
    done
    # The loop only reaches MAX_RETRIES when it never hit `break`.
    if [ "$RETRY_COUNT" -eq "$MAX_RETRIES" ]; then
      echo "Timeout waiting for image to be ready"
      exit 1
    fi

    # Create image version
    aws sagemaker create-image-version \
      --image-name "$IMAGE_NAME" \
      --base-image "$ECR_IMAGE_URI"
    echo "SAGEMAKER_IMAGE_NAME=$IMAGE_NAME" >> "$GITHUB_ENV"
# Wait for the image version, make sure the shared AppImageConfig exists,
# then add the image to the domain's Code Editor custom image list.
- name: Attach image to domain
  run: |
    # Wait for image version to be ready (100 seconds)
    echo "Waiting for image version to be ready..."
    RETRY_COUNT=0
    MAX_RETRIES=10
    while [ "$RETRY_COUNT" -lt "$MAX_RETRIES" ]; do
      VERSION_STATUS=$(aws sagemaker describe-image-version --image-name "$SAGEMAKER_IMAGE_NAME" --query 'ImageVersionStatus' --output text)
      echo "Image version status: $VERSION_STATUS (attempt $((RETRY_COUNT + 1))/$MAX_RETRIES)"
      if [ "$VERSION_STATUS" = "CREATED" ]; then
        break
      elif [ "$VERSION_STATUS" = "CREATE_FAILED" ]; then
        echo "Image version creation failed"
        exit 1
      fi
      RETRY_COUNT=$((RETRY_COUNT + 1))
      sleep 10
    done
    # The loop only reaches MAX_RETRIES when it never hit `break`.
    if [ "$RETRY_COUNT" -eq "$MAX_RETRIES" ]; then
      echo "Timeout waiting for image version to be ready"
      exit 1
    fi

    # Create universal AppImageConfig (if it doesn't exist)
    APP_IMAGE_CONFIG_NAME="code-editor-app-config"
    if ! aws sagemaker describe-app-image-config --app-image-config-name "$APP_IMAGE_CONFIG_NAME" >/dev/null 2>&1; then
      aws sagemaker create-app-image-config \
        --app-image-config-name "$APP_IMAGE_CONFIG_NAME" \
        --code-editor-app-image-config '{}'
      echo "Created universal AppImageConfig: $APP_IMAGE_CONFIG_NAME"
    else
      echo "Universal AppImageConfig already exists: $APP_IMAGE_CONFIG_NAME"
    fi

    # Get existing custom images. A failure here must abort the step:
    # falling back to an empty list would make the update below replace the
    # domain's current custom images with only the new one.
    EXISTING_IMAGES=$(aws sagemaker describe-domain --domain-id "$SAGEMAKER_DOMAIN_ID" --query 'DefaultUserSettings.CodeEditorAppSettings.CustomImages' --output json)

    # Build the full settings document: existing images + the new one,
    # de-duplicated by image name. `// []` covers the case where the domain
    # has no custom images yet and the query result is null.
    DEFAULT_USER_SETTINGS=$(echo "$EXISTING_IMAGES" | jq -c \
      --arg imageName "$SAGEMAKER_IMAGE_NAME" \
      --arg configName "$APP_IMAGE_CONFIG_NAME" \
      '{CodeEditorAppSettings: {CustomImages: ((. // []) + [{"ImageName": $imageName, "ImageVersionNumber": 1, "AppImageConfigName": $configName}] | unique_by(.ImageName))}}')

    # Update domain with all custom images
    aws sagemaker update-domain \
      --domain-id "$SAGEMAKER_DOMAIN_ID" \
      --default-user-settings "$DEFAULT_USER_SETTINGS"
    echo "Image attached to domain successfully"
# Create a private Code Editor space named after branch+commit that uses the
# freshly created SageMaker image. Exports SAGEMAKER_SPACE_NAME.
- name: Create SageMaker code editor space
  run: |
    # Create space name using branch-commit format
    BRANCH_NAME=$(echo "$GH_REF_NAME" | sed 's/[^a-zA-Z0-9-]/-/g')
    SPACE_NAME="$BRANCH_NAME-${COMMIT_SHA}"

    # The space owner is the first user profile returned for the domain.
    # `--domain-id-equals` is the documented filter option of
    # list-user-profiles. Resolve the owner up front so a missing profile
    # (text output "None") is reported clearly instead of being embedded
    # in the request.
    OWNER_PROFILE=$(aws sagemaker list-user-profiles --domain-id-equals "$SAGEMAKER_DOMAIN_ID" --query 'UserProfiles[0].UserProfileName' --output text)
    if [ -z "$OWNER_PROFILE" ] || [ "$OWNER_PROFILE" = "None" ]; then
      echo "Error: no user profile found in SageMaker domain"
      exit 1
    fi

    # Build the JSON arguments with jq so values are escaped properly rather
    # than spliced in through shell quote concatenation.
    OWNERSHIP_SETTINGS=$(jq -cn --arg owner "$OWNER_PROFILE" \
      '{OwnerUserProfileName: $owner}')
    SPACE_SETTINGS=$(jq -cn \
      --arg imageArn "arn:aws:sagemaker:$AWS_REGION:$AWS_ACCOUNT_ID:image/$SAGEMAKER_IMAGE_NAME" \
      '{
        AppType: "CodeEditor",
        RemoteAccess: "DISABLED",
        SpaceStorageSettings: {
          EbsStorageSettings: {
            EbsVolumeSizeInGb: 16
          }
        },
        CodeEditorAppSettings: {
          DefaultResourceSpec: {
            SageMakerImageArn: $imageArn,
            InstanceType: "ml.t3.medium"
          }
        }
      }')

    # Create the space
    aws sagemaker create-space \
      --domain-id "$SAGEMAKER_DOMAIN_ID" \
      --space-name "$SPACE_NAME" \
      --ownership-settings "$OWNERSHIP_SETTINGS" \
      --space-sharing-settings '{"SharingType": "Private"}' \
      --space-settings "$SPACE_SETTINGS"
    echo "Created SageMaker space: $SPACE_NAME"
    echo "SAGEMAKER_SPACE_NAME=$SPACE_NAME" >> "$GITHUB_ENV"
# Poll until the space reports InService, then request the CodeEditor app
# that starts it.
- name: Start SageMaker code editor space
  run: |
    # Wait for space to be ready (200 seconds)
    echo "Waiting for space to be ready..."
    MAX_ATTEMPTS=20
    SPACE_READY=false
    for ((attempt = 1; attempt <= MAX_ATTEMPTS; attempt++)); do
      SPACE_STATUS=$(aws sagemaker describe-space --domain-id "$SAGEMAKER_DOMAIN_ID" --space-name "$SAGEMAKER_SPACE_NAME" --query 'Status' --output text)
      echo "Space status: $SPACE_STATUS (attempt $attempt/$MAX_ATTEMPTS)"
      case "$SPACE_STATUS" in
        InService)
          SPACE_READY=true
          break
          ;;
        Failed)
          echo "Space creation failed"
          exit 1
          ;;
      esac
      # Any other status: wait and poll again.
      sleep 10
    done

    # All attempts used without ever seeing InService.
    if [ "$SPACE_READY" != "true" ]; then
      echo "Timeout waiting for space to be ready"
      exit 1
    fi

    # Create app to start the space
    aws sagemaker create-app \
      --domain-id "$SAGEMAKER_DOMAIN_ID" \
      --space-name "$SAGEMAKER_SPACE_NAME" \
      --app-type "CodeEditor" \
      --app-name "default"
    echo "Started SageMaker space: $SAGEMAKER_SPACE_NAME"