Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion custom_image_utils/args_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,5 +247,4 @@ def parse_args(args):
(Only supported for Dataproc Images 2.3 and above)"""
)


return parser.parse_args(args)
13 changes: 11 additions & 2 deletions custom_image_utils/shell_script_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,11 @@
# too many serial port output requests per minute occur if they all occur at once
sleep $(( ( RANDOM % 60 ) + 20 ))

gcloud compute instances describe --format json {image_name}-install --zone {zone} | tee {log_dir}/instance.json
if (( DEBUG != 0 )); then
gcloud compute instances describe --format json {image_name}-install --project={project_id} --zone {zone} | tee {log_dir}/instance.json
else
gcloud compute instances describe --format json {image_name}-install --project={project_id} --zone {zone} > {log_dir}/instance.json
fi

execute_with_retries gcloud compute instances tail-serial-port-output {image_name}-install \
--project={project_id} \
Expand Down Expand Up @@ -300,6 +304,8 @@ def _init_args(self, args):
"run.sh": "startup_script/run.sh",
"init_actions.sh": self.args["customization_script"]
}
if self.args.get("metadata") and "http-proxy" in self.args["metadata"]:
all_sources["gce-proxy-setup.sh"] = "startup_script/gce-proxy-setup.sh"
all_sources.update(self.args["extra_sources"])

sources_map_items = tuple(enumerate(all_sources.items()))
Expand Down Expand Up @@ -329,8 +335,11 @@ def _init_args(self, args):
self.args[
"storage_location_flag"] = "--storage-location={storage_location}".format(
**self.args) if self.args["storage_location"] else ""
self.args[
"gce_startup_script_flag"] = "--metadata-from-file startup-script={gce_startup_script}".format(
**self.args) if self.args.get("gce_startup_script") else ""
metadata_flag_template = (
"--metadata=shutdown-timer-in-sec={shutdown_timer_in_sec},"
"--metadata=VmDnsSetting=ZonalOnly,shutdown-timer-in-sec={shutdown_timer_in_sec},"
"custom-sources-path={custom_sources_path}"
)
if self.args["zone"]:
Expand Down
29 changes: 29 additions & 0 deletions examples/secure-boot/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,35 @@ SC by ensuring that the underlying infrastructure is secure and
trusted. This helps to strengthen the overall security posture of
Google Cloud Platform environments and protect sensitive data.

## Development Workflow for Customization Scripts

When developing a new customization script (like `install_gpu_driver.sh` or others in this directory) to be used with the custom image build process, the following workflow is recommended:

1. **Manual Execution on a Live Cluster Node:**
* Create a standard Dataproc cluster matching the target OS version (e.g., 2.2-debian12).
* SSH into a node (e.g., the master node).
* Copy your script to the node (e.g., using `gcloud compute scp your-script.sh node-name:/tmp/`).
* Get a clean root shell: `exec sudo -i bash`
* Run the script with tracing and timing: `time bash -x /tmp/your-script.sh`.
* Debug and refine the script in this environment. This allows for rapid iteration, and the `time` command helps identify performance bottlenecks. Add idempotency checks to your script to allow for faster re-runs during debugging.

2. **Test as an Init Action on Cluster Creation:**
* Once the script runs cleanly when executed manually as root, test it as an initialization action during cluster creation.
* Upload the script to a GCS bucket.
* Create a new cluster using `gcloud dataproc clusters create --initialization-actions gs://your-bucket/your-script.sh ...`.
* This verifies the script works correctly within the Dataproc cluster startup sequence and with the permissions of the init action environment.

3. **Test as a Custom Image Customization Script:**
* Use the `temp/create-custom-image-test-host-X.X.sh` scripts (e.g., `create-custom-image-test-host-2.2.sh`) to create a single builder VM mimicking the custom image creation environment for the target OS.
* This will run your script via the `startup_script/run.sh` wrapper, just like the real image build process.
* SSH to the debug instance (e.g., `debug-deb12-build`) to check logs and behavior. The relevant logs will be in `/tmp` on the instance.

4. **Full Custom Image Build:**
* Once the script works in the single debug instance, run the full build process using `examples/secure-boot/build-and-run-podman.sh` (ensure that script is configured to invoke `examples/secure-boot/build-current-images.sh`).
* This will build all image variants and purposes defined in `pre-init.sh` and its screenrc.

This staged approach helps isolate issues, starting from manual execution and progressively moving closer to the automated custom image build process.

## Examples

To create a custom image with a self-signed, trusted certificate
Expand Down
181 changes: 26 additions & 155 deletions examples/secure-boot/build-current-images.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,169 +26,41 @@
# cp examples/secure-boot/env.json.sample env.json
# vi env.json
# docker build -f Dockerfile -t custom-images-builder:latest .
# time docker run -it custom-images-builder:latest bash examples/secure-boot/build-current-images.sh


set -ex

# Run a command line, retrying up to three times before giving up.
# Executes in a subshell so the xtrace toggles and local variables do
# not leak into the caller.  Each attempt's combined stdout/stderr is
# captured to ${tmpdir}/install.log and replayed only when the attempt
# fails.  Returns 0 on the first success, 1 after three failures.
function execute_with_retries() (
  set +x
  local -r full_command="$*"
  local -r attempt_log="${tmpdir}/install.log"
  local attempt rc

  for attempt in 1 2 3; do
    set -x
    if eval "$full_command" > "${attempt_log}" 2>&1 ; then
      rc=0
    else
      rc=$?
      cat "${attempt_log}"
    fi
    set +x
    if [[ $rc == 0 ]] ; then return 0 ; fi
    sleep 5
  done
  return 1
)

# Ensure the pre-init customization service account exists and grant it
# the project, storage, secret, and compute roles the image build needs.
#
# Globals (read): GSA, SA_NAME, PROJECT_ID, public_secret_name,
#   private_secret_name
# Side effects: may create the service account, rotates any existing
#   ./tls directory, sources a fresh key pair, and adds IAM policy
#   bindings (undone by revoke_bindings).
function configure_service_account() {
  # Create the service account only if it does not already exist.
  # Checking with `describe` relies on the command's exit status rather
  # than grepping human-readable `list` output ("Listed 0 items."),
  # which is fragile across gcloud versions and locales.
  if ! gcloud iam service-accounts describe "${GSA}" > /dev/null 2>&1 ; then
    # Create service account for this purpose
    echo "creating pre-init customization service account ${GSA}"
    gcloud iam service-accounts create "${SA_NAME}" \
      --description="Service account for pre-init customization" \
      --display-name="${SA_NAME}"
  fi

  # Rotate any stale key material aside before generating a fresh pair.
  if [[ -d tls ]] ; then mv tls "tls-$(date +%s)" ; fi
  eval "$(bash examples/secure-boot/create-key-pair.sh)"

  execute_with_retries gcloud projects add-iam-policy-binding "${PROJECT_ID}" \
    --member="serviceAccount:${GSA}" \
    --role="roles/dataproc.worker" \
    --condition=None

  # Grant the service account access to buckets in this project
  # TODO: this is over-broad and should be limited only to the buckets
  # used by these clusters
  for storage_object_role in 'User' 'Creator' 'Viewer' ; do
    execute_with_retries gcloud projects add-iam-policy-binding "${PROJECT_ID}" \
      --member="serviceAccount:${GSA}" \
      --role="roles/storage.object${storage_object_role}" \
      --condition=None
  done

  for secret in "${public_secret_name}" "${private_secret_name}" ; do
    for sm_role in 'viewer' 'secretAccessor' ; do
      # Grant the service account permission to list and access the secret
      execute_with_retries gcloud secrets -q add-iam-policy-binding "${secret}" \
        --member="serviceAccount:${GSA}" \
        --role="roles/secretmanager.${sm_role}" \
        --condition=None
    done
  done

  execute_with_retries gcloud projects add-iam-policy-binding "${PROJECT_ID}" \
    --member="serviceAccount:${GSA}" \
    --role=roles/compute.instanceAdmin.v1 \
    --condition=None

  execute_with_retries gcloud iam service-accounts add-iam-policy-binding "${GSA}" \
    --member="serviceAccount:${GSA}" \
    --role=roles/iam.serviceAccountUser \
    --condition=None
}

# Tear down every IAM binding that configure_service_account granted,
# returning the project's IAM policy to its pre-build state.
#
# Globals (read): GSA, PROJECT_ID, public_secret_name, private_secret_name
function revoke_bindings() {
  execute_with_retries gcloud projects remove-iam-policy-binding "${PROJECT_ID}" \
    --member="serviceAccount:${GSA}" \
    --role="roles/dataproc.worker" \
    --condition=None

  # Drop the bucket-object access that was granted for the build
  for object_role in 'User' 'Creator' 'Viewer' ; do
    execute_with_retries gcloud projects remove-iam-policy-binding "${PROJECT_ID}" \
      --member="serviceAccount:${GSA}" \
      --role="roles/storage.object${object_role}" \
      --condition=None
  done

  # Drop list/access permission on both halves of the signing key pair
  for secret_id in "${public_secret_name}" "${private_secret_name}" ; do
    for secret_role in 'viewer' 'secretAccessor' ; do
      execute_with_retries gcloud secrets -q remove-iam-policy-binding "${secret_id}" \
        --member="serviceAccount:${GSA}" \
        --role="roles/secretmanager.${secret_role}" \
        --condition=None
    done
  done

  execute_with_retries gcloud projects remove-iam-policy-binding "${PROJECT_ID}" \
    --member="serviceAccount:${GSA}" \
    --role=roles/compute.instanceAdmin.v1 \
    --condition=None

  execute_with_retries gcloud iam service-accounts remove-iam-policy-binding "${GSA}" \
    --member="serviceAccount:${GSA}" \
    --role=roles/iam.serviceAccountUser \
    --condition=None
}


export DOMAIN="$(jq -r .DOMAIN env.json)"
export PROJECT_ID="$(jq -r .PROJECT_ID env.json)"
export PURPOSE="$(jq -r .PURPOSE env.json)"
export BUCKET="$(jq -r .BUCKET env.json)"
export SECRET_NAME="$(jq -r .SECRET_NAME env.json)"
export REGION="$(jq -r .REGION env.json)"
export ZONE="$(jq -r .ZONE env.json)"
export PRINCIPAL_USER="$(jq -r .PRINCIPAL env.json)"
export PRINCIPAL_DOMAIN="$(jq -r .DOMAIN env.json)"
export PRINCIPAL="${PRINCIPAL_USER}@${PRINCIPAL_DOMAIN}"

echo -n "setting gcloud config..."
gcloud config set project "${PROJECT_ID}"
gcloud config set account "${PRINCIPAL}"
gcloud auth login

CURRENT_COMPUTE_REGION="$(gcloud config get compute/region)"
if [[ "${CURRENT_COMPUTE_REGION}" != "${REGION}" ]]; then
echo "setting compute region"
gcloud config set compute/region "${REGION}"
fi
CURRENT_DATAPROC_REGION="$(gcloud config get dataproc/region)"
if [[ "${CURRENT_DATAPROC_REGION}" != "${REGION}" ]]; then
echo "setting dataproc region"
gcloud config set dataproc/region "${REGION}"
fi
CURRENT_COMPUTE_ZONE="$(gcloud config get compute/zone)"
if [[ "${CURRENT_COMPUTE_ZONE}" != "${ZONE}" ]]; then
echo "setting compute zone"
gcloud config set compute/zone "${ZONE}"
fi
SA_NAME="sa-${PURPOSE}"

if [[ "${PROJECT_ID}" =~ ":" ]] ; then
GSA="${SA_NAME}@${PROJECT_ID#*:}.${PROJECT_ID%:*}.iam.gserviceaccount.com"
else
GSA="${SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com"
# export timestamp=$(date "+%Y%m%d-%H%M%S")
# echo "Log directory: ./tmp/logs/${timestamp}"
# mkdir -p ./tmp/logs/${timestamp}
# time podman run -it \
# -v ~/.config/gcloud:/root/.config/gcloud \
# -v ./tmp/logs/${timestamp}:/tmp \
# -e DEBUG=0 \
# -e timestamp=${timestamp} \
# custom-images-builder:latest \
# bash examples/secure-boot/build-current-images.sh


set -e

DEBUG="${DEBUG:-0}"
if (( DEBUG != 0 )); then
set -x
fi

readonly timestamp="$(date "+%Y%m%d-%H%M%S")"
export timestamp

export tmpdir=/tmp/${timestamp};
mkdir -p ${tmpdir}
# Activate service account
source examples/secure-boot/lib/env.sh
source examples/secure-boot/lib/util.sh

configure_service_account
export tmpdir="${REPRO_TMPDIR}"
mkdir -p "${tmpdir}/sentinels"

# screen session name
session_name="build-current-images"

export ZONE="$(jq -r .ZONE env.json)"
gcloud compute instances list --zones "${ZONE}" --format json > ${tmpdir}/instances.json
gcloud compute images list --format json > ${tmpdir}/images.json

# Run generation scripts simultaneously for each dataproc image version
print_status "Starting screen session ${session_name} to build images... "
screen -L -US "${session_name}" -c examples/secure-boot/pre-init.screenrc
#report_result "Done"

function find_disk_usage() {
print_status "Analyzing disk usage... "
# grep maximum-disk-used /tmp/custom-image-*/logs/startup-script.log
grep -H 'Customization script' /tmp/custom-image-*/logs/workflow.log
echo '# DP_IMG_VER RECOMMENDED_DISK_SIZE DSK_SZ D_USED D_FREE D%F PURPOSE'
Expand All @@ -199,6 +71,5 @@ function find_disk_usage() {
| grep -A20 'Filesystem.*Avail' | tail -20 \
| perl examples/secure-boot/genline.pl "${startup_log}"
done
report_result "Done"
}

revoke_bindings
Loading