Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ venv/
__pycache__/
*.charm
build/
images/
.idea
sel-screenshots/*
geckodriver.log
Expand Down
34 changes: 34 additions & 0 deletions scripts/get_upstream_images/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# get_upstream_images.py

This script automates the extraction of container image references from the official [Kubeflow Manifests](https://github.com/kubeflow/manifests) repository.

## Usage

From inside this directory:

```shell
python3 get_upstream_images.py
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
python3 extract_images.py
python3 get_upstream_images.py

suggestion(blocking): probably a typo but the script name doesn't match

```

The script creates an `images` directory in the folder where you executed the script. Inside, it generates text files containing a sorted, unique list of images. You will get one file per processed workgroup, plus an aggregate file containing all images across the entire deployment:
```
images/
├── kf_1.11.0_katib_images.txt
├── kf_1.11.0_kserve_images.txt
├── kf_1.11.0_pipelines_images.txt
└── kf_1.11.0_all_images.txt
```

By default, the script fetches the `latest` version of the manifests (corresponding to the `master` branch). To specify a version, pass a positional argument:

```shell
python3 get_upstream_images.py 1.11.0
```

You can also skip any working groups by passing the `--skip` argument:

```shell
python3 get_upstream_images.py --skip spark model-registry
```

The default value of `--skip` is `["spark", "model-registry"]` (since we don't currently have charmed operators for these controllers).
161 changes: 161 additions & 0 deletions scripts/get_upstream_images/get_upstream_images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
# This script is based on the following upstream script:
# https://github.com/kubeflow/manifests/blob/0837fb9cf3ec73f51cbddff656a160cb258eaad5/tests/trivy_scan.py

import os
import subprocess
import re
import argparse
import tempfile

# Kubeflow workgroups mapped to the kustomization directories to scan.
# Each value is a whitespace-separated list of paths; consumers split the
# string on whitespace and walk every resulting path, so every token must be a
# real directory path (relative to the `tests/` directory of the manifests
# checkout, which is the working directory while images are extracted).
wg_dirs = {
    "katib": "../applications/katib/upstream/installs",
    "pipelines": (
        "../applications/pipeline/upstream/env/cert-manager/platform-agnostic-multi-user"
    ),
    "trainer": (
        "../applications/training-operator/upstream/overlays "
        "../applications/trainer/overlays"
    ),
    "manifests": (
        "../common/cert-manager/cert-manager/base "
        "../common/cert-manager/kubeflow-issuer/base "
        "../common/istio/istio-crds/base "
        "../common/istio/istio-namespace/base "
        "../common/istio/istio-install/overlays/oauth2-proxy "
        "../common/oauth2-proxy/overlays/m2m-self-signed "
        "../common/dex/overlays/oauth2-proxy "
        "../common/knative/knative-serving/overlays/gateways "
        "../common/knative/knative-eventing/base "
        "../common/istio/cluster-local-gateway/base "
        "../common/kubeflow-namespace/base "
        "../common/kubeflow-roles/base "
        "../common/istio/kubeflow-istio-resources/base"
    ),
    "workbenches": (
        "../applications/pvcviewer-controller/upstream/base "
        "../applications/admission-webhook/upstream/overlays "
        "../applications/centraldashboard/overlays "
        "../applications/jupyter/jupyter-web-app/upstream/overlays "
        "../applications/volumes-web-app/upstream/overlays "
        "../applications/tensorboard/tensorboards-web-app/upstream/overlays "
        "../applications/profiles/upstream/overlays "
        "../applications/jupyter/notebook-controller/upstream/overlays "
        "../applications/tensorboard/tensorboard-controller/upstream/overlays"
    ),
    # A lone "-" token used to sit between these two paths; it is not a
    # directory and was silently ignored when walked, so it has been dropped.
    "kserve": (
        "../applications/kserve "
        "../applications/kserve/models-web-app/overlays/kubeflow"
    ),
    "model-registry": (
        "../applications/model-registry/upstream/overlays/db "
        "../applications/model-registry/upstream/options/istio "
        "../applications/model-registry/upstream/options/ui/overlays/istio"
    ),
    "spark": "../applications/spark/spark-operator/overlays/kubeflow",
}

# Despite its name, SCRIPT_DIRECTORY is the current working directory at
# import time (i.e. where the script was launched from), not the directory
# that contains this file.
SCRIPT_DIRECTORY = os.getcwd()
# All generated image lists are written into an `images` folder under it.
IMAGES_DIRECTORY = os.path.join(SCRIPT_DIRECTORY, "images")
# Created eagerly at import time; exist_ok makes repeated runs harmless.
os.makedirs(IMAGES_DIRECTORY, exist_ok=True)

def log(*args, **kwargs):
    """Print like the built-in ``print``, flushing unless the caller overrides it.

    Every positional and keyword argument is forwarded to ``print``; the only
    difference is that ``flush`` defaults to ``True`` instead of ``False``.
    """
    if "flush" not in kwargs:
        kwargs["flush"] = True
    print(*args, **kwargs)


def save_images(wg, images, version):
    """Write *images* to ``kf_<version>_<wg>_images.txt`` in ``IMAGES_DIRECTORY``.

    The entries are joined with newlines (no trailing newline) and any
    existing file with the same name is overwritten. The resulting path is
    logged afterwards.
    """
    file_name = f"kf_{version}_{wg}_images.txt"
    output_file = os.path.join(IMAGES_DIRECTORY, file_name)
    contents = "\n".join(images)
    with open(output_file, "w") as handle:
        handle.write(contents)
    log(f"File {output_file} successfully created")


# Accepted version shapes:
#   - an optional leading "v"                      ("v1.11.0")
#   - MAJOR.MINOR with an optional .PATCH          ("1.11.0", "26.03")
#   - an optional SemVer-style pre-release suffix  ("1.11.0-rc.1", "26.03-rc.1")
#   - an optional SemVer-style build-metadata part ("1.11.0+build.5")
# Numeric groups deliberately allow leading zeros so that calendar-style
# versions such as "26.03" are accepted.
_VERSION_PATTERN = re.compile(
    r"v?"
    r"[0-9]+\.[0-9]+(?:\.[0-9]+)?"
    r"(?:-[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*)?"
    r"(?:\+[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*)?"
)


def validate_semantic_version(version):
    """Validate a Kubeflow version string and return it in normalized form.

    Args:
        version: The literal ``"latest"`` or a version such as ``"1.11.0"``,
            ``"26.03"``, ``"1.11.0-rc.1"`` or ``"v1.11.0"``.

    Returns:
        ``"latest"`` unchanged, otherwise the version with any leading ``v``
        removed, so that ``"v1.11.0"`` and ``"1.11.0"`` are equivalent.

    Raises:
        ValueError: If *version* is neither ``"latest"`` nor a version in one
            of the accepted shapes.
    """
    if version == "latest":
        return version
    # fullmatch (rather than match + "$") so a trailing newline is rejected:
    # the value ends up in file names and git refs.
    if not isinstance(version, str) or _VERSION_PATTERN.fullmatch(version) is None:
        raise ValueError(f"Invalid semantic version: '{version}'")
    return version[1:] if version.startswith("v") else version


# File names that mark a directory as a kustomization root.
_KUSTOMIZATION_FILES = frozenset(
    {"kustomization.yaml", "kustomization.yml", "Kustomization"}
)

# Matches `image: <name>[:<tag>]` and `- image: <name>[:<tag>]` lines. The
# name part may not contain `$`, which filters out unexpanded variables.
_IMAGE_LINE_RE = re.compile(
    r"^\s*-?\s*image:\s*([^$\s:]+(?:\:[^\s]+)?)$",
    re.MULTILINE,
)


def _render_images(kustomization_dir):
    """Return the set of images in the output of ``kustomize build <dir>``.

    A failing build is logged together with kustomize's own error output and
    yields an empty set, so one broken overlay does not abort the whole run.
    """
    try:
        result = subprocess.run(
            ["kustomize", "build", kustomization_dir],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except subprocess.CalledProcessError as err:
        log(
            f'ERROR:\t Failed "kustomize build" command for directory: {kustomization_dir}.'
        )
        # stderr is captured above, so it has to be printed explicitly;
        # otherwise the reason for the failure is lost.
        if err.stderr:
            log(err.stderr.strip())
        return set()
    return set(_IMAGE_LINE_RE.findall(result.stdout))


def extract_images(version, skip_list=None):
    """Collect container images from the kustomizations of every workgroup.

    Must be called with the current working directory set so that the
    relative paths in ``wg_dirs`` resolve. For each workgroup that is not
    skipped, every directory below its configured paths that contains a
    kustomization file is rendered with ``kustomize build`` and scanned for
    ``image:`` lines. One sorted, de-duplicated file is saved per workgroup,
    plus an ``all`` file with the union.

    Args:
        version: Kubeflow version (or ``"latest"``); used in the output file
            names after validation.
        skip_list: Workgroup names to leave out. ``None`` means skip nothing.

    Raises:
        ValueError: If *version* is not a valid version string.
    """
    if skip_list is None:
        skip_list = []
    version = validate_semantic_version(version)
    log(f"Running the script using Kubeflow version: {version}")

    if skip_list:
        log(f"Skipping workgroups: {', '.join(skip_list)}")

    all_images = set()  # Unique images across all workgroups

    for wg, dirs in wg_dirs.items():
        if wg in skip_list:
            continue
        wg_images = set()  # Unique images for this workgroup
        for dir_path in dirs.split():
            for root, _, files in os.walk(dir_path):
                # Build each kustomization root once, even if it happens to
                # contain more than one recognised file name.
                if _KUSTOMIZATION_FILES.intersection(files):
                    wg_images.update(_render_images(root))

        all_images.update(wg_images)
        save_images(wg, sorted(wg_images), version)

    save_images("all", sorted(all_images), version)


def clone_and_extract_images(version, skip_list):
    """Clone kubeflow/manifests into a temporary directory and extract its images.

    The version is validated before anything is cloned. ``"latest"`` clones
    the repository's default branch; any other version is looked up first as
    ``v<version>`` and then as ``<version>``, because not every upstream ref
    carries the ``v`` prefix. ``git clone --branch`` accepts both tags and
    branches, so either kind of ref works.

    Args:
        version: Kubeflow version to fetch, or ``"latest"``.
        skip_list: Workgroup names to leave out of the extraction.

    Raises:
        ValueError: If *version* is not a valid version string.
        SystemExit: With exit code 1 if the repository cannot be cloned.
    """
    repo_url = "https://github.com/kubeflow/manifests.git"

    # Fail fast on a bad version instead of paying for a clone first.
    version = validate_semantic_version(version)
    if version == "latest":
        candidate_refs = [None]  # None -> no --branch, i.e. the default branch
    else:
        candidate_refs = [f"v{version}", version]

    with tempfile.TemporaryDirectory() as temp_dir:
        log(f"Cloning repository: {repo_url}")
        clone_error = None
        for ref in candidate_refs:
            # Shallow clone for speed.
            clone_cmd = ["git", "clone", "--depth", "1"]
            if ref is not None:
                clone_cmd.extend(["--branch", ref])
            clone_cmd.extend([repo_url, "manifests"])
            try:
                subprocess.run(
                    clone_cmd,
                    cwd=temp_dir,
                    check=True,
                    stdout=subprocess.DEVNULL,
                    # Keep git's message so a failure can be explained.
                    stderr=subprocess.PIPE,
                    text=True,
                )
            except subprocess.CalledProcessError as err:
                clone_error = err
            else:
                clone_error = None
                break

        if clone_error is not None:
            log(f"ERROR: Failed to clone repository. Details: {clone_error}")
            if clone_error.stderr:
                log(clone_error.stderr.strip())
            raise SystemExit(1)

        # The workgroup paths are relative to the repository's tests/ folder.
        tests_path = os.path.join(temp_dir, "manifests", "tests")
        os.chdir(tests_path)
        try:
            extract_images(version, skip_list)
        finally:
            # Always leave the temporary tree before it is deleted, even when
            # extraction raises, so the process is not left in a removed cwd.
            os.chdir(SCRIPT_DIRECTORY)


def _version_argument(value):
    """argparse ``type`` hook that validates the version while parsing.

    Converting the ``ValueError`` into ``ArgumentTypeError`` makes argparse
    print its usual usage line and a short error instead of a traceback.
    """
    try:
        return validate_semantic_version(value)
    except ValueError as err:
        raise argparse.ArgumentTypeError(str(err)) from err


def build_parser():
    """Build the command-line parser for this script."""
    parser = argparse.ArgumentParser(
        description="Extract images from Kubeflow kustomizations."
    )
    # Optional positional argument: `python3 <file>.py` uses "latest",
    # `python3 <file>.py <version>` selects a specific version.
    parser.add_argument(
        "version",
        nargs="?",
        type=_version_argument,
        default="latest",
        help="Kubeflow version to use (defaults to latest).",
    )
    # Workgroups to leave out of the extraction.
    parser.add_argument(
        "--skip",
        nargs="*",
        type=str,
        default=["spark", "model-registry"],
        help="List of workgroups to skip (e.g., --skip katib spark manifests). Defaults to [spark, model-registry]",
    )
    return parser


def main(argv=None):
    """Parse *argv* (``sys.argv[1:]`` when ``None``) and run the extraction."""
    args = build_parser().parse_args(argv)
    clone_and_extract_images(args.version, args.skip)


# Only act when executed as a script, so importing the module does not parse
# the importing process's command line or start a clone.
if __name__ == "__main__":
    main()