diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yml similarity index 96% rename from .github/workflows/e2e_tests.yaml rename to .github/workflows/e2e_tests.yml index fca6d6e7..ef8d7382 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yml @@ -3,14 +3,9 @@ name: e2e on: pull_request: - branches: - - main - - 'release-*' - paths-ignore: - - 'docs/**' - - '**.adoc' - - '**.md' - - 'LICENSE' + branches: [ main ] + push: + branches: [ main ] concurrency: group: ${{ github.head_ref }}-${{ github.workflow }} @@ -117,7 +112,7 @@ jobs: pip install poetry poetry install --with test,docs echo "Running e2e tests..." - poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1 + poetry run pytest -v -s ./tests/e2e/local_interactive_sdk_kind_test.py::TestRayLocalInteractiveOauth::test_local_interactives_nvidia_gpu > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1 env: GRPC_DNS_RESOLVER: "native" diff --git a/docs/sphinx/user-docs/cluster-configuration.rst b/docs/sphinx/user-docs/cluster-configuration.rst index 7ca871e7..0dc54930 100644 --- a/docs/sphinx/user-docs/cluster-configuration.rst +++ b/docs/sphinx/user-docs/cluster-configuration.rst @@ -44,6 +44,21 @@ requirements for creating the Ray Cluster. documentation on building a custom image `here `__. +Ray Usage Statistics +-------------------- + +By default, Ray usage statistics collection is disabled in CodeFlare SDK clusters. This stops statistics from being sent to Anyscale. If you want to enable usage statistics collection, you can set the ``RAY_USAGE_STATS_ENABLED`` environment variable to ``1`` in your cluster configuration: + +.. 
code:: python + + from codeflare_sdk import Cluster, ClusterConfiguration + + cluster = Cluster(ClusterConfiguration( + name='ray-example', + namespace='default', + envs={'RAY_USAGE_STATS_ENABLED': '1'} # Enable usage statistics + )) + The ``labels={"exampleLabel": "example"}`` parameter can be used to apply additional labels to the RayCluster resource. diff --git a/src/codeflare_sdk/ray/cluster/config.py b/src/codeflare_sdk/ray/cluster/config.py index 4f646baa..8ea97d55 100644 --- a/src/codeflare_sdk/ray/cluster/config.py +++ b/src/codeflare_sdk/ray/cluster/config.py @@ -161,6 +161,10 @@ def __post_init__(self): "Warning: TLS verification has been disabled - Endpoint checks will be bypassed" ) + # Set default environment variable to disable Ray usage stats if not already set + if "RAY_USAGE_STATS_ENABLED" not in self.envs: + self.envs["RAY_USAGE_STATS_ENABLED"] = "0" + if self.enable_gcs_ft: if not self.redis_address: raise ValueError( diff --git a/src/codeflare_sdk/ray/cluster/test_cluster.py b/src/codeflare_sdk/ray/cluster/test_cluster.py index 298c416e..ce684607 100644 --- a/src/codeflare_sdk/ray/cluster/test_cluster.py +++ b/src/codeflare_sdk/ray/cluster/test_cluster.py @@ -465,11 +465,10 @@ def test_get_cluster_no_appwrapper(mocker): return_value=expected_rc, ) get_cluster("test-all-params", "ns", write_to_file=True) - assert filecmp.cmp( - f"{aw_dir}test-all-params.yaml", - f"{expected_clusters_dir}/ray/unit-test-all-params.yaml", - shallow=True, - ) + + with open(f"{aw_dir}test-all-params.yaml") as f: + generated_rc = yaml.load(f, Loader=yaml.FullLoader) + assert generated_rc == expected_rc def test_get_cluster_with_appwrapper(mocker): @@ -487,11 +486,10 @@ def test_get_cluster_with_appwrapper(mocker): return_value=expected_aw, ) get_cluster("aw-all-params", "ns", write_to_file=True) - assert filecmp.cmp( - f"{aw_dir}aw-all-params.yaml", - f"{expected_clusters_dir}/appwrapper/unit-test-all-params.yaml", - shallow=True, - ) + + with 
open(f"{aw_dir}aw-all-params.yaml") as f: + generated_aw = yaml.load(f, Loader=yaml.FullLoader) + assert generated_aw == expected_aw def test_wait_ready(mocker, capsys): diff --git a/src/codeflare_sdk/ray/cluster/test_config.py b/src/codeflare_sdk/ray/cluster/test_config.py index 6007f60b..6c990c19 100644 --- a/src/codeflare_sdk/ray/cluster/test_config.py +++ b/src/codeflare_sdk/ray/cluster/test_config.py @@ -24,6 +24,7 @@ import filecmp import pytest import os +import yaml parent = Path(__file__).resolve().parents[4] # project directory expected_clusters_dir = f"{parent}/tests/test_cluster_yamls" @@ -85,7 +86,11 @@ def test_config_creation_all_parameters(mocker): assert cluster.config.worker_memory_requests == "12G" assert cluster.config.worker_memory_limits == "16G" assert cluster.config.appwrapper == False - assert cluster.config.envs == {"key1": "value1", "key2": "value2"} + assert cluster.config.envs == { + "key1": "value1", + "key2": "value2", + "RAY_USAGE_STATS_ENABLED": "0", + } assert cluster.config.image == "example/ray:tag" assert cluster.config.image_pull_secrets == ["secret1", "secret2"] assert cluster.config.write_to_file == True @@ -206,6 +211,46 @@ def test_gcs_fault_tolerance_config_validation(): ) +def test_ray_usage_stats_default(mocker): + mocker.patch("kubernetes.client.ApisApi.get_api_versions") + mocker.patch("kubernetes.client.CustomObjectsApi.list_namespaced_custom_object") + + cluster = Cluster( + ClusterConfiguration(name="default-usage-stats-cluster", namespace="ns") + ) + + # Verify that usage stats are disabled by default + assert cluster.config.envs["RAY_USAGE_STATS_ENABLED"] == "0" + + # Check that the environment variable is set in the YAML + head_container = cluster.resource_yaml["spec"]["headGroupSpec"]["template"]["spec"][ + "containers" + ][0] + env_vars = {env["name"]: env["value"] for env in head_container["env"]} + assert env_vars["RAY_USAGE_STATS_ENABLED"] == "0" + + +def test_ray_usage_stats_enabled(mocker): + 
mocker.patch("kubernetes.client.ApisApi.get_api_versions") + mocker.patch("kubernetes.client.CustomObjectsApi.list_namespaced_custom_object") + + cluster = Cluster( + ClusterConfiguration( + name="usage-stats-enabled-cluster", + namespace="ns", + envs={"RAY_USAGE_STATS_ENABLED": "1"}, + ) + ) + + assert cluster.config.envs["RAY_USAGE_STATS_ENABLED"] == "1" + + head_container = cluster.resource_yaml["spec"]["headGroupSpec"]["template"]["spec"][ + "containers" + ][0] + env_vars = {env["name"]: env["value"] for env in head_container["env"]} + assert env_vars["RAY_USAGE_STATS_ENABLED"] == "1" + + # Make sure to always keep this function last def test_cleanup(): os.remove(f"{aw_dir}test-all-params.yaml") diff --git a/tests/e2e/local_interactive_sdk_kind_test.py b/tests/e2e/local_interactive_sdk_kind_test.py index c20fd879..a6fcfa5f 100644 --- a/tests/e2e/local_interactive_sdk_kind_test.py +++ b/tests/e2e/local_interactive_sdk_kind_test.py @@ -48,6 +48,7 @@ def run_local_interactives( head_cpu_limits="500m", head_memory_requests=2, head_memory_limits=2, + head_extended_resource_requests={gpu_resource_name: 0}, worker_cpu_requests="500m", worker_cpu_limits=1, worker_memory_requests=1, @@ -68,8 +69,8 @@ def run_local_interactives( ray.shutdown() ray.init(address=cluster.local_client_url(), logging_level="DEBUG") - @ray.remote(num_gpus=number_of_gpus / 2) - def heavy_calculation_part(num_iterations): + @ray.remote(num_gpus=number_of_gpus) + def heavy_calculation(num_iterations): result = 0.0 for i in range(num_iterations): for j in range(num_iterations): @@ -77,13 +78,6 @@ def heavy_calculation_part(num_iterations): result += math.sin(i) * math.cos(j) * math.tan(k) return result - @ray.remote(num_gpus=number_of_gpus / 2) - def heavy_calculation(num_iterations): - results = ray.get( - [heavy_calculation_part.remote(num_iterations // 30) for _ in range(30)] - ) - return sum(results) - ref = heavy_calculation.remote(3000) result = ray.get(ref) assert result == 
1789.4644387076714 diff --git a/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml b/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml index f1b75410..af5acbad 100644 --- a/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml +++ b/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml @@ -54,6 +54,8 @@ spec: value: value1 - name: key2 value: value2 + - name: RAY_USAGE_STATS_ENABLED + value: '0' image: example/ray:tag imagePullPolicy: Always lifecycle: @@ -159,6 +161,8 @@ spec: value: value1 - name: key2 value: value2 + - name: RAY_USAGE_STATS_ENABLED + value: '0' image: example/ray:tag imagePullPolicy: Always lifecycle: diff --git a/tests/test_cluster_yamls/kueue/aw_kueue.yaml b/tests/test_cluster_yamls/kueue/aw_kueue.yaml index fd78f070..7101f6a8 100644 --- a/tests/test_cluster_yamls/kueue/aw_kueue.yaml +++ b/tests/test_cluster_yamls/kueue/aw_kueue.yaml @@ -75,6 +75,9 @@ spec: - mountPath: /etc/ssl/certs/odh-ca-bundle.crt name: odh-ca-cert subPath: odh-ca-bundle.crt + env: + - name: RAY_USAGE_STATS_ENABLED + value: '0' volumes: - configMap: items: @@ -133,6 +136,9 @@ spec: - mountPath: /etc/ssl/certs/odh-ca-bundle.crt name: odh-ca-cert subPath: odh-ca-bundle.crt + env: + - name: RAY_USAGE_STATS_ENABLED + value: '0' volumes: - configMap: items: diff --git a/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml b/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml index a6dd81d7..f8b3aa46 100644 --- a/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml +++ b/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml @@ -75,6 +75,9 @@ spec: - mountPath: /etc/ssl/certs/odh-ca-bundle.crt name: odh-ca-cert subPath: odh-ca-bundle.crt + env: + - name: RAY_USAGE_STATS_ENABLED + value: '0' volumes: - configMap: items: @@ -133,6 +136,9 @@ spec: - mountPath: /etc/ssl/certs/odh-ca-bundle.crt name: odh-ca-cert subPath: odh-ca-bundle.crt + env: + - name: RAY_USAGE_STATS_ENABLED + value: '0' volumes: - configMap: items: diff --git 
a/tests/test_cluster_yamls/ray/default-appwrapper.yaml b/tests/test_cluster_yamls/ray/default-appwrapper.yaml index 6d1cdcd5..1532c0e8 100644 --- a/tests/test_cluster_yamls/ray/default-appwrapper.yaml +++ b/tests/test_cluster_yamls/ray/default-appwrapper.yaml @@ -53,6 +53,9 @@ spec: name: dashboard - containerPort: 10001 name: client + env: + - name: RAY_USAGE_STATS_ENABLED + value: '0' resources: limits: cpu: 2 @@ -111,6 +114,9 @@ spec: - -c - ray stop name: machine-learning + env: + - name: RAY_USAGE_STATS_ENABLED + value: '0' resources: limits: cpu: 1 diff --git a/tests/test_cluster_yamls/ray/default-ray-cluster.yaml b/tests/test_cluster_yamls/ray/default-ray-cluster.yaml index 38e02f8f..db401026 100644 --- a/tests/test_cluster_yamls/ray/default-ray-cluster.yaml +++ b/tests/test_cluster_yamls/ray/default-ray-cluster.yaml @@ -45,6 +45,9 @@ spec: name: dashboard - containerPort: 10001 name: client + env: + - name: RAY_USAGE_STATS_ENABLED + value: '0' resources: limits: cpu: 2 @@ -110,6 +113,9 @@ spec: requests: cpu: 1 memory: 2G + env: + - name: RAY_USAGE_STATS_ENABLED + value: '0' volumeMounts: - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert diff --git a/tests/test_cluster_yamls/ray/unit-test-all-params.yaml b/tests/test_cluster_yamls/ray/unit-test-all-params.yaml index d5d8059d..6900b058 100644 --- a/tests/test_cluster_yamls/ray/unit-test-all-params.yaml +++ b/tests/test_cluster_yamls/ray/unit-test-all-params.yaml @@ -45,6 +45,8 @@ spec: value: value1 - name: key2 value: value2 + - name: RAY_USAGE_STATS_ENABLED + value: '0' image: example/ray:tag imagePullPolicy: Always lifecycle: @@ -150,6 +152,8 @@ spec: value: value1 - name: key2 value: value2 + - name: RAY_USAGE_STATS_ENABLED + value: '0' image: example/ray:tag imagePullPolicy: Always lifecycle: