Dev mode (#637)

ArneTR · web-flow · commit 807cdd412e4d · 2024-01-01T12:47:42.000+01:00
* Refactoring for error messages and security fix for path echoing

* Added dev_no_sleeps, dev_no_metrics, dev_no_build

* Added healthcheck

* Normalization and extra error check

* Tests; Improved test speed by 6x

* Forcing no_build = False

* Healthcheck now handles list-form tests in a conformant way and is stricter about the health status string

* Test-Fix

* Test-Fix
diff --git a/lib/schema_checker.py b/lib/schema_checker.py
@@ -99,6 +99,15 @@ def check_usage_scenario(self, usage_scenario):
                     Optional("environment"): self.single_or_list(Or(dict,str)),
                     Optional("ports"): self.single_or_list(Or(str, int)),
                     Optional("depends_on"): Or([str],dict),
+                    Optional("healthcheck"): {
+                        Optional('test'): Or(list, str),
+                        Optional('interval'): str,
+                        Optional('timeout'): str,
+                        Optional('retries'): int,
+                        Optional('start_period'): str,
+                        # Optional('start_interval'): str, docker CLI does not support this atm
+                        Optional('disable'): bool,
+                    },
                     Optional("setup-commands"): [str],
                     Optional("volumes"): self.single_or_list(str),
                     Optional("folder-destination"):str,
diff --git a/metric_providers/psu/energy/ac/xgboost/machine/model b/metric_providers/psu/energy/ac/xgboost/machine/model
@@ -1 +1 @@
-Subproject commit 16f45ee04d57442544422097179e20fb0a420665
+Subproject commit e42affa1765ff350b64c800c4115fa802fd3e9b2
diff --git a/runner.py b/runner.py
@@ -89,7 +89,7 @@ def __init__(self,
         name, uri, uri_type, filename='usage_scenario.yml', branch=None,
         debug_mode=False, allow_unsafe=False, no_file_cleanup=False, skip_system_checks=False,
         skip_unsafe=False, verbose_provider_boot=False, full_docker_prune=False,
-        dry_run=False, dev_repeat_run=False, docker_prune=False, job_id=None):
+        dev_no_sleeps=False, dev_no_build=False, dev_no_metrics=False, docker_prune=False, job_id=None):
 
         if skip_unsafe is True and allow_unsafe is True:
             raise RuntimeError('Cannot specify both --skip-unsafe and --allow-unsafe')
@@ -104,8 +104,9 @@ def __init__(self,
         self._verbose_provider_boot = verbose_provider_boot
         self._full_docker_prune = full_docker_prune
         self._docker_prune = docker_prune
-        self._dry_run = dry_run
-        self._dev_repeat_run = dev_repeat_run
+        self._dev_no_sleeps = dev_no_sleeps
+        self._dev_no_build = dev_no_build
+        self._dev_no_metrics = dev_no_metrics
         self._uri = uri
         self._uri_type = uri_type
         self._original_filename = filename
@@ -145,7 +146,7 @@ def __init__(self,
         # self.__filename = self._original_filename # this can be changed later if working directory changes
 
     def custom_sleep(self, sleep_time):
-        if not self._dry_run:
+        if not self._dev_no_sleeps:
             print(TerminalColors.HEADER, '\nSleeping for : ', sleep_time, TerminalColors.ENDC)
             time.sleep(sleep_time)
 
@@ -387,13 +388,13 @@ def check_running_containers(self):
     def populate_image_names(self):
         for service_name, service in self._usage_scenario.get('services', {}).items():
             if not service.get('image', None): # image is a non-mandatory field. But we need it, so we tmp it
-                if self._dev_repeat_run:
+                if self._dev_no_build:
                     service['image'] = f"{service_name}"
                 else:
                     service['image'] = f"{service_name}_{random.randint(500000,10000000)}"
 
     def remove_docker_images(self):
-        if self._dev_repeat_run:
+        if self._dev_no_build:
             return
 
         print(TerminalColors.HEADER, '\nRemoving all temporary GMT images', TerminalColors.ENDC)
@@ -477,6 +478,10 @@ def update_and_insert_specs(self):
         )
 
     def import_metric_providers(self):
+        if self._dev_no_metrics:
+            print(TerminalColors.HEADER, '\nSkipping import of metric providers', TerminalColors.ENDC)
+            return
+
         config = GlobalConfig().config
 
         print(TerminalColors.HEADER, '\nImporting metric providers', TerminalColors.ENDC)
@@ -520,6 +525,10 @@ def import_metric_providers(self):
         self.__metric_providers.sort(key=lambda item: 'rapl' not in item.__class__.__name__.lower())
 
     def download_dependencies(self):
+        if self._dev_no_build:
+            print(TerminalColors.HEADER, '\nSkipping downloading dependencies', TerminalColors.ENDC)
+            return
+
         print(TerminalColors.HEADER, '\nDownloading dependencies', TerminalColors.ENDC)
         subprocess.run(['docker', 'pull', 'gcr.io/kaniko-project/executor:latest'], check=True)
 
@@ -567,6 +576,7 @@ def build_docker_images(self):
                                          encoding='UTF-8',
                                          check=True)
                 # The image exists so exit and don't build
+                print(f"Image {service['image']} exists in build cache. Skipping build ...")
                 continue
             except subprocess.CalledProcessError:
                 pass
@@ -656,10 +666,11 @@ def order_service_names(service_name, visited=None):
                 raise RuntimeError(f"Cycle found in depends_on definition with service '{service_name}'!")
             visited.add(service_name)
 
+            if service_name not in services:
+                raise RuntimeError(f"Dependent service '{service_name}' defined in 'depends_on' does not exist in usage_scenario!")
+
             service = services[service_name]
             if 'depends_on' in service:
-                if isinstance(service['depends_on'], dict):
-                    raise RuntimeError(f"Service definition of {service_name} uses the long form of 'depends_on', however, GMT only supports the short form!")
                 for dep in service['depends_on']:
                     if dep not in names_ordered:
                         order_service_names(dep, visited)
@@ -834,6 +845,35 @@ def setup_services(self):
             if 'pause-after-phase' in service:
                 self.__services_to_pause_phase[service['pause-after-phase']] = self.__services_to_pause_phase.get(service['pause-after-phase'], []) + [container_name]
 
+            if 'healthcheck' in service:  # must come last
+                if 'disable' in service['healthcheck'] and service['healthcheck']['disable'] is True:
+                    docker_run_string.append('--no-healthcheck')
+                else:
+                    if 'test' in service['healthcheck']:
+                        docker_run_string.append('--health-cmd')
+                        health_string = service['healthcheck']['test']
+                        if isinstance(service['healthcheck']['test'], list):
+                            health_string_copy = service['healthcheck']['test'].copy()
+                            health_string_command = health_string_copy.pop(0)
+                            if health_string_command not in ['CMD', 'CMD-SHELL']:
+                                raise RuntimeError(f"Healthcheck starts with {health_string_command}. Please use 'CMD' or 'CMD-SHELL' when supplying as list. For disabling do not use 'NONE' but the disable argument.")
+                            health_string = ' '.join(health_string_copy)
+                        docker_run_string.append(health_string)
+                    if 'interval' in service['healthcheck']:
+                        docker_run_string.append('--health-interval')
+                        docker_run_string.append(service['healthcheck']['interval'])
+                    if 'timeout' in service['healthcheck']:
+                        docker_run_string.append('--health-timeout')
+                        docker_run_string.append(service['healthcheck']['timeout'])
+                    if 'retries' in service['healthcheck']:
+                        docker_run_string.append('--health-retries')
+                        docker_run_string.append(service['healthcheck']['retries'])
+                    if 'start_period' in service['healthcheck']:
+                        docker_run_string.append('--health-start-period')
+                        docker_run_string.append(service['healthcheck']['start_period'])
+                    if 'start_interval' in service['healthcheck']:
+                        raise RuntimeError('start_interval is not supported atm in healthcheck')
+
             docker_run_string.append(self.clean_image_name(service['image']))
 
             # Before starting the container, check if the dependent containers are "ready".
@@ -842,29 +882,55 @@ def setup_services(self):
             # In the future we want to implement an health check to know if dependent containers are actually ready.
             if 'depends_on' in service:
                 for dependent_container in service['depends_on']:
+                    print(f"Waiting for dependent container {dependent_container}")
                     time_waited = 0
-                    state = ""
+                    state = ''
+                    health = 'healthy' # default because some containers have no health
                     max_waiting_time = config['measurement']['boot']['wait_time_dependencies']
                     while time_waited < max_waiting_time:
-                        # TODO: Check health status instead if `healthcheck` is enabled (https://github.com/green-coding-berlin/green-metrics-tool/issues/423)
-                        # This waiting loop is actually a pre-work for the upcoming health check. For the check if the container is "running", as implemented here, the waiting loop is not needed.
                         status_output = subprocess.check_output(
                             ["docker", "container", "inspect", "-f", "{{.State.Status}}", dependent_container],
                             stderr=subprocess.STDOUT,
-                            encoding='utf-8'
+                            encoding='UTF-8',
                         )
 
                         state = status_output.strip()
-
-                        if state == "running":
+                        print(f"State of container '{dependent_container}': {state}")
+
+                        if isinstance(service['depends_on'], dict) \
+                            and 'condition' in service['depends_on'][dependent_container]:
+
+                            condition = service['depends_on'][dependent_container]['condition']
+                            if condition == 'service_healthy':
+                                ps = subprocess.run(
+                                    ["docker", "container", "inspect", "-f", "{{.State.Health.Status}}", dependent_container],
+                                    check=False,
+                                    stdout=subprocess.PIPE,
+                                    stderr=subprocess.STDOUT, # put both in one stream
+                                    encoding='UTF-8'
+                                )
+                                health = ps.stdout.strip()
+                                if ps.returncode != 0 or health == '<nil>':
+                                    raise RuntimeError(f"Health check for dependent_container '{dependent_container}' was requested, but container has no healthcheck implemented! (Output was: {health})")
+                                if health == 'unhealthy':
+                                    raise RuntimeError('ontainer healthcheck failed terminally with status "unhealthy")')
+                                print(f"Health of container '{dependent_container}': {health}")
+                            elif condition == 'service_started':
+                                pass
+                            else:
+                                raise RuntimeError(f"Unsupported condition in healthcheck for service '{service_name}':  {condition}")
+
+                        if state == 'running' and health == 'healthy':
                             break
 
-                        print(f"State of container '{dependent_container}': {state}. Waiting for 1 second")
-                        self.custom_sleep(1)
+                        print('Waiting for 1 second')
+                        time.sleep(1)
                         time_waited += 1
 
-                    if state != "running":
-                        raise RuntimeError(f"Dependent container '{dependent_container}' of '{container_name}' is not running after waiting for {time_waited} sec! Consider checking your service configuration, the entrypoint of the container or the logs of the container.")
+                    if state != 'running':
+                        raise RuntimeError(f"Dependent container '{dependent_container}' of '{container_name}' is not running but {state} after waiting for {time_waited} sec! Consider checking your service configuration, the entrypoint of the container or the logs of the container.")
+                    if health != 'healthy':
+                        raise RuntimeError(f"Dependent container '{dependent_container}' of '{container_name}' is not healthy but '{health}' after waiting for {time_waited} sec! Consider checking your service configuration, the entrypoint of the container or the logs of the container.")
 
             if 'command' in service:  # must come last
                 for cmd in service['command'].split():
@@ -946,6 +1012,9 @@ def add_to_log(self, container_name, message, cmd=''):
 
 
     def start_metric_providers(self, allow_container=True, allow_other=True):
+        if self._dev_no_metrics:
+            return
+
         print(TerminalColors.HEADER, '\nStarting metric providers', TerminalColors.ENDC)
 
         for metric_provider in self.__metric_providers:
@@ -1099,6 +1168,9 @@ def run_flows(self):
 
     # this function should never be called twice to avoid double logging of metrics
     def stop_metric_providers(self):
+        if self._dev_no_metrics:
+            return
+
         print(TerminalColors.HEADER, 'Stopping metric providers and parsing measurements', TerminalColors.ENDC)
         errors = []
         for metric_provider in self.__metric_providers:
@@ -1454,8 +1526,9 @@ def run(self):
     parser.add_argument('--verbose-provider-boot', action='store_true', help='Boot metric providers gradually')
     parser.add_argument('--full-docker-prune', action='store_true', help='Stop and remove all containers, build caches, volumes and images on the system')
     parser.add_argument('--docker-prune', action='store_true', help='Prune all unassociated build caches, networks volumes and stopped containers on the system')
-    parser.add_argument('--dry-run', action='store_true', help='Removes all sleeps. Resulting measurement data will be skewed.')
-    parser.add_argument('--dev-repeat-run', action='store_true', help='Checks if a docker image is already in the local cache and will then not build it. Also doesn\'t clear the images after a run')
+    parser.add_argument('--dev-no-metrics', action='store_true', help='Skips loading the metric providers. Runs will be faster, but you will have no metric')
+    parser.add_argument('--dev-no-sleeps', action='store_true', help='Removes all sleeps. Resulting measurement data will be skewed.')
+    parser.add_argument('--dev-no-build', action='store_true', help='Checks if a container images are already in the local cache and will then not build it. Also doesn\'t clear the images after a run. Please note that skipping builds only works the second time you make a run.')
     parser.add_argument('--print-logs', action='store_true', help='Prints the container and process logs to stdout')
 
     args = parser.parse_args()
@@ -1470,9 +1543,9 @@ def run(self):
         error_helpers.log_error('--allow-unsafe and skip--unsafe in conjuction is not possible')
         sys.exit(1)
 
-    if args.dev_repeat_run and (args.docker_prune or args.full_docker_prune):
+    if args.dev_no_build and (args.docker_prune or args.full_docker_prune):
         parser.print_help()
-        error_helpers.log_error('--dev-repeat-run blocks pruning docker images. Combination is not allowed')
+        error_helpers.log_error('--dev-no-build blocks pruning docker images. Combination is not allowed')
         sys.exit(1)
 
     if args.full_docker_prune and GlobalConfig().config['postgresql']['host'] == 'green-coding-postgres-container':
@@ -1515,8 +1588,8 @@ def run(self):
                     branch=args.branch, debug_mode=args.debug, allow_unsafe=args.allow_unsafe,
                     no_file_cleanup=args.no_file_cleanup, skip_system_checks=args.skip_system_checks,
                     skip_unsafe=args.skip_unsafe,verbose_provider_boot=args.verbose_provider_boot,
-                    full_docker_prune=args.full_docker_prune, dry_run=args.dry_run,
-                    dev_repeat_run=args.dev_repeat_run, docker_prune=args.docker_prune)
+                    full_docker_prune=args.full_docker_prune, dev_no_sleeps=args.dev_no_sleeps,
+                    dev_no_build=args.dev_no_build, dev_no_metrics=args.dev_no_metrics, docker_prune=args.docker_prune)
 
     # Using a very broad exception makes sense in this case as we have excepted all the specific ones before
     #pylint: disable=broad-except
diff --git a/tests/data/usage_scenarios/depends_on_error_unsupported_condition.yml b/tests/data/usage_scenarios/depends_on_error_unsupported_condition.yml
@@ -0,0 +1,20 @@
+---
+name: Test depends_on
+author: Arne Tarara <arne@green-coding.berlin>
+description: test
+
+services:
+  test-container-1:
+    image: alpine
+    depends_on:
+      test-container-2:
+        condition: service_completed_successfully
+  test-container-2:
+    image: alpine
+
+flow:
+  - name: dummy
+    container: test-container-1
+    commands:
+      - type: console
+        command: pwd
diff --git a/tests/data/usage_scenarios/depends_on_long_form.yml b/tests/data/usage_scenarios/depends_on_long_form.yml
@@ -1,5 +1,5 @@
 ---
-name: Test depends_on
+name: Test depends_on long_form
 author: David Kopp
 description: test
 
diff --git a/tests/data/usage_scenarios/healthcheck.yml b/tests/data/usage_scenarios/healthcheck.yml
@@ -0,0 +1,23 @@
+---
+name: Test depends_on
+author: David Kopp
+description: test
+
+services:
+  test-container-1:
+    image: alpine
+    depends_on:
+      test-container-2:
+        condition: service_healthy
+  test-container-2:
+    image: alpine
+    healthcheck:
+      test: ls
+      interval: 1s
+
+flow:
+  - name: dummy
+    container: test-container-1
+    commands:
+      - type: console
+        command: pwd
diff --git a/tests/data/usage_scenarios/healthcheck_error_missing.yml b/tests/data/usage_scenarios/healthcheck_error_missing.yml
@@ -0,0 +1,20 @@
+---
+name: Test depends_on
+author: David Kopp
+description: test
+
+services:
+  test-container-1:
+    image: alpine
+    depends_on:
+      test-container-2:
+        condition: service_healthy
+  test-container-2:
+    image: alpine
+
+flow:
+  - name: dummy
+    container: test-container-1
+    commands:
+      - type: console
+        command: pwd
diff --git a/tests/smoke_test.py b/tests/smoke_test.py
@@ -39,12 +39,11 @@ def setup_module(module):
     err = io.StringIO()
     GlobalConfig(config_name='test-config.yml').config
     with redirect_stdout(out), redirect_stderr(err):
-        uri = os.path.abspath(os.path.join(
-            CURRENT_DIR, 'stress-application/'))
+        uri = os.path.abspath(os.path.join(CURRENT_DIR, 'stress-application/'))
         subprocess.run(['docker', 'compose', '-f', uri+'/compose.yml', 'build'], check=True)
 
         # Run the application
-        runner = Runner(name=RUN_NAME, uri=uri, uri_type='folder', dev_repeat_run=True, skip_system_checks=False)
+        runner = Runner(name=RUN_NAME, uri=uri, uri_type='folder', dev_no_build=True, dev_no_sleeps=True, dev_no_metrics=False, skip_system_checks=False)
         runner.run()
 
     #pylint: disable=global-statement
diff --git a/tests/test_functions.py b/tests/test_functions.py
diff --git a/tests/test_runner.py b/tests/test_runner.py
diff --git a/tests/test_usage_scenario.py b/tests/test_usage_scenario.py
diff --git a/tests/test_volume_loading.py b/tests/test_volume_loading.py