winmssql scale support

ebattat · ebattat · commit 8d547905008a · 2026-03-05T11:29:15.000+02:00
diff --git a/benchmark_runner/common/template_operations/templates/winmssql/windows_benchmark_runner/analyze_windows_hammerdb.py b/benchmark_runner/common/template_operations/templates/winmssql/windows_benchmark_runner/analyze_windows_hammerdb.py
@@ -23,17 +23,15 @@ def get_json_files(self, hammerdb_results_path: str) -> list[str]:
         path = Path(hammerdb_results_path)
         return [str(file) for file in path.iterdir() if file.is_file() and file.suffix == '.json']
 
-    import re
-
     def get_tpm_per_worker(self, json_files):
         """
-        Extract the maximum TPM per worker from a list of JSON files.
+        Extract the average TPM per worker from a list of JSON files.
         Args:
             json_files (list[str]): List of JSON file paths containing HammerDB results.
         Returns:
-            dict[int, int]: Dictionary mapping worker ID to its maximum TPM.
+            dict[int, int]: Dictionary mapping worker ID to its average TPM (rounded to int).
         """
-        results = {}
+        results = {}  # worker_id -> list of TPM values
         for json_file in json_files:
             logger.info(f"Analyzing: {json_file}")
             try:
@@ -43,27 +41,29 @@ def get_tpm_per_worker(self, json_files):
                     raise ValueError(f"Cannot extract worker ID from filename: {json_file}")
                 current_worker = int(match.group(1))
 
-                current_max_tpm = self.extract_max_tpm(json_file)
+                current_avg_tpm = self.extract_avg_tpm(json_file)
 
-                if results.get(current_worker):
-                    if current_max_tpm > results.get(current_worker):
-                        results[current_worker] = current_max_tpm
-                else:
-                    results[current_worker] = current_max_tpm
+                if current_worker not in results:
+                    results[current_worker] = []
+                results[current_worker].append(current_avg_tpm)
 
             except Exception as e:
                 logger.error(f"Skipping file due to error: {json_file} -> {e}")
 
-        return results
+        # Convert lists to average TPM per worker (same format as before: int values)
+        return {
+            worker: int(round(sum(tpms) / len(tpms))) if tpms else 0
+            for worker, tpms in results.items()
+        }
 
-    def extract_max_tpm(self, json_file):
+    def extract_avg_tpm(self, json_file):
         """
-        Extract the maximum TPM value from a single JSON file.
+        Extract the average TPM value from a single JSON file.
         Handles Windows and Linux line endings and different encodings.
         Args:
             json_file (str): Path to the JSON file containing HammerDB results.
         Returns:
-            int: Maximum TPM value found in the file.
+            float: Average TPM value found in the file.
         """
         # Detect possible encoding
         try:
@@ -93,16 +93,18 @@ def extract_max_tpm(self, json_file):
         tpm_dict = data["MSSQLServer tpm"]
 
         # Convert values to integers
-        tpm_values = [int(v) for v in tpm_dict.values()]
-        return max(tpm_values)
+        tpm_values = [int(v) for v in tpm_dict.values() if int(v) > 0]
+        if not tpm_values:
+            return 0
+        return sum(tpm_values) / len(tpm_values)
 
     def hammerdb_results_for_elasticsearch(self, hammerdb_results: dict, output_file: str) -> list[dict]:
         """
         Prepare HammerDB results in a format suitable for Elasticsearch
         and write them to a JSON file.
 
         Args:
-            hammerdb_results (dict): Dictionary of worker_id -> max TPM
+            hammerdb_results (dict): Dictionary of worker_id -> average TPM
             output_file (str): Path to output JSON file (default: hammerdb_result.json)
 
         Returns:
diff --git a/benchmark_runner/common/template_operations/templates/winmssql/windows_benchmark_runner/elasticsearch_uploader.py b/benchmark_runner/common/template_operations/templates/winmssql/windows_benchmark_runner/elasticsearch_uploader.py
@@ -86,9 +86,17 @@ def upload_to_elasticsearch(self, index: str, data: dict) -> list[str]:
             for row in data:
                 # Update row with uuid
                 uuid = str(uuid4())
-                data.update({'uuid': uuid, 'status': 'Succeeded'})
-                response = self._elasticsearch.upload_to_elasticsearch(index=index, data=data, timestamp=datetime.now(timezone.utc)-timedelta(hours=8))
+                row.update({'uuid': uuid, 'status': 'Succeeded'})
+                response = self._elasticsearch.upload_to_elasticsearch(index=index, data=row, timestamp=datetime.now(timezone.utc)-timedelta(hours=8))
                 uuids.append(uuid)
+                # Log success
+                logger.info(f"Uploaded to Elasticsearch index '{index}', response: {response}")
+            # Upload a marker document recording that this VM's data was successfully uploaded to Elasticsearch
+            uuid = str(uuid4())
+            response = self._elasticsearch.upload_to_elasticsearch(index=index, data={'uuid': uuid, 'vm': 'Succeeded', 'vm_os_version': 'winmssql2022'},
+                                                                   timestamp=datetime.now(timezone.utc) - timedelta(
+                                                                       hours=8))
+            uuids.append(uuid)
             # Log success
             logger.info(f"Uploaded to Elasticsearch index '{index}', response: {response}")
             return uuids
diff --git a/benchmark_runner/workloads/bootstorm_vm.py b/benchmark_runner/workloads/bootstorm_vm.py
@@ -426,6 +426,9 @@ def run_vm_workload(self):
                 # Create VMs
                 if self._create_vms_only:
                     steps = (self._create_vm_scale, )
+                elif self._only_delete_all:
+                    steps = (self._stop_vm_scale,
+                             self._wait_for_stop_vm_scale,self._delete_vm_scale, self._wait_for_delete_vm_scale)
                 # Run VMs without deleting
                 elif not self._delete_all:
                     steps = (self._create_vm_scale, self._run_vm_scale)
diff --git a/benchmark_runner/workloads/winmssql_vm.py b/benchmark_runner/workloads/winmssql_vm.py
@@ -1,6 +1,7 @@
 
 import os
 import time
+from datetime import datetime, timezone, timedelta
 
 from benchmark_runner.common.logger.logger_time_stamp import logger_time_stamp, logger
 from benchmark_runner.common.elasticsearch.elasticsearch_exceptions import ElasticSearchDataNotUploaded
@@ -18,36 +19,57 @@ def __init__(self):
         if not self._windows_url:
             raise ValueError('Missing Windows DV URL')
 
-    def wait_for_windows_hammerdb_finished(self):
+    def wait_for_windows_hammerdb_finished(self, vm_nums: int = 1):
         """
-        Wait until the Windows HammerDB workload finishes by checking the 'status' key in Elasticsearch
+        Wait until the expected number of Windows HammerDB VMs finish by counting documents whose 'vm' key is 'Succeeded' in Elasticsearch
         and verifying that data is not already updated by checking key data_updated.
+
+        Args:
+            vm_nums: Expected number of VMs to complete (for scale support)
+
         Returns:
             True if the workload succeeded.
+
         Raises:
             Windows_HammerDB_NOT_Succeeded: If the workload did not succeed within the timeout.
         """
         current_wait_time = 0
 
         while True:
-            response = self._get_latest_resource_with_key(index=self._es_index, key='status')
-            # Verify that winmssl elasticsearch data is uploaded by checking 'status', 'vm_os_version'
-            # Checking that this is the latest data by verify that 'data_updated' is not True
-            if response.get('status') == 'Succeeded' and response.get('vm_os_version') == 'winmssql2022' and str(response.get('data_updated', '')).lower() != 'true':
+            # Get ALL documents in time window
+            current_datetime = datetime.now(timezone.utc)
+            start_datetime = current_datetime - timedelta(hours=1)
+            end_datetime = current_datetime + timedelta(hours=1)
+
+            es_data = self._es_operations.get_index_data_between_dates(
+                index=self._es_index,
+                start_datetime=start_datetime,
+                end_datetime=end_datetime
+            )
+
+            # Count documents with 'vm': 'Succeeded', 'vm_os_version': 'winmssql2022' and data_updated != True
+            succeeded_count = sum(
+                1 for doc in es_data
+                if doc.get('_source', {}).get('vm') == 'Succeeded'
+                and doc.get('_source', {}).get('vm_os_version') == 'winmssql2022'
+                and str(doc.get('_source', {}).get('data_updated', '')).lower() != 'true'
+            )
+
+            logger.info(f'Found {succeeded_count}/{vm_nums} successful VMs')
+
+            if succeeded_count >= vm_nums:
                 return True
-            else:
-                logger.info('Waiting for the Windows HammerDB run to finish successfully...')
 
-            # check timeout
+            # Check timeout
             if self._timeout > 0 and current_wait_time >= self._timeout:
                 break
 
-            # sleep before next check
+            # Sleep before next check
             time.sleep(OC.DELAY)
             current_wait_time += OC.DELAY
 
         raise Windows_HammerDB_NOT_Succeeded(
-            f"HammerDB did not succeed within {self._timeout} seconds"
+            f"Only {succeeded_count}/{vm_nums} VMs completed HammerDB successfully within {self._timeout} seconds"
         )
 
     @logger_time_stamp
@@ -62,32 +84,47 @@ def run(self):
             else:
                 self._es_index = 'hammerdb-results'
             self._initialize_run()
-            # create windows dv
-            self._oc.create_async(yaml=os.path.join(f'{self._run_artifacts_path}', 'windows_dv.yaml'))
-            self._oc.wait_for_dv_status(status='Succeeded')
-            self._oc.create_async(yaml=os.path.join(f'{self._run_artifacts_path}', f'{self._name}.yaml'))
-            self._oc.wait_for_vm_status(vm_name=f'{self._workload_name}-{self._trunc_uuid}', status=VMStatus.Stopped)
-            self._set_bootstorm_vm_first_run_time()
-            self._set_bootstorm_vm_start_time(vm_name=self._vm_name)
-            self._virtctl.start_vm_sync(vm_name=self._vm_name)
-            self._data_dict = self._get_bootstorm_vm_elapsed_time(vm_name=self._vm_name, vm_node='')
-            self._data_dict['run_artifacts_url'] = os.path.join(self._run_artifacts_url,
-                                                                f'{self._get_run_artifacts_hierarchy(workload_name=self._workload_name, is_file=True)}-{self._time_stamp_format}.tar.gz')
-            self.wait_for_windows_hammerdb_finished()
-            ids = self._get_index_ids_between_dates(index=self._es_index, key='status')
-            # Adding data_updated=True to stamp that this data is already updated and enrich with new product versions fields
-            for id in ids:
-                self._update_elasticsearch_index(index=self._es_index, id=id, kind='vm', status='Succeeded', run_artifacts_url=self._data_dict['run_artifacts_url'], database='mssql', vm_name=f'{self._workload_name}-{self._trunc_uuid}', data_updated=True)
-            self._finalize_vm()
-            if self._delete_all:
+            if not self._verification_only:
+                # create windows dv
+                self._oc.create_async(yaml=os.path.join(f'{self._run_artifacts_path}', 'windows_dv.yaml'))
+                self._oc.wait_for_dv_status(status='Succeeded')
+            if self._scale:
+                # Just create the vms
+                self._create_vms_only = True
+                self.run_vm_workload()
+                self._create_vms_only = False
+            else:
+                self._oc.create_async(yaml=os.path.join(f'{self._run_artifacts_path}', f'{self._name}.yaml'))
+                self._oc.wait_for_vm_status(vm_name=f'{self._workload_name}-{self._trunc_uuid}',
+                                            status=VMStatus.Stopped)
+                self._set_bootstorm_vm_first_run_time()
+                self._set_bootstorm_vm_start_time(vm_name=self._vm_name)
+                self._virtctl.start_vm_sync(vm_name=self._vm_name)
+                self._data_dict = self._get_bootstorm_vm_elapsed_time(vm_name=self._vm_name, vm_node='')
+            vm_count = int(self._scale)*len(self._scale_node_list) if self._scale else 1
+            self.wait_for_windows_hammerdb_finished(vm_nums=vm_count)
+            if self._scale:
+                ids = self._get_index_ids_between_dates(index=self._es_index, key='status')
+                # Set data_updated=True to mark this data as already updated, and enrich it with the new product version fields and the scale
+                self._data_dict['run_artifacts_url'] = os.path.join(self._run_artifacts_url, f'{self._get_run_artifacts_hierarchy(workload_name=self._workload_name, is_file=True)}-{self._time_stamp_format}.tar.gz')
+                for id in ids:
+                    self._update_elasticsearch_index(index=self._es_index, id=id, kind='vm', status='Succeeded', run_artifacts_url=self._data_dict['run_artifacts_url'], database='mssql', vm_name=f'{self._workload_name}-{self._trunc_uuid}', data_updated=True, scale=int(self._scale)*len(self._scale_node_list))
+                self._only_delete_all=True
+                self.run_vm_workload()
+            else:
+                ids = self._get_index_ids_between_dates(index=self._es_index, key='status')
+                # Set data_updated=True to mark this data as already updated, and enrich it with the new product version fields
+                self._data_dict['run_artifacts_url'] = os.path.join(self._run_artifacts_url, f'{self._get_run_artifacts_hierarchy(workload_name=self._workload_name, is_file=True)}-{self._time_stamp_format}.tar.gz')
+                for id in ids:
+                    self._update_elasticsearch_index(index=self._es_index, id=id, kind='vm', status='Succeeded', run_artifacts_url=self._data_dict['run_artifacts_url'], database='mssql', vm_name=f'{self._workload_name}-{self._trunc_uuid}', data_updated=True)
+                self._finalize_vm()
                 self._oc.delete_vm_sync(
                     yaml=os.path.join(f'{self._run_artifacts_path}', f'{self._name}.yaml'),
                     vm_name=self._vm_name)
-            if self._delete_all:
-                # delete windows dv
-                self._oc.delete_async(yaml=os.path.join(f'{self._run_artifacts_path}', 'windows_dv.yaml'))
-                # delete namespace
-                self._oc.delete_async(yaml=os.path.join(f'{self._run_artifacts_path}', 'namespace.yaml'))
+            # delete windows dv
+            self._oc.delete_async(yaml=os.path.join(f'{self._run_artifacts_path}', 'windows_dv.yaml'))
+            # delete namespace
+            self._oc.delete_async(yaml=os.path.join(f'{self._run_artifacts_path}', 'namespace.yaml'))
         except ElasticSearchDataNotUploaded as err:
             self._oc.delete_vm_sync(
                 yaml=os.path.join(f'{self._run_artifacts_path}', f'{self._name}.yaml'),
diff --git a/benchmark_runner/workloads/workloads_operations.py b/benchmark_runner/workloads/workloads_operations.py
@@ -104,6 +104,7 @@ def __init__(self):
         self._windows_url = self._environment_variables_dict.get('windows_url', '')
         self._create_vms_only = self._environment_variables_dict.get('create_vms_only', '')
         self._delete_all = self._environment_variables_dict.get('delete_all', '')
+        self._only_delete_all = False
         self._verification_only = self._environment_variables_dict.get('verification_only', '')
         self._must_gather_log = self._environment_variables_dict.get('must_gather_log', '')
         self._test_name = self._environment_variables_dict.get('test_name', '')
@@ -485,7 +486,7 @@ def _upload_to_elasticsearch(self, index: str, kind: str, status: str, result: d
         self._es_operations.upload_to_elasticsearch(index=index, data=self.__get_metadata(kind=kind, status=status, result=result))
 
     @logger_time_stamp
-    def _update_elasticsearch_index(self, index: str, id: str, kind: str, status: str, run_artifacts_url: str, database: str = '', vm_name: str = '', data_updated: bool = False):
+    def _update_elasticsearch_index(self, index: str, id: str, kind: str, status: str, run_artifacts_url: str, database: str = '', vm_name: str = '', data_updated: bool = False, scale: int = None):
         """
         This method updates elasticsearch id
         :param index:
@@ -495,12 +496,15 @@ def _update_elasticsearch_index(self, index: str, id: str, kind: str, status: st
         :param status:
         :param run_artifacts_url:
         :param data_updated: check if data was updated
+        :param scale: scale value stored in the metadata 'scale' field; omitted when None
         :return:
         """
         metadata = self.__get_metadata(kind=kind, database=database, status=status, run_artifacts_url=run_artifacts_url)
         if vm_name:
             metadata.update({'vm_name': vm_name})
             metadata.update({'data_updated': data_updated})
+        if scale is not None:
+            metadata.update({'scale': scale})
         self._es_operations.update_elasticsearch_index(index=index, id=id, metadata=metadata)
 
     def _verify_elasticsearch_data_uploaded(self, index: str, uuid: str):