diff --git a/examples/guidellm_example.py b/examples/guidellm_example.py
index f09b1e6..05cd810 100644
--- a/examples/guidellm_example.py
+++ b/examples/guidellm_example.py
@@ -9,11 +9,11 @@
     GUIDELLM__MAX_CONCURRENCY=256,
     GUIDELLM__REQUEST_TIMEOUT=21600,
     target="http://localhost:8000/v1",
-    data_type="emulated",
     max_seconds=30,
-    data="prompt_tokens=512,generated_tokens=256",
+    #scenario = "benchmarking_32k",
+    data="prompt_tokens=128,output_tokens=128",
     vllm_kwargs={"enable-chunked-prefill": True}
 )
 
 task.execute_remotely("oneshot-a100x1")
-#task.execute_locally()
\ No newline at end of file
+#task.execute_locally()
diff --git a/examples/lmeval_example.py b/examples/lmeval_example.py
index 8910aa2..688c355 100644
--- a/examples/lmeval_example.py
+++ b/examples/lmeval_example.py
@@ -6,8 +6,8 @@
     model_id="meta-llama/Llama-3.2-1B-Instruct",
     tasks="gsm8k",
     model_args="dtype=auto,max_model_len=8192",
-    batch_size="auto", 
+    batch_size="auto",
 )
 
 task.execute_remotely("oneshot-a100x1")
-#task.execute_locally()
\ No newline at end of file
+#task.execute_locally()
diff --git a/src/automation/configs.py b/src/automation/configs.py
index 76dbe58..5972ab3 100644
--- a/src/automation/configs.py
+++ b/src/automation/configs.py
@@ -1,2 +1,3 @@
-DEFAULT_DOCKER_IMAGE = "498127099666.dkr.ecr.us-east-1.amazonaws.com/mlops/k8s-research-cuda12_5:latest"
-DEFAULT_OUTPUT_URI = "gs://neuralmagic-clearml"
\ No newline at end of file
+DEFAULT_DOCKER_IMAGE = "quay.io/nmmlops/mlops/k8s-research-cuda12_8:latest"
+DEFAULT_OUTPUT_URI = "gs://neuralmagic-clearml"
+DEFAULT_RESEARCH_BRANCH = "main"
diff --git a/src/automation/standards/benchmarking/benchmarking_128k.json b/src/automation/standards/benchmarking/benchmarking_128k.json
new file mode 100644
index 0000000..13b8105
--- /dev/null
+++ b/src/automation/standards/benchmarking/benchmarking_128k.json
@@ -0,0 +1,13 @@
+{
+    "rate_type": "sweep",
+    "data": {
+        "prompt_tokens": 128000,
+        "prompt_tokens_stdev": 128,
+        "prompt_tokens_min": 1,
+        "prompt_tokens_max": 128000,
+        "output_tokens": 2048,
+        "output_tokens_stdev": 64,
+        "output_tokens_min": 1,
+        "output_tokens_max": 2048
+    }
+}
diff --git a/src/automation/standards/benchmarking/benchmarking_16k.json b/src/automation/standards/benchmarking/benchmarking_16k.json
new file mode 100644
index 0000000..f927a4a
--- /dev/null
+++ b/src/automation/standards/benchmarking/benchmarking_16k.json
@@ -0,0 +1,13 @@
+{
+    "rate_type": "sweep",
+    "data": {
+        "prompt_tokens": 16000,
+        "prompt_tokens_stdev": 128,
+        "prompt_tokens_min": 1,
+        "prompt_tokens_max": 16000,
+        "output_tokens": 2048,
+        "output_tokens_stdev": 64,
+        "output_tokens_min": 1,
+        "output_tokens_max": 2048
+    }
+}
diff --git a/src/automation/standards/benchmarking/benchmarking_32k.json b/src/automation/standards/benchmarking/benchmarking_32k.json
new file mode 100644
index 0000000..6543fd7
--- /dev/null
+++ b/src/automation/standards/benchmarking/benchmarking_32k.json
@@ -0,0 +1,13 @@
+{
+    "rate_type": "sweep",
+    "data": {
+        "prompt_tokens": 32000,
+        "prompt_tokens_stdev": 128,
+        "prompt_tokens_min": 1,
+        "prompt_tokens_max": 32000,
+        "output_tokens": 2048,
+        "output_tokens_stdev": 64,
+        "output_tokens_min": 1,
+        "output_tokens_max": 2048
+    }
+}
diff --git a/src/automation/standards/benchmarking/benchmarking_64k.json b/src/automation/standards/benchmarking/benchmarking_64k.json
new file mode 100644
index 0000000..871b210
--- /dev/null
+++ b/src/automation/standards/benchmarking/benchmarking_64k.json
@@ -0,0 +1,13 @@
+{
+    "rate_type": "sweep",
+    "data": {
"prompt_tokens": 64000, + "prompt_tokens_stdev": 128, + "prompt_tokens_min": 1, + "prompt_tokens_max": 64000, + "output_tokens": 2048, + "output_tokens_stdev": 64, + "output_tokens_min": 1, + "output_tokens_max": 2048 + } +} diff --git a/src/automation/standards/benchmarking/benchmarking_chat.json b/src/automation/standards/benchmarking/benchmarking_chat.json new file mode 100644 index 0000000..f4d0548 --- /dev/null +++ b/src/automation/standards/benchmarking/benchmarking_chat.json @@ -0,0 +1,13 @@ +{ + "rate_type": "sweep", + "data": { + "prompt_tokens": 512, + "prompt_tokens_stdev": 128, + "prompt_tokens_min": 1, + "prompt_tokens_max": 512, + "output_tokens": 256, + "output_tokens_stdev": 64, + "output_tokens_min": 1, + "output_tokens_max": 256 + } +} diff --git a/src/automation/standards/benchmarking/benchmarking_code_completion.json b/src/automation/standards/benchmarking/benchmarking_code_completion.json new file mode 100644 index 0000000..6be35df --- /dev/null +++ b/src/automation/standards/benchmarking/benchmarking_code_completion.json @@ -0,0 +1,13 @@ +{ + "rate_type": "sweep", + "data": { + "prompt_tokens": 256, + "prompt_tokens_stdev": 128, + "prompt_tokens_min": 1, + "prompt_tokens_max": 256, + "output_tokens": 1024, + "output_tokens_stdev": 64, + "output_tokens_min": 1, + "output_tokens_max": 1024 + } +} diff --git a/src/automation/standards/benchmarking/benchmarking_code_fixing.json b/src/automation/standards/benchmarking/benchmarking_code_fixing.json new file mode 100644 index 0000000..bceff14 --- /dev/null +++ b/src/automation/standards/benchmarking/benchmarking_code_fixing.json @@ -0,0 +1,13 @@ +{ + "rate_type": "sweep", + "data": { + "prompt_tokens": 1024, + "prompt_tokens_stdev": 128, + "prompt_tokens_min": 1, + "prompt_tokens_max": 1024, + "output_tokens": 1024, + "output_tokens_stdev": 64, + "output_tokens_min": 1, + "output_tokens_max": 1024 + } +} diff --git a/src/automation/standards/benchmarking/benchmarking_docstring_generation.json b/src/automation/standards/benchmarking/benchmarking_docstring_generation.json new file mode 100644 index 0000000..0eda212 --- /dev/null +++ b/src/automation/standards/benchmarking/benchmarking_docstring_generation.json @@ -0,0 +1,13 @@ +{ + "rate_type": "sweep", + "data": { + "prompt_tokens": 768, + "prompt_tokens_stdev": 128, + "prompt_tokens_min": 1, + "prompt_tokens_max": 768, + "output_tokens": 128, + "output_tokens_stdev": 64, + "output_tokens_min": 1, + "output_tokens_max": 128 + } +} diff --git a/src/automation/standards/benchmarking/benchmarking_instruction.json b/src/automation/standards/benchmarking/benchmarking_instruction.json new file mode 100644 index 0000000..0fac491 --- /dev/null +++ b/src/automation/standards/benchmarking/benchmarking_instruction.json @@ -0,0 +1,13 @@ +{ + "rate_type": "sweep", + "data": { + "prompt_tokens": 256, + "prompt_tokens_stdev": 128, + "prompt_tokens_min": 1, + "prompt_tokens_max": 256, + "output_tokens": 128, + "output_tokens_stdev": 64, + "output_tokens_min": 1, + "output_tokens_max": 128 + } +} diff --git a/src/automation/standards/benchmarking/benchmarking_long_rag.json b/src/automation/standards/benchmarking/benchmarking_long_rag.json new file mode 100644 index 0000000..4fe719a --- /dev/null +++ b/src/automation/standards/benchmarking/benchmarking_long_rag.json @@ -0,0 +1,13 @@ +{ + "rate_type": "sweep", + "data": { + "prompt_tokens": 10240, + "prompt_tokens_stdev": 128, + "prompt_tokens_min": 1, + "prompt_tokens_max": 10240, + "output_tokens": 1536, + "output_tokens_stdev": 64, + 
"output_tokens_min": 1, + "output_tokens_max": 1536 + } +} diff --git a/src/automation/standards/benchmarking/benchmarking_rag.json b/src/automation/standards/benchmarking/benchmarking_rag.json new file mode 100644 index 0000000..9525b09 --- /dev/null +++ b/src/automation/standards/benchmarking/benchmarking_rag.json @@ -0,0 +1,13 @@ +{ + "rate_type": "sweep", + "data": { + "prompt_tokens": 1024, + "prompt_tokens_stdev": 128, + "prompt_tokens_min": 1, + "prompt_tokens_max": 1024, + "output_tokens": 128, + "output_tokens_stdev": 64, + "output_tokens_min": 1, + "output_tokens_max": 128 + } +} diff --git a/src/automation/standards/benchmarking/benchmarking_summarization.json b/src/automation/standards/benchmarking/benchmarking_summarization.json new file mode 100644 index 0000000..9525b09 --- /dev/null +++ b/src/automation/standards/benchmarking/benchmarking_summarization.json @@ -0,0 +1,13 @@ +{ + "rate_type": "sweep", + "data": { + "prompt_tokens": 1024, + "prompt_tokens_stdev": 128, + "prompt_tokens_min": 1, + "prompt_tokens_max": 1024, + "output_tokens": 128, + "output_tokens_stdev": 64, + "output_tokens_min": 1, + "output_tokens_max": 128 + } +} diff --git a/src/automation/standards/benchmarking/chat.json b/src/automation/standards/benchmarking/chat.json new file mode 100644 index 0000000..024438c --- /dev/null +++ b/src/automation/standards/benchmarking/chat.json @@ -0,0 +1,13 @@ +{ + "rate_type": "sweep", + "data": { + "prompt_tokens": 512, + "prompt_tokens_stdev": 128, + "prompt_tokens_min": 1, + "prompt_tokens_max": 1024, + "output_tokens": 256, + "output_tokens_stdev": 64, + "output_tokens_min": 1, + "output_tokens_max": 1024 + } +} diff --git a/src/automation/standards/benchmarking/rag.json b/src/automation/standards/benchmarking/rag.json new file mode 100644 index 0000000..c7ee2f2 --- /dev/null +++ b/src/automation/standards/benchmarking/rag.json @@ -0,0 +1,13 @@ +{ + "rate_type": "sweep", + "data": { + "prompt_tokens": 4096, + "prompt_tokens_stdev": 512, + "prompt_tokens_min": 2048, + "prompt_tokens_max": 6144, + "output_tokens": 512, + "output_tokens_stdev": 128, + "output_tokens_min": 1, + "output_tokens_max": 1024 + } +} diff --git a/src/automation/tasks/base_task.py b/src/automation/tasks/base_task.py index d886599..c6961b1 100644 --- a/src/automation/tasks/base_task.py +++ b/src/automation/tasks/base_task.py @@ -1,27 +1,30 @@ from clearml import Task from typing import Sequence, Optional -from automation.configs import DEFAULT_OUTPUT_URI +from automation.configs import DEFAULT_OUTPUT_URI, DEFAULT_RESEARCH_BRANCH from automation.standards import STANDARD_CONFIGS import yaml import os class BaseTask(): - base_packages = ["git+https://github.com/neuralmagic/research.git"] - def __init__( self, project_name: str, task_name: str, docker_image: str, + branch: Optional[str] = DEFAULT_RESEARCH_BRANCH, packages: Optional[Sequence[str]]=None, task_type: str="training", ): + branch_name = branch or DEFAULT_RESEARCH_BRANCH + base_packages = [f"git+https://github.com/neuralmagic/research.git@{branch_name}"] if packages is not None: - packages = list(set(packages + self.base_packages)) + packages = list(set(packages + base_packages)) else: - packages = self.base_packages + packages = base_packages + + print(packages) self.project_name = project_name self.task_name = task_name @@ -29,6 +32,7 @@ def __init__( self.packages = packages self.task_type = task_type self.task = None + self.branch = branch self.script_path = None self.callable_artifacts = None @@ -50,8 +54,8 @@ def 
             return yaml.safe_load(open(STANDARD_CONFIGS[config], "r"))
         elif os.path.exists(config):
             return yaml.safe_load(open(config, "r"))
-        elif os.path.exists(os.path.join("..", "standatrds", config)):
-            return yaml.safe_load(open(os.path.join("..", "standatrds", config)), "r")
+        elif os.path.exists(os.path.join("..", "standards", config)):
+            return yaml.safe_load(open(os.path.join("..", "standards", config), "r"))
         else:
             return yaml.safe_load(config)
 
@@ -91,7 +95,7 @@ def create_task(self):
             add_task_init_call=True,
             script=self.script_path,
             repo="https://github.com/neuralmagic/research.git",
-            branch="main",
+            branch=self.branch,
         )
         self.task.output_uri = DEFAULT_OUTPUT_URI
         self.set_arguments()
diff --git a/src/automation/tasks/guidellm.py b/src/automation/tasks/guidellm.py
index 390012b..d4c6974 100644
--- a/src/automation/tasks/guidellm.py
+++ b/src/automation/tasks/guidellm.py
@@ -1,13 +1,20 @@
 from automation.tasks import BaseTask
-from automation.configs import DEFAULT_DOCKER_IMAGE
+from automation.configs import DEFAULT_DOCKER_IMAGE, DEFAULT_RESEARCH_BRANCH
 from typing import Optional, Sequence
 import os
 
 DEFAULT_SERVER_WAIT_TIME = 600 # 600 seconds = 10 minutes
-GUIDELLM_PACKAGE = "git+https://github.com/neuralmagic/guidellm.git@http_backend"
+GUIDELLM_PACKAGE = "git+https://github.com/neuralmagic/guidellm.git"
 
 
 class GuideLLMTask(BaseTask):
+    """
+    guidellm_packages = [
+        "huggingface-hub==0.34.3",
+        "triton==3.3.1",
+        "vllm==0.10.0",
+        "hf_xet",
+    """
     guidellm_packages = [
         "vllm",
         GUIDELLM_PACKAGE,
@@ -23,6 +30,7 @@ def __init__(
         docker_image: str=DEFAULT_DOCKER_IMAGE,
         packages: Optional[Sequence[str]]=None,
         clearml_model: bool=False,
+        branch: str=DEFAULT_RESEARCH_BRANCH,
         task_type: str="training",
         vllm_kwargs: dict={},
         target: str="http://localhost:8000/v1",
@@ -45,6 +53,12 @@ def __init__(
         if "packages" in config_kwargs:
             packages = list(set(packages + config_kwargs.pop("packages")))
 
+        # keep only the pinned version of a library
+        for pkg in list(packages):  # iterate over a copy so removing entries is safe
+            if "==" in pkg and pkg.split("==")[0] in packages:
+                lib_name = pkg.split("==")[0]
+                packages.remove(lib_name)
+
         # Initialize base parameters
         super().__init__(
             project_name=project_name,
@@ -52,6 +66,7 @@ def __init__(
             docker_image=docker_image,
             packages=packages,
             task_type=task_type,
+            branch=branch,
         )
 
         # Check for conflicts in configs and constructor arguments
diff --git a/src/automation/tasks/scripts/guidellm_script.py b/src/automation/tasks/scripts/guidellm_script.py
index 617b502..f9b6feb 100644
--- a/src/automation/tasks/scripts/guidellm_script.py
+++ b/src/automation/tasks/scripts/guidellm_script.py
@@ -1,11 +1,10 @@
-
 import os
+import sys
 from clearml import Task
 from automation.utils import resolve_model_id, cast_args, kill_process_tree
 from automation.vllm import start_vllm_server
 from pyhocon import ConfigFactory
-
 
 def main(configurations=None):
 
     task = Task.current_task()
@@ -29,7 +28,6 @@ def main(configurations=None):
     guidellm_args = configurations.get("GuideLLM", {})
     environment_args = configurations.get("environment", {})
     vllm_args = configurations.get("vLLM", {})
-
     clearml_model = args["Args"]["clearml_model"]
 
     if isinstance(clearml_model, str):
@@ -39,7 +37,6 @@ def main(configurations=None):
     if isinstance(force_download, str):
         force_download = force_download.lower() == "true"
 
-
     # Resolve model_id
     model_id = resolve_model_id(args["Args"]["model"], clearml_model, force_download)
 
@@ -54,7 +51,7 @@ def main(configurations=None):
     if not server_initialized:
         kill_process_tree(server_process.pid)
task.upload_artifact(name="vLLM server log", artifact_object=server_log) - raise AssertionError("Server failed to intialize") + raise AssertionError("Server failed to initialize") # Parse through environment variables for k, v in environment_args.items(): @@ -62,13 +59,51 @@ def main(configurations=None): guidellm_args["model"] = model_id - from guidellm import generate_benchmark_report - guidellm_args = cast_args(guidellm_args, generate_benchmark_report) - report = generate_benchmark_report(**guidellm_args) - kill_process_tree(server_process.pid) + import json + import asyncio + from pathlib import Path + from guidellm.benchmark.entrypoints import benchmark_with_scenario + from guidellm.benchmark.scenario import GenerativeTextScenario, get_builtin_scenarios + + # user defined scenarios are a temporary fix until the guidellm bugs get fixed otherwise we would use the upstream scenarios + user_scenario = guidellm_args.get("scenario", "") + if user_scenario: + filepath = Path(os.path.join(".", "src", "automation", "standards", "benchmarking", f"{user_scenario}.json")) + if os.path.exists(filepath): + current_scenario = GenerativeTextScenario.from_file(filepath, dict(guidellm_args)) + else: + raise ValueError(f"Scenario path {filepath} does not exist") + #elif len(get_builtin_scenarios()) > 0: + # to be used when get_builtin_scenarios() bug is fixed + # current_scenario = GenerativeTextScenario.from_builtin(get_builtin_scenarios()[0], dict(guidellm_args)) + else: + filepath = Path(os.path.join(".", "src", "automation", "standards", "benchmarking", f"{user_scenario}.json")) + current_scenario = GenerativeTextScenario.from_file(filepath, dict(guidellm_args)) + + # Ensure output_path is set and consistent + output_path = Path(guidellm_args.get("output_path", "guidellm-output.json")) + guidellm_args["output_path"] = str(output_path) + + print("[DEBUG] Calling benchmark_with_scenario with:") + print(json.dumps(guidellm_args, indent=2)) + + executable_path = os.path.dirname(sys.executable) + vllm_path = os.path.join(executable_path, "vllm") + print(f"The vllm path is: {vllm_path}") - task.upload_artifact(name="guidellm guidance report", artifact_object=report.to_json()) - task.upload_artifact(name="vLLM server log", artifact_object=server_log) + try: + asyncio.run( + benchmark_with_scenario( + current_scenario, + output_path= output_path, + output_extras= None + ) + ) + + finally: + task.upload_artifact(name="guidellm guidance report", artifact_object=output_path) + task.upload_artifact(name="vLLM server log", artifact_object=server_log) + kill_process_tree(server_process.pid) if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/src/automation/vllm/server.py b/src/automation/vllm/server.py index 6036d65..4711efc 100644 --- a/src/automation/vllm/server.py +++ b/src/automation/vllm/server.py @@ -40,12 +40,15 @@ def start_vllm_server( subprocess_env[k] = str(v) else: if v == True or v == "True": - v = "true" - server_command.extend([f"--{k}", str(v)]) + server_command.append(f"--{k}") + else: + server_command.extend([f"--{k}", str(v)]) + server_log_file_name = f"{SERVER_LOG_PREFIX}_{task.id}.txt" server_log_file = open(server_log_file_name, "w") - server_process = subprocess.Popen(server_command, stdout=server_log_file, stderr=server_log_file, shell=False, env=subprocess_env) + server_process = subprocess.Popen(server_command, shell=False, env=subprocess_env) + #server_process = subprocess.Popen(server_command, stdout=server_log_file, stderr=server_log_file, 
 
     delay = 5
     server_initialized = False
@@ -64,4 +67,4 @@
     if server_initialized:
         return server_process, True, server_log_file_name
     else:
-        return server_process, False, server_log_file_name
\ No newline at end of file
+        return server_process, False, server_log_file_name
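
Illustrative usage of the new scenario support (a sketch, not part of the diff above): guidellm_script.py now reads guidellm_args.get("scenario", "") and resolves it to one of the JSON files added under src/automation/standards/benchmarking/. The snippet assumes GuideLLMTask forwards extra keyword arguments such as scenario into its GuideLLM configuration, as the commented-out line in examples/guidellm_example.py suggests; the project name, task name, and model are placeholders.

    # Hypothetical sketch: run a GuideLLM sweep using one of the new standard scenarios.
    from automation.tasks import GuideLLMTask

    task = GuideLLMTask(
        project_name="benchmarking",                  # placeholder project name
        task_name="llama-3.2-1b-chat-sweep",          # placeholder task name
        model="meta-llama/Llama-3.2-1B-Instruct",     # placeholder model
        branch="main",                                # new: research branch installed by BaseTask
        scenario="benchmarking_chat",                 # new: resolves to benchmarking_chat.json
        target="http://localhost:8000/v1",
        vllm_kwargs={"enable-chunked-prefill": True},
    )

    task.execute_remotely("oneshot-a100x1")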
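
The package de-duplication loop added to guidellm.py keeps the pinned entry when a library is listed both pinned and unpinned. A minimal standalone sketch of that behavior (the example package list is assumed):

    # Sketch: drop the unpinned entry when a pinned one exists.
    packages = ["vllm", "vllm==0.10.0", "hf_xet"]  # assumed example input
    for pkg in list(packages):  # iterate over a copy so removal is safe
        if "==" in pkg and pkg.split("==")[0] in packages:
            packages.remove(pkg.split("==")[0])
    print(packages)  # ['vllm==0.10.0', 'hf_xet']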
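
The server.py change switches boolean vLLM kwargs from the old "--flag true" form to bare "--flag" switches. A minimal standalone sketch of the new flag construction (the example kwargs and command prefix are assumed; the VLLM_* environment-variable branch is omitted):

    # Sketch: True-valued kwargs become bare switches, everything else "--key value".
    vllm_kwargs = {"enable-chunked-prefill": True, "max-model-len": 8192}  # assumed example input
    server_command = ["vllm", "serve", "some/model"]                       # assumed command prefix
    for k, v in vllm_kwargs.items():
        if v == True or v == "True":
            server_command.append(f"--{k}")
        else:
            server_command.extend([f"--{k}", str(v)])
    print(server_command)
    # ['vllm', 'serve', 'some/model', '--enable-chunked-prefill', '--max-model-len', '8192']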