diff --git a/plugins/flytekit-inference/flytekitplugins/inference/nim/serve.py b/plugins/flytekit-inference/flytekitplugins/inference/nim/serve.py index 40645b1ed7..63ac0e52c4 100644 --- a/plugins/flytekit-inference/flytekitplugins/inference/nim/serve.py +++ b/plugins/flytekit-inference/flytekitplugins/inference/nim/serve.py @@ -33,6 +33,7 @@ def __init__( cpu: int = 1, gpu: int = 1, mem: str = "20Gi", + ephemeral_storage: str = "20Gi", shm_size: str = "16Gi", env: Optional[ dict[str, str] @@ -49,6 +50,7 @@ def __init__( :param cpu: The number of CPU cores requested for the model server container. Default is 1. :param gpu: The number of GPU cores requested for the model server container. Default is 1. :param mem: The amount of memory requested for the model server container. Default is "20Gi". + :param ephemeral_storage: The amount of ephemeral storage requested for the model server container. Default is "20Gi". :param shm_size: The size of the shared memory volume. Default is "16Gi". :param env: A dictionary of environment variables to be set in the model server container. :param hf_repo_ids: A list of Hugging Face repository IDs for LoRA adapters to be downloaded. @@ -72,6 +74,7 @@ def __init__( cpu=cpu, gpu=gpu, mem=mem, + ephemeral_storage=ephemeral_storage, env=env, ) diff --git a/plugins/flytekit-inference/flytekitplugins/inference/sidecar_template.py b/plugins/flytekit-inference/flytekitplugins/inference/sidecar_template.py index c4e2cc539d..742ef7e6e7 100644 --- a/plugins/flytekit-inference/flytekitplugins/inference/sidecar_template.py +++ b/plugins/flytekit-inference/flytekitplugins/inference/sidecar_template.py @@ -13,6 +13,7 @@ def __init__( cpu: int = 1, gpu: int = 1, mem: str = "1Gi", + ephemeral_storage: str = "1Gi", env: Optional[dict[str, str]] = None, download_inputs: bool = False, download_inputs_mem: str = "500Mi", @@ -36,6 +37,7 @@ def __init__( self._cpu = cpu self._gpu = gpu self._mem = mem + self._ephemeral_storage = ephemeral_storage self._download_inputs_mem = download_inputs_mem self._download_inputs_cpu = download_inputs_cpu self._env = env @@ -58,11 +60,13 @@ def __init__( "cpu": self._cpu, "nvidia.com/gpu": self._gpu, "memory": self._mem, + "ephemeral-storage": self._ephemeral_storage, }, limits={ "cpu": self._cpu, "nvidia.com/gpu": self._gpu, "memory": self._mem, + "ephemeral-storage": self._ephemeral_storage, }, ), restart_policy="Always", # treat this container as a sidecar