Skip to content

Commit 2b0017a

Browse files
authored
feat: scale in (#296)
Added logic to determine whether the job is underutilized based on effective throughput and queue trends. The system is underutilized when it can handle the load without the last pipeline (remaining capacity > demand) and the queue is decreasing or stably low. If this condition holds, we scale in by removing the last pipeline and performing an update.
1 parent c735a9a commit 2b0017a

File tree

5 files changed

+125
-10
lines changed

5 files changed

+125
-10
lines changed

infscale/common/exceptions.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,14 @@ def __init__(self, err_msg: str):
5959
super().__init__(err_msg)
6060

6161

62+
class InsufficientThroughput(InfScaleException):
    """Exception raised when available throughput is insufficient."""

    def __init__(self, err_msg: str):
        """Create the exception from a human-readable error message."""
        super().__init__(err_msg)
68+
69+
6270
class DifferentResourceAmount(InfScaleException):
6371
"""Exception for different resource amounts."""
6472

infscale/common/metrics.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,13 @@ class PerfMetrics:
9797
_sensitivity_factor: float = 1
9898
_qthresh: float = 10**9
9999

100+
# defines how much the queue must decrease (as a ratio of its average)
101+
# For example:
102+
# q_drop_factor = 0.8 -> current queue < 80% of recent avg -> queue is dropping (~20% drop)
103+
# lower values = more aggressive scale-in detection (more sensitive to noise)
104+
# higher values = more conservative (requires stronger drop signal)
105+
_q_drop_factor: float = 0.8
106+
100107
_qlevel_rs: RollingStats = None
101108
_in_rate_rs: RollingStats = None
102109
_out_rate_rs: RollingStats = None
@@ -140,6 +147,37 @@ def is_congested(self) -> bool:
140147
"""Return true if queue continues to build up."""
141148
return self.qlevel > self._qthresh
142149

150+
def is_underutilized(self) -> bool:
    """Return True when the queue shows a sustained downward trend."""
    rs = self._qlevel_rs

    # judge only once the rolling window is fully populated,
    # otherwise the average is not stable enough
    if not rs.is_filled():
        return False

    # the current queue length must fall below a fraction of its rolling
    # mean; dropping under that threshold means demand trails capacity
    drop_threshold = rs.mean() * self._q_drop_factor
    return self.qlevel < drop_threshold
163+
164+
def rate_to_scale_in(self, margin: float = 0.2) -> float:
    """Return a safe arrival rate threshold to trigger scale-in.

    The threshold is the rolling-average input rate inflated by a safety
    margin; while the effective arrival rate stays below it, the system
    is considered underutilized and resources can safely be reduced.

    Args:
        margin (float): A fractional buffer (e.g., 0.2 for 20%) added to
            the average input rate to absorb short-term fluctuations and
            avoid premature scaling in.

    Returns:
        float: The adjusted input rate threshold for scale-in decisions.
    """
    safety_factor = 1 + margin
    return safety_factor * self._in_rate_rs.mean()
180+
143181
def rate_to_decongest(self) -> float:
144182
"""Return a required rate to relieve congestion.
145183

infscale/controller/autoscaler.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from infscale import get_logger
2626
from infscale.common.metrics import PerfMetrics
2727
from infscale.controller.job_context import JobContext, JobStateEnum
28+
from infscale.controller.planner import DemandData
2829

2930

3031
if TYPE_CHECKING:
@@ -75,8 +76,11 @@ async def run(self) -> None:
7576
continue
7677

7778
if not metrics.is_congested():
78-
# TODO: if not congested, check if scale-in is necessary
7979
self._congestion_count = 0
80+
81+
if metrics.is_underutilized():
82+
await self._scale_in(job_ctx, metrics)
83+
8084
continue
8185

8286
if self._last_output_rate >= metrics.output_rate:
@@ -92,7 +96,8 @@ async def run(self) -> None:
9296

9397
async def _scale_out(self, ctx: JobContext, metrics: PerfMetrics) -> None:
9498
rate = metrics.rate_to_decongest()
95-
ctx.set_desired_rate(rate)
99+
demand_data = DemandData(rate)
100+
ctx.set_demand_data(demand_data)
96101

97102
logger.debug(f"congested, desired rate = {rate}")
98103

@@ -107,6 +112,24 @@ async def _scale_out(self, ctx: JobContext, metrics: PerfMetrics) -> None:
107112
self._last_output_rate = metrics.output_rate
108113
logger.debug("finished scaling-out")
109114

115+
async def _scale_in(self, ctx: JobContext, metrics: PerfMetrics) -> None:
    """Ask the planner (via a job update) to shed capacity for the job."""
    rate = metrics.rate_to_scale_in()
    # scale_out=False signals the planner to build a scale-in config
    ctx.set_demand_data(DemandData(rate, False))

    logger.debug(f"underutilized, desired rate = {rate}")

    try:
        await ctx.update()
    except Exception as err:
        # update failures are logged, not propagated; the next
        # autoscaling cycle will re-evaluate
        logger.warning(f"exception: {err}")
        return
    else:
        logger.info("finished scaling-in")
    finally:
        # stamp the run time on both success and failure paths
        self._last_run = time.perf_counter()
132+
110133
async def set_event(self, job_id: str, wrkr_id: str) -> None:
111134
"""Set an autoscaling event for a given job and worker."""
112135
await self._event_queue.put((job_id, wrkr_id))

infscale/controller/job_context.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
from infscale.controller.ctrl_dtype import CommandAction, CommandActionModel
4444
from infscale.controller.deployment.assignment import AssignmentCollection
4545
from infscale.controller.job_checker import JobChecker
46+
from infscale.controller.planner import DemandData
4647

4748

4849
if TYPE_CHECKING:
@@ -674,14 +675,14 @@ def __init__(self, ctrl: Controller, job_id: str):
674675
self.past_running_agent_info: dict[str, AgentMetaData] = {}
675676
self.job_checker = JobChecker(self.wrk_status)
676677

677-
self._desired_rate = 0.0
678+
self._demand_data: DemandData = DemandData()
678679

679680
global logger
680681
logger = get_logger()
681682

682-
def set_desired_rate(self, rate: float) -> None:
683-
"""Set diresed output rate for a job."""
684-
self._desired_rate = rate
683+
def set_demand_data(self, demand_data: DemandData) -> None:
    """Record the demand data used the next time this job's config is built."""
    self._demand_data = demand_data
685686

686687
def get_agent_data(self, agent_id: str) -> AgentMetaData:
687688
"""Return agent metadata."""
@@ -821,7 +822,7 @@ def process_cfg(self) -> None:
821822
self._new_cfg = self.ctrl.planner.build_config(
822823
self.req.config,
823824
self.ctrl.agent_contexts,
824-
self._desired_rate,
825+
self._demand_data,
825826
self._cur_cfg,
826827
)
827828

infscale/controller/planner.py

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from dataclasses import dataclass
2121
from pathlib import Path
2222

23-
from infscale.common.exceptions import InsufficientResources
23+
from infscale.common.exceptions import InsufficientResources, InsufficientThroughput
2424
from infscale.configs.job import JobConfig
2525
from infscale.configs.plan import ExecPlan
2626
from infscale.controller.agent_context import AgentContext
@@ -89,6 +89,14 @@ class PipelineData:
8989
total_throughput: float
9090

9191

92+
@dataclass
class DemandData:
    """Demand information that drives a scaling decision."""

    # target arrival rate the deployment must be able to sustain
    rate: float = 0.0
    # True -> build a scaled-out config; False -> build a scaled-in config
    scale_out: bool = True
98+
99+
92100
class Planner:
93101
"""Planner class."""
94102

def build_config(
    self,
    source: JobConfig,
    agent_ctxts: dict[str, AgentContext],
    demand_data: DemandData,
    base_cfg: JobConfig = None,
) -> JobConfig:
    """Build a config based on source config."""
    # without autoscaling, the source config is used verbatim
    if not self._autoscale:
        return source

    if demand_data.scale_out:
        return self._get_scaled_out_cfg(
            source, agent_ctxts, demand_data.rate, base_cfg
        )

    return self._get_scaled_in_cfg(base_cfg, demand_data.rate)
131+
132+
def _get_scaled_out_cfg(
133+
self,
134+
source: JobConfig,
135+
agent_ctxts: dict[str, AgentContext],
136+
rate: float,
137+
base_cfg: JobConfig = None,
138+
) -> JobConfig:
117139
# if base_cfg is none, this is the first time we build a config,
118140
# so we need to place the dispatcher on a GPU
119141
# otherwise, we already have a base config, so we don't need to
120142
# spare a GPU for the dispatcher
121143
dispatcher_on_gpu = base_cfg is None
122144
solution = self._calculate_placement(
123-
source, agent_ctxts, demand, dispatcher_on_gpu=dispatcher_on_gpu
145+
source, agent_ctxts, rate, dispatcher_on_gpu=dispatcher_on_gpu
124146
)
125147

126148
if solution is None:
@@ -146,6 +168,29 @@ def build_config(
146168
# gen = CfgGen(agent_ctxts, source, plan_list, "cuda", base_cfg)
147169
# return gen.generate()
148170

171+
def _get_scaled_in_cfg(self, cfg: JobConfig, rate: float) -> JobConfig:
    """Build a config with the job's last pipeline removed.

    Args:
        cfg (JobConfig): Current job config to scale in.
        rate (float): Arrival-rate threshold (including the caller's
            safety margin) that the remaining pipelines must still be
            able to handle.

    Returns:
        JobConfig: A new config without the last pipeline.

    Raises:
        InsufficientThroughput: If there is no pipeline to remove, or if
            removing the last pipeline would leave less capacity than the
            current demand.
    """
    pipelines = self.pipeline_data[cfg.job_id]

    # nothing to remove: raise the domain exception instead of crashing
    # with an IndexError on the [-1] access below
    if not pipelines:
        raise InsufficientThroughput("No pipeline available for scale in")

    # remaining capacity if we drop the last pipeline
    remaining_throughput = sum(
        data.total_throughput for data in pipelines[:-1]
    )

    # the remaining capacity must still comfortably exceed the current
    # arrival rate; `rate` carries a margin so we don't scale in too
    # early due to random dips
    if remaining_throughput <= rate:
        raise InsufficientThroughput("Not enough remaining throughput for scale in")

    data = pipelines.pop()

    return JobConfig.remove_pipeline(cfg, data.worker_ids)
193+
149194
def _set_pipeline_data(self, cfg: JobConfig, total_throughput) -> None:
150195
"""Set pipeline data."""
151196
job_id = cfg.job_id

0 commit comments

Comments
 (0)