Commit de62faa

PSP-245 MCP/Passthrough Route Forwarder (#706)
* PSP-245 MCP Route Forwarder: adds a forwarder for /mcp routes, letting us forward MCP requests to containers running a service that accepts MCP-compatible requests. https://linear.app/scale-epd/issue/PSP-245/mcp-model-engine-integration
* Renaming to make Forwarder more generic
* Make Passthrough Routes an array
* Rename helper function
* Add ability to pass forwarder type
* Fix mock objects causing failing tests
* Add sync forwarder
* Passing through destination path
* Fix typo in config
* Add back status
* Sanitizing Headers
1 parent fe36be6 commit de62faa

File tree

13 files changed: +956 −12 lines changed


.ruff.toml

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 # Same as Black.
 line-length = 100
-
-ignore = ["E501"]
+target-version = "py310"
+lint.ignore = ["E501"]
 exclude = ["gen", "alembic"]

charts/model-engine/Chart.yaml

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.1.9
+version: 0.1.10

 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to

charts/model-engine/templates/service_template_config_map.yaml

Lines changed: 8 additions & 0 deletions
@@ -184,6 +184,10 @@ data:
 - "forwarder.sync.extra_routes=${FORWARDER_EXTRA_ROUTES}"
 - --set
 - "forwarder.stream.extra_routes=${FORWARDER_EXTRA_ROUTES}"
+- --set
+- "forwarder.sync.forwarder_type=${FORWARDER_TYPE}"
+- --set
+- "forwarder.stream.forwarder_type=${FORWARDER_TYPE}"
 {{- $sync_forwarder_template_env | nindent 14 }}
 readinessProbe:
   httpGet:
@@ -616,6 +620,10 @@ data:
 - "forwarder.sync.extra_routes=${FORWARDER_EXTRA_ROUTES}"
 - --set
 - "forwarder.stream.extra_routes=${FORWARDER_EXTRA_ROUTES}"
+- --set
+- "forwarder.sync.forwarder_type=${FORWARDER_TYPE}"
+- --set
+- "forwarder.stream.forwarder_type=${FORWARDER_TYPE}"
 {{- $sync_forwarder_template_env | nindent 16 }}
 readinessProbe:
   httpGet:
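Both hunks add the same pair of --set overrides (at different nindent levels), passing the endpoint's FORWARDER_TYPE down to the forwarder config alongside the existing extra_routes override. As a rough illustration of how dotted overrides of this form map onto a nested config, here is a small Python sketch; apply_set_overrides is a hypothetical helper written for this note, not the config loader the forwarder actually uses.

# Minimal sketch: apply "a.b.c=value" style overrides to a nested config dict.
# apply_set_overrides is a hypothetical helper, not part of model-engine.
from typing import Any, Dict, List


def apply_set_overrides(config: Dict[str, Any], overrides: List[str]) -> Dict[str, Any]:
    for override in overrides:
        dotted_key, _, value = override.partition("=")
        keys = dotted_key.split(".")
        node = config
        for key in keys[:-1]:
            node = node.setdefault(key, {})
        node[keys[-1]] = value  # values stay strings in this sketch
    return config


config = {"forwarder": {"sync": {}, "stream": {}}}
apply_set_overrides(
    config,
    [
        "forwarder.sync.forwarder_type=passthrough",
        "forwarder.stream.forwarder_type=passthrough",
    ],
)
# config["forwarder"]["stream"]["forwarder_type"] == "passthrough"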

model-engine/model_engine_server/domain/entities/model_bundle_entity.py

Lines changed: 6 additions & 0 deletions
@@ -31,6 +31,11 @@ class ModelBundleFrameworkType(str, Enum):
     CUSTOM = "custom_base_image"


+class ForwarderType(str, Enum):
+    PASSTHROUGH = "passthrough"
+    DEFAULT = "default"
+
+
 class ModelBundleEnvironmentParams(BaseModel):
     """
     This is the entity-layer class for the Model Bundle environment parameters. Being an
@@ -158,6 +163,7 @@ class RunnableImageLike(BaseModel, ABC):
     protocol: Literal["http"]  # TODO: add support for other protocols (e.g. grpc)
     readiness_initial_delay_seconds: int = 120
     extra_routes: List[str] = Field(default_factory=list)
+    forwarder_type: Optional[ForwarderType] = ForwarderType.DEFAULT
     worker_command: Optional[List[str]] = None
     worker_env: Optional[Dict[str, str]] = None
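A self-contained sketch of how the new field behaves on a Pydantic model; BundleFlavorSketch below is a simplified stand-in for illustration, not the real RunnableImageLike (which carries many more fields):

# Simplified stand-in for RunnableImageLike, showing the ForwarderType default.
from enum import Enum
from typing import List, Optional

from pydantic import BaseModel, Field


class ForwarderType(str, Enum):
    PASSTHROUGH = "passthrough"
    DEFAULT = "default"


class BundleFlavorSketch(BaseModel):  # hypothetical, for illustration only
    extra_routes: List[str] = Field(default_factory=list)
    forwarder_type: Optional[ForwarderType] = ForwarderType.DEFAULT


flavor = BundleFlavorSketch(extra_routes=["/mcp"], forwarder_type="passthrough")
assert flavor.forwarder_type is ForwarderType.PASSTHROUGH  # str values coerce to the enum
assert BundleFlavorSketch().forwarder_type is ForwarderType.DEFAULT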

model-engine/model_engine_server/inference/configs/service--http_forwarder.yaml

Lines changed: 1 addition & 0 deletions
@@ -19,4 +19,5 @@ forwarder:
 model_engine_unwrap: true
 serialize_results_as_string: false
 extra_routes: []
+
 max_concurrency: 100
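For orientation, a sketch (as a Python dict) of the shape the forwarder config takes when an endpoint opts into passthrough forwarding; the key names mirror what the passthrough loader reads, but the values shown here are illustrative assumptions, not defaults shipped in this file:

# Illustrative only: roughly what a parsed "stream" section might look like
# for an endpoint that opts into passthrough forwarding of an /mcp route.
stream_config = {
    "user_port": 5005,                 # assumed port
    "user_hostname": "localhost",
    "healthcheck_route": "/readyz",    # assumed route
    "forwarder_type": "passthrough",   # assumed opt-in value
    "extra_routes": ["/mcp"],          # assumed passthrough route
}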

model-engine/model_engine_server/inference/forwarding/forwarding.py

Lines changed: 97 additions & 0 deletions
@@ -627,6 +627,103 @@ def endpoint(route: str) -> str:
        )


+@dataclass
+class PassthroughForwarder(ModelEngineSerializationMixin):
+    passthrough_endpoint: str
+
+    async def _make_request(
+        self, request: Any, aioclient: aiohttp.ClientSession
+    ) -> aiohttp.ClientResponse:
+        headers: dict[str, str] = dict(request.headers)
+        excluded_headers: set[str] = {
+            "host",
+            "content-length",
+            "connection",
+        }
+        headers = {k: v for k, v in headers.items() if k.lower() not in excluded_headers}
+        url = request.url
+        target_url: str = f"{self.passthrough_endpoint.rstrip('/')}"
+
+        if url.query:
+            target_url = f"{target_url}?{url.query}"
+
+        return await aioclient.request(
+            method=request.method,
+            url=target_url,
+            data=await request.body() if request.method in ["POST", "PUT", "PATCH"] else None,
+            headers=headers,
+        )
+
+    async def forward_stream(self, request: Any):
+        async with aiohttp.ClientSession() as aioclient:
+            response = await self._make_request(request, aioclient)
+            response_headers = response.headers
+            yield (response_headers, response.status)
+
+            if response.status != 200:
+                yield await response.read()
+
+            async for chunk in response.content.iter_chunks():
+                yield chunk[0]
+
+            yield await response.read()
+
+    async def forward_sync(self, request: Any):
+        async with aiohttp.ClientSession() as aioclient:
+            response = await self._make_request(request, aioclient)
+            return response
+
+
+@dataclass(frozen=True)
+class LoadPassthroughForwarder:
+    user_port: int = DEFAULT_PORT
+    user_hostname: str = "localhost"
+    healthcheck_route: str = "/health"
+    passthrough_route: str = ""
+
+    def load(self, resources: Optional[Path], cache: Any) -> PassthroughForwarder:
+        if len(self.healthcheck_route) == 0:
+            raise ValueError("healthcheck route must be non-empty!")
+
+        if not self.healthcheck_route.startswith("/"):
+            raise ValueError(f"healthcheck route must start with /: {self.healthcheck_route=}")
+
+        if not (1 <= self.user_port <= 65535):
+            raise ValueError(f"Invalid port value: {self.user_port=}")
+
+        if len(self.user_hostname) == 0:
+            raise ValueError("hostname must be non-empty!")
+
+        if self.user_hostname != "localhost":
+            raise NotImplementedError(
+                "Currently only localhost-based user-code services are supported with forwarders! "
+                f"Cannot handle {self.user_hostname=}"
+            )
+
+        def endpoint(route: str) -> str:
+            return f"http://{self.user_hostname}:{self.user_port}{route}"
+
+        passthrough_endpoint: str = endpoint(self.passthrough_route)
+        hc: str = endpoint(self.healthcheck_route)
+
+        logger.info(f"Forwarding to user-defined service at: {self.user_hostname}:{self.user_port}")
+        logger.info(f"Passthrough endpoint: {passthrough_endpoint}")
+        logger.info(f"Healthcheck endpoint: {hc}")
+
+        while True:
+            try:
+                if requests.get(hc).status_code == 200:
+                    break
+            except requests.exceptions.ConnectionError:
+                pass
+
+            logger.info(f"Waiting for user-defined service to be ready at {hc}...")
+            time.sleep(1)
+
+        logger.info(f"Creating PassthroughForwarder with endpoint: {passthrough_endpoint}")
+        return PassthroughForwarder(passthrough_endpoint=passthrough_endpoint)
+
+
 def load_named_config(config_uri, config_overrides=None):
     with open(config_uri, "rt") as rt:
         if config_uri.endswith(".json"):
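A standalone sketch of the request translation that PassthroughForwarder._make_request performs: drop the host/content-length/connection headers and append the caller's query string to the configured passthrough endpoint. build_passthrough_request below is an illustrative rewrite for this note, not the method itself.

# Standalone illustration of the header filtering and URL building done in
# PassthroughForwarder._make_request. Not the module's own code.
from typing import Dict, Tuple

EXCLUDED_HEADERS = {"host", "content-length", "connection"}


def build_passthrough_request(
    passthrough_endpoint: str, query: str, headers: Dict[str, str]
) -> Tuple[str, Dict[str, str]]:
    clean = {k: v for k, v in headers.items() if k.lower() not in EXCLUDED_HEADERS}
    target_url = passthrough_endpoint.rstrip("/")
    if query:
        target_url = f"{target_url}?{query}"
    return target_url, clean


url, hdrs = build_passthrough_request(
    "http://localhost:5005/mcp/",
    "session_id=abc",
    {"Host": "model-engine", "Content-Length": "42", "Authorization": "Bearer token"},
)
# url == "http://localhost:5005/mcp?session_id=abc"; only Authorization survives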

model-engine/model_engine_server/inference/forwarding/http_forwarder.py

Lines changed: 160 additions & 2 deletions
@@ -7,14 +7,17 @@

 import orjson
 import uvicorn
-from fastapi import BackgroundTasks, Depends, FastAPI
+from fastapi import BackgroundTasks, Depends, FastAPI, Request
+from fastapi.responses import Response, StreamingResponse
 from model_engine_server.common.concurrency_limiter import MultiprocessingConcurrencyLimiter
 from model_engine_server.common.dtos.tasks import EndpointPredictV1Request
 from model_engine_server.core.loggers import logger_name, make_logger
 from model_engine_server.inference.forwarding.forwarding import (
     Forwarder,
     LoadForwarder,
+    LoadPassthroughForwarder,
     LoadStreamingForwarder,
+    PassthroughForwarder,
     StreamingForwarder,
     load_named_config,
 )
@@ -40,6 +43,8 @@ def get_forwarder_loader(destination_path: Optional[str] = None) -> LoadForwarde
         del config["extra_routes"]
     if destination_path:
         config["predict_route"] = destination_path
+    if "forwarder_type" in config:
+        del config["forwarder_type"]
     forwarder_loader = LoadForwarder(**config)
     return forwarder_loader

@@ -52,10 +57,40 @@
         del config["extra_routes"]
     if destination_path:
         config["predict_route"] = destination_path
+    if "forwarder_type" in config:
+        del config["forwarder_type"]
     streaming_forwarder_loader = LoadStreamingForwarder(**config)
     return streaming_forwarder_loader


+def get_stream_passthrough_forwarder_loader(
+    destination_path: Optional[str] = None,
+) -> LoadPassthroughForwarder:
+    config = {}
+    stream_config = get_config().get("stream", {})
+    for key in ["user_port", "user_hostname", "healthcheck_route"]:
+        config[key] = stream_config[key]
+    if destination_path:
+        config["passthrough_route"] = destination_path
+
+    passthrough_forwarder_loader = LoadPassthroughForwarder(**config)
+    return passthrough_forwarder_loader
+
+
+def get_sync_passthrough_forwarder_loader(
+    destination_path: Optional[str] = None,
+) -> LoadPassthroughForwarder:
+    config = {}
+    sync_config = get_config().get("sync", {})
+    for key in ["user_port", "user_hostname", "healthcheck_route"]:
+        config[key] = sync_config[key]
+    if destination_path:
+        config["passthrough_route"] = destination_path
+
+    passthrough_forwarder_loader = LoadPassthroughForwarder(**config)
+    return passthrough_forwarder_loader
+
+
 @lru_cache()
 def get_concurrency_limiter() -> MultiprocessingConcurrencyLimiter:
     config = get_config()
@@ -75,6 +110,41 @@ def load_streaming_forwarder(destination_path: Optional[str] = None) -> Streamin
     return get_streaming_forwarder_loader(destination_path).load(None, None)


+@lru_cache()
+def load_stream_passthrough_forwarder(
+    destination_path: Optional[str] = None,
+) -> PassthroughForwarder:
+    return get_stream_passthrough_forwarder_loader(destination_path).load(None, None)
+
+
+@lru_cache()
+def load_sync_passthrough_forwarder(destination_path: Optional[str] = None) -> PassthroughForwarder:
+    return get_sync_passthrough_forwarder_loader(destination_path).load(None, None)
+
+
+HOP_BY_HOP_HEADERS: list[str] = [
+    "proxy-authenticate",
+    "proxy-authorization",
+    "content-length",
+    "content-encoding",
+]
+
+
+def sanitize_response_headers(headers: dict, force_cache_bust: bool = False) -> dict:
+    lower_headers = {k.lower(): v for k, v in headers.items()}
+    # Delete hop by hop headers that should not be forwarded
+    for header in HOP_BY_HOP_HEADERS:
+        if header in lower_headers:
+            del lower_headers[header]
+
+    if force_cache_bust:
+        # force clients to refetch resources
+        lower_headers["cache-control"] = "no-store"
+        if "etag" in lower_headers:
+            del lower_headers["etag"]
+    return lower_headers
+
+
 async def predict(
     request: EndpointPredictV1Request,
     background_tasks: BackgroundTasks,
@@ -123,6 +193,35 @@ async def event_generator():
     return EventSourceResponse(event_generator())


+async def passthrough_stream(
+    request: Request,
+    forwarder: PassthroughForwarder = Depends(get_stream_passthrough_forwarder_loader),
+    limiter: MultiprocessingConcurrencyLimiter = Depends(get_concurrency_limiter),
+):
+    with limiter:
+        response = forwarder.forward_stream(request)
+        headers, status = await anext(response)
+        headers = sanitize_response_headers(headers)
+
+        async def content_generator():
+            async for chunk in response:
+                yield chunk
+
+        return StreamingResponse(content_generator(), headers=headers, status_code=status)
+
+
+async def passthrough_sync(
+    request: Request,
+    forwarder: PassthroughForwarder = Depends(get_sync_passthrough_forwarder_loader),
+    limiter: MultiprocessingConcurrencyLimiter = Depends(get_concurrency_limiter),
+):
+    with limiter:
+        response = await forwarder.forward_sync(request)
+        headers = sanitize_response_headers(response.headers)
+        content = await response.read()
+        return Response(content=content, status_code=response.status, headers=headers)
+
+
 async def serve_http(app: FastAPI, **uvicorn_kwargs: Any):  # pragma: no cover
     logger.info("Available routes are:")
     for route in app.routes:
@@ -177,7 +276,7 @@ async def init_app():
     def healthcheck():
         return "OK"

-    def add_extra_routes(app: FastAPI):
+    def add_extra_sync_or_stream_routes(app: FastAPI):
         """Read extra_routes from config and dynamically add routes to app"""
         config = get_config()
         sync_forwarders: Dict[str, Forwarder] = dict()
@@ -224,6 +323,65 @@ async def predict_or_stream(
             methods=["POST"],
         )

+    def add_stream_passthrough_routes(app: FastAPI):
+        config = get_config()
+
+        passthrough_forwarders: Dict[str, PassthroughForwarder] = dict()
+        for route in config.get("stream", {}).get("extra_routes", []):
+            passthrough_forwarders[route] = load_stream_passthrough_forwarder(route)
+
+        for route in passthrough_forwarders:
+
+            def get_passthrough_forwarder(route=route):
+                return passthrough_forwarders.get(route)
+
+            async def passthrough_route(
+                request: Request,
+                passthrough_forwarder: PassthroughForwarder = Depends(get_passthrough_forwarder),
+                limiter=Depends(get_concurrency_limiter),
+            ):
+                return await passthrough_stream(request, passthrough_forwarder, limiter)
+
+            app.add_api_route(
+                path=route,
+                endpoint=passthrough_route,
+                methods=["GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS"],
+            )
+
+    def add_sync_passthrough_routes(app: FastAPI):
+        config = get_config()
+
+        passthrough_forwarders: Dict[str, PassthroughForwarder] = dict()
+        for route in config.get("sync", {}).get("extra_routes", []):
+            passthrough_forwarders[route] = load_sync_passthrough_forwarder(route)
+
+        for route in passthrough_forwarders:
+
+            def get_passthrough_forwarder(route=route):
+                return passthrough_forwarders.get(route)
+
+            async def passthrough_route(
+                request: Request,
+                passthrough_forwarder: PassthroughForwarder = Depends(get_passthrough_forwarder),
+                limiter=Depends(get_concurrency_limiter),
+            ):
+                return await passthrough_sync(request, passthrough_forwarder, limiter)
+
+            app.add_api_route(
+                path=route,
+                endpoint=passthrough_route,
+                methods=["GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS"],
+            )
+
+    def add_extra_routes(app: FastAPI):
+        config = get_config()
+        if config.get("stream", {}).get("forwarder_type") == "passthrough":
+            add_stream_passthrough_routes(app)
+        elif config.get("sync", {}).get("forwarder_type") == "passthrough":
+            add_sync_passthrough_routes(app)
+        else:
+            add_extra_sync_or_stream_routes(app)
+
     app.add_api_route(path="/healthz", endpoint=healthcheck, methods=["GET"])
     app.add_api_route(path="/readyz", endpoint=healthcheck, methods=["GET"])
     app.add_api_route(path="/predict", endpoint=predict, methods=["POST"])
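A quick example of what sanitize_response_headers does to upstream response headers (assumes the model-engine package is importable; the header values are made up):

# Example input/output for sanitize_response_headers.
from model_engine_server.inference.forwarding.http_forwarder import sanitize_response_headers

upstream = {
    "Content-Length": "128",     # hop-by-hop: dropped
    "Content-Encoding": "gzip",  # hop-by-hop: dropped
    "Content-Type": "application/json",
    "ETag": "abc123",
}

print(sanitize_response_headers(upstream))
# {'content-type': 'application/json', 'etag': 'abc123'}

print(sanitize_response_headers(upstream, force_cache_bust=True))
# {'content-type': 'application/json', 'cache-control': 'no-store'}  (etag removed)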

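One detail worth calling out in add_stream_passthrough_routes and add_sync_passthrough_routes: the per-route dependency is declared as def get_passthrough_forwarder(route=route) so that each registered route captures its own value of route. Without the default-argument binding, every closure would see the loop variable's final value. A minimal demonstration of the difference:

# Why get_passthrough_forwarder takes route=route as a default argument:
# defaults are evaluated at definition time, so each closure keeps its own route.
makers_late = [lambda: route for route in ["/mcp", "/sse"]]
makers_bound = [lambda route=route: route for route in ["/mcp", "/sse"]]

print([m() for m in makers_late])   # ['/sse', '/sse'] - late binding
print([m() for m in makers_bound])  # ['/mcp', '/sse'] - value captured per iteration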