Skip to content

Commit 1f33f7d

Browse files
committed
rebase
1 parent cbe58d8 commit 1f33f7d

File tree

29 files changed

+2101
-429
lines changed

29 files changed

+2101
-429
lines changed

examples/07_GPT-OSS-120B_SGLang_Example/run.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,7 @@
4343
AIME_GPTOSS_SGLang,
4444
)
4545
from inference_endpoint.dataset_manager.predefined.gpqa import GPQA, GPQA_GPTOSS_SGLang
46-
from inference_endpoint.endpoint_client.configs import (
47-
AioHttpConfig,
48-
HTTPClientConfig,
49-
)
46+
from inference_endpoint.endpoint_client.configs import HTTPClientConfig
5047
from inference_endpoint.endpoint_client.http_client import HTTPEndpointClient
5148
from inference_endpoint.endpoint_client.http_sample_issuer import HttpClientSampleIssuer
5249
from inference_endpoint.evaluation.extractor import ABCDExtractor, BoxedMathExtractor
@@ -95,9 +92,7 @@ def create_sglang_client(tmp_dir: Path) -> HTTPEndpointClient:
9592
api_type="sglang",
9693
)
9794

98-
aiohttp_config = AioHttpConfig()
99-
100-
client = HTTPEndpointClient(http_config, aiohttp_config)
95+
client = HTTPEndpointClient(http_config)
10196
return client
10297

10398

pyproject.toml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,11 @@ requires-python = ">=3.12"
2828
dependencies = [
2929
"asyncio",
3030
"typing-extensions>=4.0.0",
31-
"orjson==3.11.0",
32-
"aiohttp==3.12.15",
33-
"pyzmq==27.0.2",
34-
"uvloop==0.21.0",
35-
"msgspec==0.19.0",
31+
"orjson==3.11.5",
32+
"pyzmq==27.1.0",
33+
"uvloop==0.22.1",
34+
"msgspec==0.20.0",
35+
"httptools==0.7.1",
3636
]
3737

3838
[project.optional-dependencies]

requirements/base.txt

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@ psutil==6.1.1
1313

1414
# Note: asyncio ships with the Python standard library (since 3.4) and does not need to be installed
1515
# Async and networking
16-
aiohttp==3.12.15
17-
pyzmq==27.0.2
18-
uvloop==0.21.0
16+
pyzmq==27.1.0
17+
uvloop==0.22.1
18+
httptools==0.7.1
1919

2020
# Data handling
2121
duckdb==1.4.0
22-
orjson==3.11.0
23-
msgspec==0.19.0
22+
orjson==3.11.5
23+
msgspec==0.20.0
2424
pydantic==2.12.0
2525
pydantic_core==2.41.1
2626

requirements/test.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ line-profiler==5.0.0
1616
Pympler==1.1
1717
scipy==1.16.3
1818

19-
# HTTP server and client for mock server fixture
20-
aiohttp>=3.8.0
19+
# HTTP server and client for mock server fixture (test-only dependency)
20+
aiohttp==3.13.3
2121
pytest-timeout==2.4.0
2222
pytest-xdist==3.8.0

src/inference_endpoint/commands/benchmark.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -55,10 +55,7 @@
5555
from inference_endpoint.config.yaml_loader import ConfigError, ConfigLoader
5656
from inference_endpoint.core.types import QueryResult
5757
from inference_endpoint.dataset_manager.factory import DataLoaderFactory
58-
from inference_endpoint.endpoint_client.configs import (
59-
AioHttpConfig,
60-
HTTPClientConfig,
61-
)
58+
from inference_endpoint.endpoint_client.configs import HTTPClientConfig
6259
from inference_endpoint.endpoint_client.http_client import HTTPEndpointClient
6360
from inference_endpoint.endpoint_client.http_sample_issuer import HttpClientSampleIssuer
6461
from inference_endpoint.evaluation import Extractor
@@ -309,7 +306,7 @@ def _build_config_from_cli(
309306
dataloader_random_seed=42,
310307
),
311308
client=ClientSettings(
312-
workers=args.workers if args.workers else 4,
309+
workers=args.workers if args.workers else "auto",
313310
log_level="DEBUG" if verbose_level >= 2 else "INFO",
314311
),
315312
),
@@ -622,9 +619,7 @@ def _run_benchmark(
622619
log_level=config.settings.client.log_level,
623620
cpu_affinity=config.settings.client.cpu_affinity,
624621
)
625-
aiohttp_config = AioHttpConfig()
626-
627-
http_client = HTTPEndpointClient(http_config, aiohttp_config)
622+
http_client = HTTPEndpointClient(http_config)
628623
sample_issuer = HttpClientSampleIssuer(http_client)
629624

630625
except Exception as e:

src/inference_endpoint/commands/probe.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,7 @@
2323

2424
from inference_endpoint.config.schema import APIType
2525
from inference_endpoint.core.types import Query, QueryResult
26-
from inference_endpoint.endpoint_client.configs import (
27-
AioHttpConfig,
28-
HTTPClientConfig,
29-
)
26+
from inference_endpoint.endpoint_client.configs import HTTPClientConfig
3027
from inference_endpoint.endpoint_client.http_client import HTTPEndpointClient
3128
from inference_endpoint.exceptions import (
3229
ExecutionError,
@@ -70,9 +67,8 @@ async def run_probe_command(args: argparse.Namespace) -> None:
7067
api_type=api_type,
7168
num_workers=1,
7269
)
73-
aiohttp_config = AioHttpConfig()
7470
# Client creates its own event loop in a separate thread
75-
client = HTTPEndpointClient(http_config, aiohttp_config)
71+
client = HTTPEndpointClient(http_config)
7672

7773
logger.info(f"Sending {num_requests} requests...")
7874

src/inference_endpoint/config/schema.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,10 @@ class ClientSettings(BaseModel):
268268
269269
"""
270270

271-
workers: int = 4
271+
# Number of worker processes or "auto" for automatic detection
272+
# - "auto" uses max(4, loadgen_numa_domain_size - 1)
273+
workers: int | str = "auto"
274+
272275
record_worker_events: bool = False
273276
log_level: str = "INFO"
274277

@@ -511,10 +514,10 @@ def validate_client_settings(self) -> None:
511514
Raises:
512515
ValueError: If settings are invalid
513516
"""
514-
if self.settings.client.workers < 1:
515-
raise ValueError(
516-
f"workers must be >= 1, got {self.settings.client.workers}"
517-
)
517+
workers = self.settings.client.workers
518+
# "auto" is valid, integers must be >= 1
519+
if isinstance(workers, int) and workers < 1:
520+
raise ValueError(f"workers must be >= 1, got {workers}")
518521

519522
def validate_runtime_settings(self) -> None:
520523
"""Validate runtime settings are reasonable.

src/inference_endpoint/endpoint_client/README.md

Lines changed: 4 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -27,18 +27,11 @@ HTTP client for LLM inference with multiprocessing workers and ZMQ communication
2727
## Usage
2828

2929
```python
30-
from inference_endpoint.endpoint_client import (
31-
HTTPEndpointClient,
32-
HTTPClientConfig,
33-
AioHttpConfig,
34-
ZMQConfig,
35-
)
30+
from inference_endpoint.endpoint_client import HTTPEndpointClient, HTTPClientConfig
3631
from inference_endpoint.core.types import Query
3732

3833
client = HTTPEndpointClient(
39-
HTTPClientConfig(endpoint_url="http://localhost:8000/v1/completions", num_workers=2),
40-
AioHttpConfig(),
41-
ZMQConfig(),
34+
HTTPClientConfig(endpoint_url="http://localhost:8000/v1/completions")
4235
)
4336

4437
# Sync issue (fire-and-forget)
@@ -64,19 +57,12 @@ if response:
6457
### With HttpClientSampleIssuer
6558

6659
```python
67-
from inference_endpoint.endpoint_client import (
68-
HTTPEndpointClient,
69-
HTTPClientConfig,
70-
AioHttpConfig,
71-
ZMQConfig,
72-
)
60+
from inference_endpoint.endpoint_client import HTTPEndpointClient, HTTPClientConfig
7361
from inference_endpoint.endpoint_client.http_sample_issuer import HttpClientSampleIssuer
7462
from inference_endpoint.load_generator.sample import Sample
7563

7664
client = HTTPEndpointClient(
77-
HTTPClientConfig(endpoint_url="http://localhost:8000/v1/completions", num_workers=4),
78-
AioHttpConfig(),
79-
ZMQConfig(),
65+
HTTPClientConfig(endpoint_url="http://localhost:8000/v1/completions", num_workers=4)
8066
)
8167
issuer = HttpClientSampleIssuer(client)
8268

@@ -86,18 +72,6 @@ issuer.issue(Sample(
8672
))
8773
```
8874

89-
## Configuration
90-
91-
```python
92-
HTTPClientConfig(
93-
endpoint_url="http://localhost:8000/v1/completions",
94-
num_workers=4, # Number of worker processes
95-
)
96-
97-
AioHttpConfig() # Socket, TCP, HTTP configs (use defaults)
98-
ZMQConfig() # IPC configs (use defaults)
99-
```
100-
10175
## Shutdown
10276

10377
Shutdown is optional. The workers and the event loop thread are daemons, so they terminate automatically with the main process.

src/inference_endpoint/endpoint_client/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,11 @@
1919
This module provides an HTTP client implementation with multiprocessing and ZMQ.
2020
"""
2121

22-
from .configs import AioHttpConfig, HTTPClientConfig
22+
from .configs import HTTPClientConfig
2323
from .http_client import AsyncHttpEndpointClient, HTTPEndpointClient
2424

2525
__all__ = [
2626
"AsyncHttpEndpointClient",
2727
"HTTPEndpointClient",
2828
"HTTPClientConfig",
29-
"AioHttpConfig",
3029
]

0 commit comments

Comments
 (0)