# Example configuration for mlx-maestro (example.config.toml).
[config]
# Network interface the server binds to and listens on. 0.0.0.0 suits most
# setups: it avoids macOS firewall prompts and simplifies containerized deployments.
host = "0.0.0.0"
# Offset added to each service's base port (i.e. base_port + offset).
port_offset = 10
# Startup fails if a port has not become available within this many seconds.
port_bind_timeout = 20
# Starting port used for sidecar services; must be ≥ 1024, i.e. outside the system port range.
sidecar_port_start = 10200
# Idle time in seconds after the last request before a sidecar is terminated.
sidecar_timeout = 300
# Default idle timeout in seconds for service processes, counted from the last request.
# Individual services may override it.
service_default_timeout = 300
# Minimum number of seconds between requests before a model counts as idle
# and becomes eligible for unloading.
request_gap = 15

# === SERVICE DEFINITIONS ===
# Each service defines a primary model endpoint with optional sidecar dependencies.
# Sidecar keys are HTTP paths (e.g., "/v1/embeddings") mapped to sidecar names.
[[service]]
# Primary model endpoint that routes two HTTP paths to the "Embedder" and "Audio" sidecars.
name = "Primary-1"
# NOTE(review): presumably the base port that `port_offset` from [config] is added to — confirm.
port = 10101
# Placeholder value — replace with the identifier of the model to serve.
model_id = "MODEL_ID_PLACEHOLDER"
# NOTE(review): `volume` is not documented in this file; every entry uses a
# fraction between 0 and 1 — confirm its meaning and units.
volume = 0.7
# NOTE(review): presumably extra command-line arguments for the model server — confirm.
extra_args = ["--trust-remote-code"]
# Sidecars represent isolated subset APIs, and are proxied the same way.
# Each key is an HTTP path; each value is the `name` of a [[sidecar]] entry.
sidecars = { "/v1/embeddings" = "Embedder", "/v1/audio" = "Audio" }

[[service]]
# Primary model endpoint with no `sidecars` table, so it declares no sidecar routes.
name = "Primary-2"
port = 10102
# Placeholder value — replace with the identifier of the model to serve.
model_id = "MODEL_ID_PLACEHOLDER"
# NOTE(review): `volume` is not documented in this file — confirm its meaning and units.
volume = 0.15
# Seconds. NOTE(review): presumably the per-service override of
# `service_default_timeout` from [config] — confirm.
timeout = 60

[[service]]
# Primary model endpoint that routes two HTTP paths to the "Embedder" and "Audio" sidecars.
name = "Primary-3"
port = 10103
# Placeholder value — replace with the identifier of the model to serve.
model_id = "MODEL_ID_PLACEHOLDER"
# NOTE(review): `volume` is not documented in this file — confirm its meaning and units.
volume = 0.9
# NOTE(review): presumably extra command-line arguments for the model server — confirm.
extra_args = ["--trust-remote-code"]
# Each key is an HTTP path; each value is the `name` of a [[sidecar]] entry.
sidecars = { "/v1/embeddings" = "Embedder", "/v1/audio" = "Audio" }

[[service]]
# Primary model endpoint that routes two HTTP paths to the "Embedder" and "Audio" sidecars.
name = "Primary-4"
port = 10104
# Placeholder value — replace with the identifier of the model to serve.
model_id = "MODEL_ID_PLACEHOLDER"
# NOTE(review): `volume` is not documented in this file — confirm its meaning and units.
volume = 0.4
# Each key is an HTTP path; each value is the `name` of a [[sidecar]] entry.
sidecars = { "/v1/embeddings" = "Embedder", "/v1/audio" = "Audio" }

[[service]]
# Multimodal endpoint started with its own launch command and no sidecar routes.
name = "Vision"
port = 10105
# Placeholder value — replace with the identifier of the model to serve.
model_id = "MODEL_ID_PLACEHOLDER"
# NOTE(review): `volume` is not documented in this file — confirm its meaning and units.
volume = 0.5
# Custom launch command. The {model} and {port} placeholders are substituted.
# Laid out as executable + subcommand, then one flag/value pair per line.
command = [
  "mlx-openai-server", "launch",
  "--model-path", "{model}",
  "--model-type", "multimodal",
  "--port", "{port}",
]

# === SIDECAR DEFINITIONS ===
# Sidecars are lightweight, stateless subprocesses that handle auxiliary tasks.
# They are spawned on first request and terminated after `sidecar_timeout` seconds of idleness.
[[sidecar]]
# Referenced by services through their `sidecars` table (mapped to "/v1/embeddings").
name = "Embedder"
# Placeholder value — replace with the identifier of the embeddings model.
model_id = "SIDECAR_EMBEDDER_MODEL_ID"
# NOTE(review): `volume` is not documented in this file — confirm its meaning and units.
volume = 0.08
# Launch command, laid out as executable + subcommand, then one flag/value pair per line.
# NOTE(review): {model} and {port} are presumably substituted as for service
# commands, with the port taken from the `sidecar_port_start` range — confirm.
command = [
  "mlx-openai-server", "launch",
  "--model-path", "{model}",
  "--model-type", "embeddings",
  "--max-concurrency", "20",
  "--port", "{port}",
]

[[sidecar]]
# Referenced by services through their `sidecars` table (mapped to "/v1/audio").
name = "Audio"
# Placeholder value — replace with the identifier of the audio model.
model_id = "SIDECAR_AUDIO_MODEL_ID"
# NOTE(review): `volume` is not documented in this file — confirm its meaning and units.
volume = 0.4
# Launch command, laid out as executable + subcommand, then one flag/value pair per line.
# NOTE(review): {model} and {port} are presumably substituted as for service
# commands, with the port taken from the `sidecar_port_start` range — confirm.
command = [
  "mlx-openai-server", "launch",
  "--model-path", "{model}",
  "--model-type", "whisper",
  "--port", "{port}",
]