### engine.aura-2-en.toml
### Keep in mind that all paths are in-container paths and do not need to exist
### on the host machine.
### Limit the number of active requests handled by a single Engine container.
### Engine will reject additional requests from the API beyond this limit, and the
### API container will continue with the retry logic configured in `api.toml`.
###
### The default is no limit.
# max_active_requests =
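###
### As an illustrative sketch only (the value 100 is hypothetical, not a
### recommendation; size it against your own hardware and load testing):
# max_active_requests = 100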
### Configure license validation by passing in a DEEPGRAM_API_KEY environment variable.
### See https://developers.deepgram.com/docs/deploy-deepgram-services#credentials
[license]
### Docker Compose and Podman Compose create a dedicated network that allows inter-container communication by service name.
### See [Networking in Compose](https://docs.docker.com/compose/networking/) for details.
server_url = ["https://license-proxy:8443", "https://license.deepgram.com"]
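### Engine should try these URLs in order, so listing the local license proxy
### first lets validation fall back to Deepgram's hosted license server if the
### proxy is unreachable.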
### Configure the server to listen for requests from the API.
[server]
### The IP address to listen on. Since this is likely running in a Docker
### container, you will probably want to listen on all interfaces.
host = "0.0.0.0"
### The port to listen on.
port = 8080
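### The API container must be configured to reach Engine at this address. For
### example, assuming a Compose service named `engine`, the driver pool in
### `api.toml` would typically point at a URL like "https://engine:8080/v2".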
### To support metrics we need to expose an Engine endpoint.
### See https://developers.deepgram.com/docs/metrics-guide#deepgram-engine
[metrics_server]
host = "0.0.0.0"
port = 9991
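### Assuming this port is published to the host, you can sanity-check the
### endpoint with, for example:
###   curl http://localhost:9991/metrics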
[model_manager]
### The number of models to have concurrently loaded in system memory.
### If you are managing a deployment with dozens of models, this setting
### helps prevent models from consuming too much memory by offloading the
### least recently used models to disk as needed.
###
### The default is no limit.
# max_concurrently_loaded_models = 20
### Inference models. You can place these in one or multiple directories.
search_paths = ["/models"]
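### For example, to also search a second, separately mounted directory (the
### second path is illustrative):
# search_paths = ["/models", "/models/custom"]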
### Enable ancillary features
[features]
### Allow multichannel requests by setting this to true; set to false to disable.
multichannel = true # or false
### Enables language detection *if* a valid language detection model is available
language_detection = true # or false
### Enables streaming entity formatting *if* a valid NER model is available
streaming_ner = true # or false
### Enable Flux turn-based streaming STT
[flux]
enabled = false # or true
### Size of audio chunks to process, in seconds.
[chunking.batch]
# min_duration =
# max_duration =
[chunking.streaming]
# min_duration =
# max_duration =
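###
### For example, to bound streaming chunk sizes explicitly (these values are
### illustrative only, not recommended defaults):
# min_duration = 1.0
# max_duration = 5.0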
### How often to return interim results, in seconds. Default is 1.0s.
###
### This value may be lowered to increase the frequency of interim results.
### However, this may cause a significant decrease in the number of concurrent
### streams supported by a single GPU. Please contact your Deepgram Account
### representative for more details.
# step = 1.0
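###
### For example, halving the default returns interim results twice as often,
### at the cost of fewer concurrent streams per GPU:
# step = 0.5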
### Engine will automatically enable half precision operations if your GPU supports
### them. You can explicitly enable or disable this behavior with the state
### parameter, which supports "enabled", "disabled", and "auto" (the default).
[half_precision]
# state = "disabled" # or "enabled" or "auto"
[health]
### Controls whether Engine fails on startup if no GPU is detected
### Default: false
###
### While Engine can run without a GPU, production deployments require one for
### acceptable performance. Set to true to fail fast if no GPU is available,
### rather than running with severely degraded performance.
# gpu_required = true # or false