-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvalues.yaml
More file actions
306 lines (268 loc) · 10.8 KB
/
values.yaml
File metadata and controls
306 lines (268 loc) · 10.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
# Default values for groundlight-edge-endpoint.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
# Global configuration for dependencies
global:
  # Enable the OpenTelemetry dependency when it is needed for logging.
  # Use: --set global.otelEnabled=true for local-splunk or cloud-splunk modes.
  # Off by default, in line with the default loggingMode of "standard" (no Splunk, no OTel).
  otelEnabled: false
# The Kubernetes namespace where the Edge Endpoint will be deployed.
# We recommend that you don't change this value unless you're doing something unusual,
# like running more than one instance of the Edge Endpoint in the same cluster.
# NOTE(review): "edge" is also hard-coded in the opentelemetry-collector section of this file
# (namespaceOverride, the filelog exclude path and the splunk_hec endpoint), so those
# presumably need matching changes if this value is overridden - confirm.
namespace: "edge"
# The tag to use for the images. The value of `imageTag` is used for both the edge-endpoint and
# inference images, unless you add a specific tag for one or the other image.
imageTag: "release"
# Per-image tag overrides. Both are empty by default, so `imageTag` applies to both images.
edgeEndpointTag: ""
inferenceTag: ""
# Whether to use the minimal image for the inference server. Currently, the minimal image only
# supports binary and multiclass detectors, so be sure to keep this set to false if you're using
# an object detection or counting model.
useMinimalImage: false
# The image pull policy for the containers.
# The default value is "Always", which means that Kubernetes will always check to see if there's
# a new version of the image with the requested tag available when starting containers.
imagePullPolicy: "Always"
# The port on the host that the Edge Endpoint will listen on.
# Within the cluster, edge-endpoint always listens on port 30101 and can be addressed by
# other services in the cluster on that port using
# http://edge-endpoint-service.edge.svc.cluster.local:30101/ (or substitute the appropriate
# namespace if you've overridden the default value).
edgeEndpointPort: 30101
# NOTE(review): presumably the host port for HTTPS traffic - confirm against the templates.
edgeEndpointHttpsPort: 30143
# Base of the name for the PersistentVolume. The full name of the volume is created by
# appending the namespace to this value. Generally, this should not be overridden.
# The PersistentVolume is used to store the model files and other data that the Edge Endpoint
# wants to persist between restarts. It is mapped to `/opt/groundlight/edge` on the host.
persistentVolumeNameBase: "edge-endpoint-pv"
# Which hardware the Edge Endpoint uses for inference. It defaults to the GPU ("gpu").
# If you want to use the CPU instead, set this value to "cpu".
inferenceFlavor: "gpu"
# The user must provide the Groundlight API token as input or the deployment will fail.
# It is empty by default; supply it at install time (for example with
# --set groundlightApiToken=<your token>).
groundlightApiToken: ""
# For escalations and audits and calls that aren't handled by the Edge Endpoint, we forward
# to the Groundlight service in the cloud. If you're testing against another version
# of the Groundlight service (e.g., your own dev environment or the integ environment),
# you can override this value.
upstreamEndpoint: "https://api.groundlight.ai"
# Currently, all Groundlight services are deployed in the us-west-2 region
awsRegion: "us-west-2"
# Registry host for the images. The hostname embeds the same region as `awsRegion`.
# NOTE(review): presumably the two should be kept in sync if either is overridden - confirm.
ecrRegistry: "767397850842.dkr.ecr.us-west-2.amazonaws.com"
# This sets the log level for all the containers, both edge endpoint and inference.
# The default is "INFO". NOTE(review): the accepted level names are not listed in this file.
logLevel: "INFO"
# These values replace the names that _helpers.tpl would otherwise generate, which keeps the
# resource names short and sweet.
# Don't override these.
nameOverride: "edge-endpoint"
fullnameOverride: "edge-endpoint"
# Set some sensible limits on memory usage to avoid crashing the system.
# All thresholds are given as quoted strings.
# NOTE(review): the key names suggest hard and soft eviction thresholds, each expressed both as
# a percentage and as a minimum number of GB, plus a grace period. How they are combined is
# decided by the templates, which are not visible here - confirm before changing them.
k3sConfig:
  enabled: true
  evictionHardPercent: "15"
  evictionSoftPercent: "25"
  evictionHardMinGB: "4"
  evictionSoftMinGB: "8"
  evictionGracePeriod: "10s"
# Service account settings.
# NOTE(review): this looks like the usual Helm layout, where `create` toggles creation of the
# ServiceAccount and an empty `name` lets the chart choose one - confirm against the templates.
serviceAccount:
  create: true
  name: ""
# Logging Configuration
# loggingMode options:
#   - "standard": Basic logging to stdout/files (no Splunk, no OTel)
#   - "local-splunk": Deploy local Splunk container
#   - "cloud-splunk": Does not deploy local Splunk container, points to cloud instance instead
# For the two Splunk modes, also pass --set global.otelEnabled=true (see `global.otelEnabled`).
loggingMode: "standard"
# Balena device configuration
balena:
  # Device identifier; defaults to the placeholder string "none".
  # NOTE(review): the opentelemetry-collector section reads BALENA_DEVICE_UUID from the
  # `balena-config` ConfigMap (key `device-uuid`), which is presumably rendered from this
  # value - confirm against the templates.
  deviceUuid: "none"
# Splunk Configuration (used for local-splunk and cloud-splunk modes)
splunk:
  # Local Splunk container settings (for local-splunk mode)
  local:
    image:
      repository: splunk/splunk
      tag: "9.3.5"
    # NOTE(review): the password and HEC token below are placeholder defaults stored in plain
    # text. Override both for anything other than a throwaway local setup.
    password: "admin123"
    # The same literal appears as the `token` of the splunk_hec exporter in the
    # opentelemetry-collector section; presumably the two must be changed together - confirm.
    hecToken: "abcd1234-5678-90ef-ghij-klmnopqrstuv"
    service:
      type: NodePort
      webNodePort: 30080 # Splunk Web UI
      hecNodePort: 30088 # HTTP Event Collector
      managementNodePort: 30089 # Management port
    persistence:
      dataSize: "10Gi"
      etcSize: "2Gi"
      storageClass: "" # Use default storage class
    resources:
      limits:
        cpu: 2000m
        memory: 4Gi
      requests:
        cpu: 500m
        memory: 2Gi
  # Cloud Splunk settings (for cloud-splunk mode)
  # Both values are placeholders and have to be replaced with the real endpoint and HEC token.
  cloud:
    endpoint: "https://your-splunk-instance:8088/services/collector"
    token: "your-hec-token-here"
  # Common Splunk settings
  # The same index/source/sourcetype literals are repeated in the splunk_hec exporter config.
  index: "edge_app"
  source: "edge-endpoint"
  sourcetype: "edge:endpoint:logs"
# OpenTelemetry Collector configuration
# NOTE(review): the image and resource settings here duplicate the ones given under the
# `opentelemetry-collector` key further down. Which templates read this block is not visible
# from this file - confirm whether it is still used.
collector:
  image:
    repository: otel/opentelemetry-collector-contrib
    tag: "0.91.0"
  resources:
    limits:
      cpu: 200m
      memory: 256Mi
    requests:
      cpu: 100m
      memory: 128Mi
# OpenTelemetry Collector dependency configuration
# This configures the official OpenTelemetry Helm chart as a dependency.
# Everything under this key is passed to that chart; `config` is the collector's own
# configuration (receivers -> processors -> exporters, wired together in `service.pipelines`).
opentelemetry-collector:
  # Enable the OpenTelemetry collector when logging modes require it
  # This gets set dynamically based on loggingMode
  enabled: false # Will be overridden via --set at deployment time
  # DaemonSet mode for collecting logs from all pods on each node
  mode: daemonset
  # "edge" matches the default value of the top-level `namespace` setting in this file.
  namespaceOverride: "edge"
  nameOverride: "opentelemetry-collector"
  fullnameOverride: "opentelemetry-collector"
  # Use the contrib image which includes all receivers/processors
  image:
    repository: otel/opentelemetry-collector-contrib
    tag: "0.91.0"
  # Resource configuration
  resources:
    limits:
      cpu: 200m
      memory: 256Mi
    requests:
      cpu: 100m
      memory: 128Mi
  # Preset configurations
  presets:
    kubernetesAttributes:
      enabled: true
    # Log collection is configured by hand in `config.receivers.filelog` below instead.
    logsCollection:
      enabled: false # Disabled due to unsupported 'container' operator in v0.91.0
    kubeletMetrics:
      enabled: false
  config:
    receivers:
      filelog:
        include:
          - /var/log/pods/*/*/*.log
        # Skip the collector's own pods (namespace "edge", pod name prefix
        # "opentelemetry-collector") so it does not ingest its own output.
        exclude:
          - /var/log/pods/edge_opentelemetry-collector*/*/*.log
        start_at: end
        include_file_path: true
        include_file_name: false
        # Operators run in the order listed. The later ones read `attributes.message`, which
        # is produced by `parse_cri_format`.
        operators:
          # Parse CRI format logs manually
          # Pull namespace, pod name, pod UID, container name and restart count out of the
          # log file path.
          - type: regex_parser
            id: extract_metadata_from_filepath
            regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[a-f0-9\-]{36})\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
            parse_from: attributes["log.file.path"]
          # Parse CRI log format: timestamp stream partial_flag log_message
          - type: regex_parser
            id: parse_cri_format
            regex: '^(?P<timestamp>[^\s]+) (?P<stream>stdout|stderr) (?P<partial>P|F) (?P<message>.*)$'
            parse_from: body
            timestamp:
              parse_from: attributes.timestamp
              layout: '%Y-%m-%dT%H:%M:%S.%LZ'
          # Extract log level from the message content
          - type: regex_parser
            id: extract_log_level
            regex: '(?:^|.*\s)(?P<level>DEBUG|INFO|WARN|WARNING|ERROR|CRITICAL|FATAL)(?:[:\s]|$)'
            parse_from: attributes.message
            on_error: 'send'
          # Parse JSON logs that are already structured
          - type: json_parser
            id: parse_json_logs
            parse_from: attributes.message
            on_error: 'send' # Continue if not JSON
          # Parse FastAPI/Uvicorn HTTP access logs: "INFO: 10.42.0.1:50126 - "GET /health/ready HTTP/1.1" 200 OK"
          - type: regex_parser
            id: extract_fastapi_access_log
            regex: '(?P<client_ip>\d+\.\d+\.\d+\.\d+):(?P<client_port>\d+)\s+-\s+"(?P<http_method>\w+)\s+(?P<http_path>[^?\s]+)(?:\?(?P<http_query>[^"\s]*?))?(?:\s+[^"]*)?"\s+(?P<status_code>\d+)'
            parse_from: attributes.message
            on_error: 'send'
          # Parse NGINX access logs: "NGINX: 10.42.0.1 - - [24/Sep/2025:17:16:24 +0000] "GET /health/ready HTTP/1.1" 200 18 "-" "kube-probe/1.33""
          - type: regex_parser
            id: extract_nginx_access_log
            regex: 'NGINX:\s+(?P<client_ip>\d+\.\d+\.\d+\.\d+)\s+-\s+-\s+\[(?P<nginx_timestamp>[^\]]+)\]\s+"(?P<http_method>\w+)\s+(?P<http_path>[^?\s]+)(?:\?(?P<http_query>[^"\s]*?))?(?:\s+[^"]*)?"\s+(?P<status_code>\d+)\s+(?P<response_size>\d+)\s+"(?P<referer>[^"]*)"\s+"(?P<user_agent>[^"]*)"'
            parse_from: attributes.message
            on_error: 'send'
    processors:
      # Add resource attributes for custom fields
      resource:
        attributes:
          - key: service.name
            value: "edge-endpoint"
            action: upsert
          - key: deployment.environment
            value: "edge"
            action: upsert
          # Filled from the BALENA_DEVICE_UUID environment variable set in `extraEnvs` below.
          - key: balena.device.uuid
            value: "${env:BALENA_DEVICE_UUID}"
            action: upsert
    exporters:
      # Splunk HEC exporter configuration
      # The endpoint addresses a service named `splunk` in the `edge` namespace, and the token,
      # index, source and sourcetype repeat the literals under the top-level `splunk` key.
      # NOTE(review): presumably these are overridden at deploy time for cloud-splunk mode.
      splunk_hec:
        endpoint: "http://splunk.edge.svc.cluster.local:8088/services/collector"
        token: "abcd1234-5678-90ef-ghij-klmnopqrstuv"
        index: "edge_app"
        source: "edge-endpoint"
        sourcetype: "edge:endpoint:logs"
        disable_compression: false
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 120s
    service:
      pipelines:
        logs:
          receivers: [filelog]
          # memory_limiter goes first so that, under memory pressure, data is refused right
          # after the receiver and before any other processor does work on it; this is the
          # placement the processor's documentation calls for. batch stays last so that the
          # exporter receives batched data.
          # k8sattributes, memory_limiter and batch are not defined in this file; they are
          # presumably supplied by the chart's default config and the kubernetesAttributes
          # preset - confirm against the chart version in use.
          processors: [memory_limiter, k8sattributes, resource, batch]
          exporters: [splunk_hec]
  # Mount host directories for log collection
  extraVolumes:
    - name: varlogpods
      hostPath:
        path: /var/log/pods
    - name: varlibdockercontainers
      hostPath:
        path: /var/lib/docker/containers
  # Both host directories are mounted read-only at the same paths inside the container.
  extraVolumeMounts:
    - name: varlogpods
      mountPath: /var/log/pods
      readOnly: true
    - name: varlibdockercontainers
      mountPath: /var/lib/docker/containers
      readOnly: true
  # Environment variables for custom attributes
  extraEnvs:
    - name: BALENA_DEVICE_UUID
      valueFrom:
        configMapKeyRef:
          name: balena-config
          key: device-uuid
  # Security context to read log files
  # Runs as root (uid 0 / gid 0).
  securityContext:
    runAsUser: 0
    runAsGroup: 0
# Network healer - automatically recovers service networking after host IP changes
#
# An alternative experimental approach is implemented on the `tim/network-hooks` branch:
# a NetworkManager dispatcher hook that restarts k3s when network/IP changes are detected.
# That approach has worked in testing, but it likely deserves deeper consideration before use
# in production deployments (operational risk, platform assumptions, and restart frequency).
networkHealer:
  enabled: true
  # Interval between checks, in seconds (per the key name). The check itself is implemented
  # outside this file.
  checkIntervalSeconds: 10