edge-endpoint/deploy/helm/groundlight-edge-endpoint/values.yaml at main · groundlight/edge-endpoint · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
# Default values for groundlight-edge-endpoint.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

# Global configuration for dependencies. Helm exposes everything under `global`
# to this chart and to all of its subcharts.
global:
  # Enable OpenTelemetry dependency when needed for logging.
  # Leave this false for the "standard" logging mode (no Splunk, no OTel).
  # Use: --set global.otelEnabled=true for local-splunk or cloud-splunk modes
  otelEnabled: false

# The Kubernetes namespace where the Edge Endpoint will be deployed.
# We recommend that you don't change this value unless you're doing something unusual,
# like running more than one instance of the Edge Endpoint in the same cluster.
# NOTE: the literal "edge" is also hard-coded further down in this file
# (opentelemetry-collector.namespaceOverride, the filelog `exclude` path, and the
# Splunk HEC endpoint URL), so those need to be changed together with this value.
namespace: "edge"

# The tag to use for the images. The value of `imageTag` is used for both the edge-endpoint and
# inference images, unless you set a more specific tag for one image or the other below.
imageTag: "release"
# Per-image overrides for `imageTag`. They ship empty, i.e. no override.
edgeEndpointTag: ""
inferenceTag: ""

# Whether to use the minimal image for the inference server. Currently, the minimal image only
# supports binary and multiclass detectors, so be sure to keep this set to false if you're using
# an object detection or counting model.
useMinimalImage: false

# The image pull policy for the containers.
# The default value is "Always", which means that Kubernetes will always check whether there's
# a new version of the image with the requested tag available when starting containers.
imagePullPolicy: "Always"

# The port on the host that the Edge Endpoint will listen on.
# Within the cluster, edge-endpoint always listens on port 30101 and can be addressed by
# other services in the cluster on that port using
# http://edge-endpoint-service.edge.svc.cluster.local:30101/ (or substitute the appropriate
# namespace if you've overridden the default value).
edgeEndpointPort: 30101
# NOTE(review): presumably the host port for HTTPS traffic, mirroring `edgeEndpointPort` --
# confirm against the service template.
edgeEndpointHttpsPort: 30143

# This is used as the base of the name for the PersistentVolume. The full name of the volume is
# created by appending the namespace to this value. Generally, this should not be overridden.
# The PersistentVolume is used to store the model files and other data that the Edge Endpoint
# wants to persist between restarts. It is mapped to `/opt/groundlight/edge` on the host.
persistentVolumeNameBase: "edge-endpoint-pv"

# Where inference runs. The Edge Endpoint defaults to using the GPU ("gpu"); set this value
# to "cpu" to run inference on the CPU instead.
inferenceFlavor: "gpu"

# The user must provide the Groundlight API token as input or the deployment will fail.
# It is intentionally empty here: supply it at install time instead of committing a real
# token to this file.
groundlightApiToken: ""

# For escalations and audits and calls that aren't handled by the Edge Endpoint, we forward
# to the Groundlight service in the cloud. If you're testing against another version
# of the Groundlight service (e.g., your own dev environment or the integ environment),
# you can override this value.
upstreamEndpoint: "https://api.groundlight.ai"

# Currently, all Groundlight services are deployed in the us-west-2 region
awsRegion: "us-west-2"

# ECR registry host. Note that the host name itself embeds the us-west-2 region.
# NOTE(review): presumably where the edge-endpoint and inference images are pulled from,
# and presumably has to stay consistent with `awsRegion` -- confirm against the templates.
ecrRegistry: "767397850842.dkr.ecr.us-west-2.amazonaws.com"

# This sets the log level for all the containers, both edge endpoint and inference.
logLevel: "INFO"

# These values override the automated name generation in _helpers.tpl so that resource
# names stay short and predictable. Both are pinned to the same string.
# Don't override these.
nameOverride: "edge-endpoint"
fullnameOverride: "edge-endpoint"

# Set some sensible limits on memory usage to avoid system crashing.
# All thresholds are quoted, so the templates receive them as strings, not numbers.
# NOTE(review): the keys look like hard/soft eviction thresholds, each given both as a
# percentage and as a minimum number of GB, plus a grace period -- confirm how the
# templates combine the percentage and GB values.
k3sConfig:
  enabled: true
  evictionHardPercent: "15"
  evictionSoftPercent: "25"
  evictionHardMinGB: "4"
  evictionSoftMinGB: "8"
  evictionGracePeriod: "10s"

# Service account settings.
# NOTE(review): this follows the common Helm layout where `create: true` makes the chart
# create the account and an empty `name` lets the chart pick the name -- confirm against
# the templates.
serviceAccount:
  create: true
  name: ""

# Logging Configuration
# loggingMode options:
#   - "standard": Basic logging to stdout/files (no Splunk, no OTel)
#   - "local-splunk": Deploy local Splunk container
#   - "cloud-splunk": Does not deploy local Splunk container, points to cloud instance instead
# The two Splunk modes also need the OpenTelemetry dependency switched on
# (see `global.otelEnabled` at the top of this file).
loggingMode: "standard"

# Balena device configuration
# "none" is the placeholder used when no device UUID is supplied.
# NOTE(review): presumably this value ends up in the `balena-config` ConfigMap that the
# collector reads BALENA_DEVICE_UUID from -- confirm against the templates.
balena:
  deviceUuid: "none"

# Splunk Configuration (used for local-splunk and cloud-splunk modes)
splunk:
  # Local Splunk container settings (for local-splunk mode)
  local:
    image:
      repository: splunk/splunk
      tag: "9.3.5"
    # WARNING: `password` and `hecToken` are well-known placeholder credentials that are
    # committed to the repository. Override both for anything other than a throwaway
    # local setup.
    password: "admin123"
    # The same literal token is repeated in
    # opentelemetry-collector.config.exporters.splunk_hec.token below; change both together.
    hecToken: "abcd1234-5678-90ef-ghij-klmnopqrstuv"
    # The Splunk ports are exposed on the node via a NodePort service.
    service:
      type: NodePort
      webNodePort: 30080    # Splunk Web UI
      hecNodePort: 30088    # HTTP Event Collector
      managementNodePort: 30089  # Management port
    # Sizes of the persistent storage requested for Splunk.
    # NOTE(review): presumably `dataSize` backs Splunk's data directory and `etcSize` its
    # configuration directory -- confirm against the templates.
    persistence:
      dataSize: "10Gi"
      etcSize: "2Gi"
      storageClass: ""  # Use default storage class
    # CPU / memory requests and limits for the local Splunk container.
    resources:
      limits:
        cpu: 2000m
        memory: 4Gi
      requests:
        cpu: 500m
        memory: 2Gi

  # Cloud Splunk settings (for cloud-splunk mode)
  # Both values are placeholders and must be replaced with the real HEC endpoint and token.
  cloud:
    endpoint: "https://your-splunk-instance:8088/services/collector"
    token: "your-hec-token-here"

  # Common Splunk settings
  # The same three values are repeated in
  # opentelemetry-collector.config.exporters.splunk_hec below; keep them in sync.
  index: "edge_app"
  source: "edge-endpoint"
  sourcetype: "edge:endpoint:logs"

  # OpenTelemetry Collector configuration
  # Image and resources mirror the ones under the top-level `opentelemetry-collector` key.
  collector:
    image:
      repository: otel/opentelemetry-collector-contrib
      tag: "0.91.0"
    resources:
      limits:
        cpu: 200m
        memory: 256Mi
      requests:
        cpu: 100m
        memory: 128Mi

# OpenTelemetry Collector dependency configuration
# This configures the official OpenTelemetry Helm chart as a dependency.
# Everything under this key is passed to that subchart as its values.
opentelemetry-collector:
  # Enable the OpenTelemetry collector when logging modes require it
  # This gets set dynamically based on loggingMode
  enabled: false  # Will be overridden via --set at deployment time

  # DaemonSet mode for collecting logs from all pods on each node
  mode: daemonset

  # Hard-coded to the default of the top-level `namespace` value; if that value is
  # changed, this has to be changed too.
  namespaceOverride: "edge"

  # Fixed resource names. The filelog `exclude` pattern in the receiver config below
  # relies on the pod name starting with "opentelemetry-collector".
  nameOverride: "opentelemetry-collector"
  fullnameOverride: "opentelemetry-collector"

  # Use the contrib image which includes all receivers/processors
  image:
    repository: otel/opentelemetry-collector-contrib
    tag: "0.91.0"

  # Resource configuration
  resources:
    limits:
      cpu: 200m
      memory: 256Mi
    requests:
      cpu: 100m
      memory: 128Mi

  # Preset configurations
  # Only the Kubernetes-attributes preset is on. Log collection is configured by hand in
  # the `config.receivers.filelog` section below instead of through the preset.
  presets:
    kubernetesAttributes:
      enabled: true
    logsCollection:
      enabled: false  # Disabled due to unsupported 'container' operator in v0.91.0
    kubeletMetrics:
      enabled: false

  # Collector configuration: receivers -> processors -> exporters, wired together
  # under `service.pipelines`.
  config:
    receivers:
      # Tail the pod log files that are mounted from the host (see extraVolumes below).
      filelog:
        include:
          - /var/log/pods/*/*/*.log
        # Skip the collector's own logs. The "edge_" prefix is the namespace, so this
        # pattern assumes the default namespace "edge".
        exclude:
          - /var/log/pods/edge_opentelemetry-collector*/*/*.log
        # Only read lines written after the collector starts.
        start_at: end
        # The file path must be attached because the first operator below parses it.
        include_file_path: true
        include_file_name: false
        # Operators run in the order listed. Each one adds attributes to the log record.
        operators:
          # Parse CRI format logs manually
          # Step 1: derive namespace, pod_name, uid, container_name and restart_count
          # from the log file path.
          - type: regex_parser
            id: extract_metadata_from_filepath
            regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[a-f0-9\-]{36})\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
            parse_from: attributes["log.file.path"]
          # Parse CRI log format: timestamp stream partial_flag log_message
          # Step 2: split the raw line into timestamp, stream, partial and message, and
          # use the captured timestamp as the record's timestamp.
          - type: regex_parser
            id: parse_cri_format
            regex: '^(?P<timestamp>[^\s]+) (?P<stream>stdout|stderr) (?P<partial>P|F) (?P<message>.*)$'
            parse_from: body
            timestamp:
              parse_from: attributes.timestamp
              layout: '%Y-%m-%dT%H:%M:%S.%LZ'
          # Extract log level from the message content
          # Step 3: capture the first matching level keyword into the `level` attribute.
          # Steps 3-6 all read `attributes.message` and are optional: they set
          # on_error to 'send' so that a line that does not match is still forwarded.
          - type: regex_parser
            id: extract_log_level
            regex: '(?:^|.*\s)(?P<level>DEBUG|INFO|WARN|WARNING|ERROR|CRITICAL|FATAL)(?:[:\s]|$)'
            parse_from: attributes.message
            on_error: 'send'
          # Parse JSON logs that are already structured
          - type: json_parser
            id: parse_json_logs
            parse_from: attributes.message
            on_error: 'send'  # Continue if not JSON
          # Parse FastAPI/Uvicorn HTTP access logs: "INFO: 10.42.0.1:50126 - "GET /health/ready HTTP/1.1" 200 OK"
          # Captures client_ip, client_port, http_method, http_path, http_query (optional)
          # and status_code.
          - type: regex_parser
            id: extract_fastapi_access_log
            regex: '(?P<client_ip>\d+\.\d+\.\d+\.\d+):(?P<client_port>\d+)\s+-\s+"(?P<http_method>\w+)\s+(?P<http_path>[^?\s]+)(?:\?(?P<http_query>[^"\s]*?))?(?:\s+[^"]*)?"\s+(?P<status_code>\d+)'
            parse_from: attributes.message
            on_error: 'send'
          # Parse NGINX access logs: "NGINX: 10.42.0.1 - - [24/Sep/2025:17:16:24 +0000] "GET /health/ready HTTP/1.1" 200 18 "-" "kube-probe/1.33""
          # Captures client_ip, nginx_timestamp, http_method, http_path, http_query
          # (optional), status_code, response_size, referer and user_agent. The field
          # names shared with the FastAPI parser above are deliberately the same.
          - type: regex_parser
            id: extract_nginx_access_log
            regex: 'NGINX:\s+(?P<client_ip>\d+\.\d+\.\d+\.\d+)\s+-\s+-\s+\[(?P<nginx_timestamp>[^\]]+)\]\s+"(?P<http_method>\w+)\s+(?P<http_path>[^?\s]+)(?:\?(?P<http_query>[^"\s]*?))?(?:\s+[^"]*)?"\s+(?P<status_code>\d+)\s+(?P<response_size>\d+)\s+"(?P<referer>[^"]*)"\s+"(?P<user_agent>[^"]*)"'
            parse_from: attributes.message
            on_error: 'send'

    processors:
      # Add resource attributes for custom fields
      # Every entry uses `upsert`, so the attribute is set whether or not it already exists.
      resource:
        attributes:
          - key: service.name
            value: "edge-endpoint"
            action: upsert
          - key: deployment.environment
            value: "edge"
            action: upsert
          # Filled from the BALENA_DEVICE_UUID environment variable, which `extraEnvs`
          # below reads from the `balena-config` ConfigMap.
          - key: balena.device.uuid
            value: "${env:BALENA_DEVICE_UUID}"
            action: upsert

    exporters:
      # Splunk HEC exporter configuration
      # These defaults target the in-cluster Splunk service. The host name embeds the
      # namespace "edge", and the token/index/source/sourcetype repeat the values under
      # the top-level `splunk` key -- keep them in sync.
      # NOTE(review): presumably endpoint and token are overridden at deploy time for
      # cloud-splunk mode -- confirm against the deployment scripts.
      splunk_hec:
        endpoint: "http://splunk.edge.svc.cluster.local:8088/services/collector"
        # WARNING: placeholder token committed to the repository; override it for any
        # non-throwaway deployment.
        token: "abcd1234-5678-90ef-ghij-klmnopqrstuv"
        index: "edge_app"
        source: "edge-endpoint"
        sourcetype: "edge:endpoint:logs"
        disable_compression: false
        timeout: 10s
        # Retry failed sends starting at 5s between attempts, capped at 30s between
        # attempts, and give up on a batch after 120s in total.
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 120s

    # Wire the receiver, processors and exporter into a single logs pipeline.
    service:
      pipelines:
        logs:
          receivers: [filelog]
          # Processors run in the order listed.
          # memory_limiter goes first so that it can refuse data before any other
          # processor has allocated memory for it; batch goes last so that records are
          # grouped only after all enrichment has happened.
          # NOTE(review): memory_limiter and batch are not defined in this file --
          # presumably they come from the subchart's default config; confirm.
          processors: [memory_limiter, k8sattributes, resource, batch]
          exporters: [splunk_hec]

  # Mount host directories for log collection
  # /var/log/pods is the directory the filelog receiver's `include` pattern reads from.
  extraVolumes:
    - name: varlogpods
      hostPath:
        path: /var/log/pods
    - name: varlibdockercontainers
      hostPath:
        path: /var/lib/docker/containers

  # Both host directories are mounted read-only at the same paths inside the container.
  extraVolumeMounts:
    - name: varlogpods
      mountPath: /var/log/pods
      readOnly: true
    - name: varlibdockercontainers
      mountPath: /var/lib/docker/containers
      readOnly: true

  # Environment variables for custom attributes
  # BALENA_DEVICE_UUID is read from key `device-uuid` of the `balena-config` ConfigMap and
  # consumed by the `balena.device.uuid` resource attribute in the processor config above.
  extraEnvs:
    - name: BALENA_DEVICE_UUID
      valueFrom:
        configMapKeyRef:
          name: balena-config
          key: device-uuid

  # Security context to read log files
  # The collector runs as root (UID 0 / GID 0).
  securityContext:
    runAsUser: 0
    runAsGroup: 0

# Network healer - automatically recovers service networking after host IP changes
#
# An alternative experimental approach is implemented on the `tim/network-hooks` branch:
# a NetworkManager dispatcher hook that restarts k3s when network/IP changes are detected.
# That approach has worked in testing, but it likely deserves deeper consideration before use
# in production deployments (operational risk, platform assumptions, and restart frequency).
networkHealer:
  # Enabled by default.
  enabled: true
  # NOTE(review): presumably how often, in seconds, the healer checks for a host IP
  # change -- confirm against the healer implementation.
  checkIntervalSeconds: 10