Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cmd/collectors/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ const (
queryCPUUtil = `100 * (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle", job="kubernetes-service-endpoints"}[5m])))`
queryRAMUtil = `100 * (1 - (node_memory_MemAvailable_bytes{job="kubernetes-service-endpoints"} / node_memory_MemTotal_bytes{job="kubernetes-service-endpoints"}))`
queryDiskUsed = `100 * (1 - node_filesystem_avail_bytes{mountpoint="/", job="kubernetes-service-endpoints"} / node_filesystem_size_bytes{mountpoint="/", job="kubernetes-service-endpoints"})`
queryNetRX = `node_network_receive_bytes_total{device="enp5s0", job="kubernetes-service-endpoints"}`
queryNetTX = `node_network_transmit_bytes_total{device="enp5s0", job="kubernetes-service-endpoints"}`
queryNetRX = `node_network_receive_bytes_total{device="eno1", job="kubernetes-service-endpoints"}`
queryNetTX = `node_network_transmit_bytes_total{device="eno1", job="kubernetes-service-endpoints"}`
queryTemp = `node_hwmon_temp_celsius{job="kubernetes-service-endpoints"}`
)

Expand Down
34 changes: 19 additions & 15 deletions docs/notes/k3s-operations.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,18 @@ nix-shell --run "tofu apply"
Since this is a custom internal service, the image must be built and sideloaded into k3s.

```bash
# 1. Build locally
docker build -t collectors:v0.1.0 -f docker/collectors/Dockerfile .
# 1. Build locally (using podman)
podman build -t collectors:v0.1.0 -f docker/collectors/Dockerfile .

# 2. Export and Import
docker save -o collectors.tar collectors:v0.1.0
podman save -o collectors.tar localhost/collectors:v0.1.0
sudo k3s ctr images import collectors.tar

# 3. Cleanup
# 3. Tag for K3s local lookup
sudo k3s ctr images tag localhost/collectors:v0.1.0 collectors:v0.1.0
sudo k3s ctr images tag localhost/collectors:v0.1.0 docker.io/library/collectors:v0.1.0

# 4. Cleanup
rm collectors.tar
```

Expand Down Expand Up @@ -175,24 +179,24 @@ The platform utilizes **NodePort** to bridge host-based services (MCP agents, pr

## 📊 Resource Limits Summary

- *Last Updated: 2026-02-22*
- *Last Updated: 2026-03-09 (High Performance Profile)*

| Component | CPU Req | RAM Req | CPU Limit | RAM Limit | Purpose |
| :--- | :--- | :--- | :--- | :--- | :--- |
| **collectors** | 5m | 20Mi | 50m | 80Mi | Telemetry Collection |
| **grafana** | 10m | 150Mi | 100m | 250Mi | Visualization |
| **loki** | 100m | 256Mi | 300m | 640Mi | Log Storage |
| **minio** | 100m | 256Mi | 200m | 512Mi | S3 Storage Backend |
| **opentelemetry** | 20m | 100Mi | 150m | 256Mi | Trace Gateway |
| **postgres** | 50m | 200Mi | 200m | 400Mi | Relational Data |
| **prometheus** | 20m | 400Mi | 100m | 600Mi | Metrics Storage |
| **tempo** | 50m | 256Mi | 200m | 512Mi | Trace Storage |
| **thanos** | 10m | 50Mi | 50m | 150Mi | Long-term Metrics Access |
| **grafana** | 50m | 256Mi | 200m | 512Mi | Visualization |
| **loki** | 200m | 512Mi | 1000m | 2Gi | Log Storage |
| **minio** | 200m | 512Mi | 500m | 1Gi | S3 Storage Backend |
| **opentelemetry** | 50m | 200Mi | 300m | 512Mi | Trace Gateway |
| **postgres** | 100m | 512Mi | 500m | 1Gi | Relational Data |
| **prometheus** | 100m | 1Gi | 500m | 2Gi | Metrics Storage |
| **tempo** | 100m | 512Mi | 500m | 1Gi | Trace Storage |
| **thanos** | 50m | 128Mi | 200m | 512Mi | Long-term Metrics Access |

**Understanding Usage Totals:**

- **Mini Total (365m CPU / 1.61Gi RAM)**: The sum of all *Requests* (guaranteed resources).
- **Max Total (1.35 Cores / 3.32Gi RAM)**: The sum of all *Limits* (burst ceiling).
- **Min Total (~0.86 Cores / ~3.6Gi RAM)**: The sum of all *Requests* (guaranteed resources).
- **Max Total (~3.75 Cores / ~8.6Gi RAM)**: The sum of all *Limits* (burst ceiling).

---

Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module observability-hub

go 1.26.0
go 1.25.7

require (
github.com/DATA-DOG/go-sqlmock v1.5.2
Expand Down
8 changes: 6 additions & 2 deletions internal/proxy/webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -261,8 +261,12 @@ func WebhookHandler(w http.ResponseWriter, r *http.Request) {
"event", eventType,
"merged", merged,
)
// We use an absolute path to the script for reliability
cmd := exec.Command("/home/server/software/observability-hub/scripts/gitops_sync.sh", repo)
// Try to get script path from environment, otherwise use relative path
scriptPath := os.Getenv("GITOPS_SYNC_SCRIPT")
if scriptPath == "" {
scriptPath = "scripts/gitops_sync.sh"
}
cmd := exec.Command(scriptPath, repo)
output, err := cmd.CombinedOutput()
if err != nil {
syncSpan.RecordError(err)
Expand Down
6 changes: 3 additions & 3 deletions k3s/grafana/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ initChownData:
securityContext:
runAsNonRoot: false
runAsUser: 0
allowPrivilegeEscalation: false
allowPrivilegeEscalation: true
# Needs to be able to chown the data volume
readOnlyRootFilesystem: false
capabilities:
Expand Down Expand Up @@ -122,10 +122,10 @@ service:
resources:
requests:
cpu: 50m
memory: 128Mi
memory: 256Mi
limits:
cpu: 200m
memory: 384Mi
memory: 512Mi

# Security settings and UI tweaks
grafana.ini:
Expand Down
8 changes: 4 additions & 4 deletions k3s/loki/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,11 @@ singleBinary:
name: minio-secret
resources:
requests:
cpu: 100m
memory: 256Mi
cpu: 200m
memory: 512Mi
limits:
cpu: 300m
memory: 640Mi
cpu: 1000m
memory: 2Gi
podSecurityContext:
fsGroup: 10001
runAsGroup: 10001
Expand Down
6 changes: 3 additions & 3 deletions k3s/minio/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@ service:

resources:
requests:
memory: 256Mi
cpu: 100m
limits:
memory: 512Mi
cpu: 200m
limits:
memory: 1Gi
cpu: 500m

# Deployment Pod Security Context
securityContext:
Expand Down
12 changes: 6 additions & 6 deletions k3s/opentelemetry/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@ image:

resources:
requests:
cpu: 20m
memory: 100Mi
cpu: 50m
memory: 200Mi
limits:
cpu: 150m
memory: 256Mi
cpu: 300m
memory: 512Mi

podSecurityContext:
fsGroup: 10001
Expand Down Expand Up @@ -82,9 +82,9 @@ alternateConfig:
processors: [batch, resource]
exporters: [loki]

# Expose as NodePort so host-level systemd services (e.g. proxy) can reach the collector
# Expose as LoadBalancer so host-level systemd services (e.g. proxy) can reach the collector via localhost
service:
type: NodePort
type: LoadBalancer

ports:
otlp:
Expand Down
27 changes: 14 additions & 13 deletions k3s/postgres/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,12 @@ primary:
# Fix for linter: explicitly set unhealthyPodEvictionPolicy
# Note: bitnami chart v18.3.0 uses this key
unhealthyPodEvictionPolicy: "AlwaysAllow"

podSecurityContext:
enabled: true
fsGroup: 1001
fsGroupChangePolicy: Always

containerSecurityContext:
enabled: true
readOnlyRootFilesystem: true
Expand All @@ -39,35 +39,35 @@ primary:
capabilities:
drop:
- ALL

resources:
requests:
cpu: 50m
memory: 200Mi
cpu: 100m
memory: 512Mi
limits:
cpu: 200m
memory: 400Mi
cpu: 500m
memory: 1Gi

persistence:
enabled: true
storageClass: local-path
size: 10Gi

extraVolumes:
- name: run-volume
emptyDir: {}
extraVolumeMounts:
- name: run-volume
mountPath: /var/run/postgresql

service:
type: NodePort
type: LoadBalancer
nodePorts:
postgresql: 30432

readReplicas:
enabled: false

volumePermissions:
enabled: true
image:
Expand All @@ -87,3 +87,4 @@ volumePermissions:
runAsUser: 0
runAsNonRoot: false
readOnlyRootFilesystem: true

8 changes: 4 additions & 4 deletions k3s/prometheus/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@ server:
storage.tsdb.max-block-duration: 2h
resources:
requests:
cpu: 20m
memory: 400Mi
limits:
cpu: 100m
memory: 600Mi
memory: 1Gi
limits:
cpu: 500m
memory: 2Gi
thanos:
sidecar:
enabled: false
Expand Down
8 changes: 4 additions & 4 deletions k3s/tempo/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ tempo:
replicas: 1
resources:
requests:
cpu: 50m
memory: 256Mi
limits:
cpu: 200m
cpu: 100m
memory: 512Mi
limits:
cpu: 500m
memory: 1Gi

# Enable read-only root filesystem
securityContext:
Expand Down
16 changes: 8 additions & 8 deletions k3s/thanos/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@ storegateway:

resources:
requests:
cpu: 10m
memory: 50Mi
limits:
cpu: 50m
memory: 150Mi
memory: 128Mi
limits:
cpu: 200m
memory: 512Mi

podSecurityContext:
enabled: true
Expand Down Expand Up @@ -77,11 +77,11 @@ compactor:

resources:
requests:
cpu: 10m
memory: 50Mi
limits:
cpu: 50m
memory: 150Mi
memory: 128Mi
limits:
cpu: 200m
memory: 512Mi

podSecurityContext:
enabled: true
Expand Down
5 changes: 4 additions & 1 deletion makefiles/common.mk
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ NS ?= observability
KC ?= kubectl -n $(NS)
HELM ?= helm --namespace $(NS)

# Container Engine (Default to Podman)
DOCKER ?= podman

# Dynamic Nix Detection
USE_NIX = $(shell if command -v nix-shell >/dev/null 2>&1 && [ -z "$$IN_NIX_SHELL" ] && [ "$$GITHUB_ACTIONS" != "true" ]; then echo "yes"; else echo "no"; fi)

Expand All @@ -27,7 +30,7 @@ adr:

# Markdown Linting
lint:
docker run --rm -v "$(PWD):/data" -w /data $(LINT_IMAGE) --fix "**/*.md"
$(DOCKER) run --rm -v "$(PWD):/data" -w /data $(LINT_IMAGE) --fix "**/*.md"

# Configuration Linting (HCL & GitHub Actions)
lint-configs:
Expand Down
18 changes: 15 additions & 3 deletions makefiles/k3s.mk
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# K3s Orchestration
.PHONY: build-collectors k3s-collectors-up k3s-status k3s-df k3s-prune k3s-logs-% k3s-backup-% kube-lint

BACKUP_DIR ?= /home/server2/backups/manual

# Maintenance
kube-lint:
@echo "Linting Kubernetes manifests..."
Expand All @@ -19,11 +21,21 @@ k3s-prune:

build-collectors:
@echo "Building Collectors image..."
docker build -t collectors:v0.1.0 -f docker/collectors/Dockerfile .
docker save -o collectors.tar collectors:v0.1.0
$(DOCKER) build -t collectors:v0.1.0 -f docker/collectors/Dockerfile .
$(DOCKER) save -o collectors.tar localhost/collectors:v0.1.0
sudo k3s ctr images import collectors.tar
sudo k3s ctr images tag localhost/collectors:v0.1.0 collectors:v0.1.0
sudo k3s ctr images tag localhost/collectors:v0.1.0 docker.io/library/collectors:v0.1.0
rm collectors.tar

build-postgres:
@echo "Building custom Postgres image..."
$(DOCKER) build -t postgres-pod:17 -f docker/postgres/Dockerfile .
$(DOCKER) save -o postgres-pod.tar localhost/postgres-pod:17
sudo k3s ctr images import postgres-pod.tar
sudo k3s ctr images tag localhost/postgres-pod:17 docker.io/library/postgres-pod:17
rm postgres-pod.tar

k3s-collectors-up:
@echo "Regenerating Collectors manifest..."
$(NIX_RUN) "helm template collectors k3s/collectors -f k3s/collectors/values.yaml --namespace $(NS) > k3s/collectors/manifest.yaml"
Expand Down Expand Up @@ -56,7 +68,7 @@ k3s-backup-%:
$(KC) scale --replicas=0 $$RESOURCE; \
echo "Waiting for pods to terminate..."; \
$(KC) wait --for=delete pod -l $$( $(KC) get $$RESOURCE -o jsonpath='{.spec.selector.matchLabels}' | jq -r 'to_entries | .[] | .key + "=" + .value' | paste -sd "," - ) --timeout=60s || true; \
BACKUP_DIR="/home/server/backups/manual"; \
BACKUP_DIR="$(BACKUP_DIR)"; \
sudo mkdir -p $$BACKUP_DIR; \
TIMESTAMP=$$(date +%Y%m%d_%H%M%S); \
BACKUP_PATH="$$BACKUP_DIR/$*_"$$TIMESTAMP".tar.gz"; \
Expand Down
9 changes: 5 additions & 4 deletions makefiles/systemd.mk
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,23 @@

# Define exact units to install
ACTIVE_UNITS = proxy.service tailscale-gate.service openbao.service \
traffic-generator.service traffic-generator.timer \
ingestion.service ingestion.timer \
mcp-telemetry.service

.PHONY: install-services reload-services uninstall-services bao-status

install-services:
@echo "🔗 Linking active units..."
@echo "📦 Installing active units..."
@for unit in $(ACTIVE_UNITS); do \
sudo ln -sf $(CURDIR)/systemd/$$unit /etc/systemd/system/$$unit; \
sudo rm -f /etc/systemd/system/$$unit; \
sudo cp $(CURDIR)/systemd/$$unit /etc/systemd/system/$$unit; \
sudo chmod 644 /etc/systemd/system/$$unit; \
done
@sudo systemctl daemon-reload
@echo "🟢 Enabling services..."
@sudo systemctl enable --now proxy.service tailscale-gate.service openbao.service
@echo "⏰ Enabling timers..."
@sudo systemctl enable --now ingestion.timer traffic-generator.timer
@sudo systemctl enable --now ingestion.timer

reload-services:
@echo "Reloading systemd units..."
Expand Down
Loading