Merged
47 changes: 47 additions & 0 deletions .github/workflows/build-and-push-uptime-service.yml
@@ -0,0 +1,47 @@
---
name: Build and push Uptime Service docker image

on:
  workflow_call:
    inputs:
      ref:
        description: "git ref: hash, branch, tag to build uptime-service files from"
        type: string
        required: true

jobs:
  main:
    name: Build Uptime Service
    runs-on: ubuntu-24.04
    steps:
      - name: Checkout source code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.ref }}
          fetch-depth: 0

      - name: Call action get-ref-properties
        id: get-ref-properties
        uses: Cardinal-Cryptography/github-actions/get-ref-properties@v7

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to Public Amazon ECR
        uses: docker/login-action@v3
        with:
          registry: ${{ vars.ECR_PUBLIC_HOST }}
          username: ${{ secrets.AWS_MAINNET_ECR_CC_ACCESS_KEY_ID }}
          password: ${{ secrets.AWS_MAINNET_ECR_CC_ACCESS_KEY }}

      - name: Build and push docker image
        id: build-image
        uses: docker/build-push-action@v6
        with:
          context: ./ts/uptime-service
          file: ./ts/uptime-service/Dockerfile
          push: true
          # yamllint disable rule:line-length
          tags: |
            ${{ vars.ECR_CC_RES_PUBLIC_REGISTRY }}uptime-service:${{ steps.get-ref-properties.outputs.sha }}
            ${{ github.ref == 'refs/heads/main' && format('{0}uptime-service:latest', vars.ECR_CC_RES_PUBLIC_REGISTRY) || '' }}
1 change: 1 addition & 0 deletions ts/pnpm-workspace.yaml
@@ -6,3 +6,4 @@ packages:
- "shielder-sdk"
- "shielder-sdk-tests"
- "!shielder-sdk-crypto-mobile"
- "!uptime-service"
27 changes: 27 additions & 0 deletions ts/uptime-service/.env.example
@@ -0,0 +1,27 @@
# Port for the metrics HTTP server
PORT=9615

# Interval between health check probes in milliseconds
PROBE_INTERVAL=10000

# HTTP request timeout in milliseconds
TIMEOUT=5000

# List of endpoints to monitor (JSON array format)
# Each endpoint should have:
# - name: Unique identifier for the service
# - url: Full URL of the health endpoint
# - method: HTTP method (optional, defaults to GET)
# - expectedStatus: Expected HTTP status code (optional, defaults to 200)
ENDPOINTS='[
  {
    "name": "example-api",
    "url": "http://localhost:3000/health",
    "method": "GET",
    "expectedStatus": 200
  },
  {
    "name": "example-database",
    "url": "http://localhost:5432/health"
  }
]'
7 changes: 7 additions & 0 deletions ts/uptime-service/.gitignore
@@ -0,0 +1,7 @@
node_modules/
.env
*.log
.DS_Store
dist/
build/
coverage/
41 changes: 41 additions & 0 deletions ts/uptime-service/Dockerfile
@@ -0,0 +1,41 @@
FROM oven/bun:1.1.38-slim AS builder

WORKDIR /app

# Copy package files
COPY package.json bun.lock* ./

# Install dependencies
RUN bun install --frozen-lockfile --production

# Copy source code
COPY src ./src

FROM oven/bun:1.1.38-slim

WORKDIR /app

# Install ca-certificates for HTTPS requests
RUN apt-get update && \
    apt-get install -y ca-certificates && \
    rm -rf /var/lib/apt/lists/*

# Copy dependencies and source from builder
COPY --from=builder /app/node_modules ./node_modules
COPY --from=builder /app/package.json ./package.json
COPY --from=builder /app/src ./src

# Create non-root user
RUN useradd -r -s /bin/false appuser && \
    chown -R appuser:appuser /app

USER appuser

# Expose metrics port
EXPOSE 9615

# Health check
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD bun run -e "fetch('http://localhost:9615/health').then(r => r.ok ? process.exit(0) : process.exit(1)).catch(() => process.exit(1))"
Comment on lines +35 to +39

⚠️ Potential issue | 🔴 Critical

Port mismatch between the image and the runtime default. The container exposes and probes port 9615, yet `loadConfig` leaves the service listening on 9090 by default, so unless `PORT` is set, every healthcheck will report "unhealthy." Align the two by exporting the expected port inside the image or by updating the runtime default.

```diff
+# Ensure the runtime listens on the probed port
+ENV PORT=9615
 # Expose metrics port
 EXPOSE 9615
```

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change

```diff
-EXPOSE 9615
-# Health check
-HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
-    CMD bun run -e "fetch('http://localhost:9615/health').then(r => r.ok ? process.exit(0) : process.exit(1)).catch(() => process.exit(1))"
+# Ensure the runtime listens on the probed port
+ENV PORT=9615
+EXPOSE 9615
+# Health check
+HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
+    CMD bun run -e "fetch('http://localhost:9615/health').then(r => r.ok ? process.exit(0) : process.exit(1)).catch(() => process.exit(1))"
```

🤖 Prompt for AI Agents

In ts/uptime-service/Dockerfile around lines 35 to 39, the HEALTHCHECK and
EXPOSE use port 9615 while the app defaults to port 9090, causing healthchecks
to fail; fix by making the image and runtime agree — either export the expected
PORT (e.g., add ENV PORT=9615) so the service listens on 9615, or change the
EXPOSE and HEALTHCHECK to 9090 to match the app default; apply only one approach
and ensure the chosen port is documented in the container runtime config.

Comment on lines +38 to +39

⚠️ Potential issue | 🔴 Critical

The healthcheck misfires: the script calls `bun run -e`, but Bun evaluates inline code with plain `bun -e`, so the probe fails before it begins. Switching to `bun -e` lets the healthcheck report status as intended.

```diff
-  CMD bun run -e "fetch('http://localhost:9615/health').then(r => r.ok ? process.exit(0) : process.exit(1)).catch(() => process.exit(1))"
+  CMD bun -e "fetch('http://localhost:9615/health').then(r => r.ok ? process.exit(0) : process.exit(1)).catch(() => process.exit(1))"
```

📝 Committable suggestion

Suggested change

```diff
 HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
-    CMD bun run -e "fetch('http://localhost:9615/health').then(r => r.ok ? process.exit(0) : process.exit(1)).catch(() => process.exit(1))"
+    CMD bun -e "fetch('http://localhost:9615/health').then(r => r.ok ? process.exit(0) : process.exit(1)).catch(() => process.exit(1))"
```

🤖 Prompt for AI Agents

In ts/uptime-service/Dockerfile around lines 38 to 39, the HEALTHCHECK uses "bun
run -e" which is incorrect for evaluating inline JS with Bun; change the command
to use "bun -e" (i.e., replace "bun run -e" with "bun -e") so the healthcheck
executes the inline fetch expression correctly and returns proper exit codes.


ENTRYPOINT ["bun", "run", "src/index.js"]
181 changes: 181 additions & 0 deletions ts/uptime-service/README.md
@@ -0,0 +1,181 @@
# Uptime Monitoring Service

A lightweight Node.js/Bun service that monitors health endpoints and exposes metrics in Prometheus format for Grafana dashboards and alerting.

## Prerequisites

- [Bun](https://bun.sh/) installed on your system
- Services with health check endpoints to monitor

## Installation

1. Clone or download this repository
2. Install dependencies:

```bash
bun install
```

3. Create a `.env` file based on `.env.example`:

```bash
cp .env.example .env
```

4. Configure your endpoints in the `.env` file

## Configuration

All configuration is done via environment variables:

| Variable | Description | Default | Required |
| ---------------- | ----------------------------------- | ------- | -------- |
| `PORT` | Port for the metrics server | `9090` | No |
| `PROBE_INTERVAL` | Interval between health checks (ms) | `30000` | No |
| `TIMEOUT` | HTTP request timeout (ms) | `5000` | No |
| `ENDPOINTS` | JSON array of endpoints to monitor | - | Yes |
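The config loader itself is not part of this diff, so as a sketch only, reading these variables with the documented defaults might look like the following (the name `loadConfig` and the returned shape are assumptions):

```javascript
// Hypothetical sketch of config loading; the real loadConfig in src/index.js
// is not shown in this PR, so names and structure here are illustrative only.
function loadConfig(env) {
  if (!env.ENDPOINTS) {
    throw new Error("ENDPOINTS is required"); // the only required variable
  }
  return {
    port: Number(env.PORT ?? 9090),                      // metrics server port
    probeInterval: Number(env.PROBE_INTERVAL ?? 30000),  // ms between probes
    timeout: Number(env.TIMEOUT ?? 5000),                // per-request timeout, ms
    endpoints: JSON.parse(env.ENDPOINTS),                // JSON array of targets
  };
}

const cfg = loadConfig({
  ENDPOINTS: '[{"name":"api","url":"http://localhost:3000/health"}]',
});
console.log(cfg.port, cfg.probeInterval, cfg.endpoints.length); // 9090 30000 1
```

Note that the default of 9090 here matters for the Dockerfile review comments above: the image probes 9615, so `PORT` must be set explicitly in the container.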

### Endpoint Configuration

The `ENDPOINTS` variable should contain a JSON array with the following structure:

```json
[
  {
    "name": "api-service",
    "url": "http://api.example.com/health",
    "method": "GET",
    "expectedStatus": 200
  },
  {
    "name": "database",
    "url": "http://localhost:5432/health"
  }
]
```

**Endpoint fields:**

- `name` (required): Unique identifier for the service
- `url` (required): Full URL of the health endpoint
- `method` (optional): HTTP method, defaults to `GET`
- `expectedStatus` (optional): Expected HTTP status code, defaults to `200`
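Applying the defaults above to one entry can be sketched as follows (the helper name `normalizeEndpoint` is hypothetical; the actual code is not in this diff):

```javascript
// Illustrative normalization of a single endpoint entry.
function normalizeEndpoint(ep) {
  if (!ep.name || !ep.url) {
    throw new Error("endpoint requires both 'name' and 'url'");
  }
  return {
    name: ep.name,
    url: ep.url,
    method: ep.method ?? "GET",               // optional, defaults to GET
    expectedStatus: ep.expectedStatus ?? 200, // optional, defaults to 200
  };
}

const ep = normalizeEndpoint({
  name: "database",
  url: "http://localhost:5432/health",
});
console.log(ep.method, ep.expectedStatus); // GET 200
```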

### Example Configuration

```env
PORT=9090
PROBE_INTERVAL=30000
TIMEOUT=5000
ENDPOINTS='[
  {"name":"frontend","url":"http://localhost:3000/health"},
  {"name":"backend-api","url":"http://localhost:8080/health","expectedStatus":200},
  {"name":"redis","url":"http://localhost:6379/health"}
]'
```
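How `TIMEOUT` and `expectedStatus` combine in a single probe can be sketched as below; the real probe logic lives in `src/index.js`, which this diff does not show, so the function name and result shape are assumptions. `fetchImpl` is injected so the logic can be exercised without a live service:

```javascript
// Illustrative single health probe: aborts slow requests and compares the
// HTTP status against expectedStatus (default 200).
async function probe(endpoint, timeoutMs, fetchImpl = fetch) {
  const started = Date.now();
  try {
    const res = await fetchImpl(endpoint.url, {
      method: endpoint.method ?? "GET",
      signal: AbortSignal.timeout(timeoutMs), // abort on timeout
    });
    return {
      up: res.status === (endpoint.expectedStatus ?? 200) ? 1 : 0,
      seconds: (Date.now() - started) / 1000,
    };
  } catch {
    // Network errors and timeouts both count as "down".
    return { up: 0, seconds: (Date.now() - started) / 1000 };
  }
}

// Stubbed fetch returning HTTP 200, so the probe reports the service as up.
const fakeFetch = async () => ({ status: 200 });
probe({ name: "api", url: "http://localhost:3000/health" }, 5000, fakeFetch)
  .then((r) => console.log(r.up)); // 1
```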

## Running the Service

### Development Mode

```bash
bun run dev
```

This runs the service with auto-reload on file changes.

### Production Mode

```bash
bun start
```

Or run directly:

```bash
bun run src/index.js
```

## Exposed Endpoints

The service exposes the following HTTP endpoints:

- **`/metrics`** - Prometheus metrics endpoint (for scraping)
- **`/health`** - Health check for the service itself
- **`/`** - Service information and available endpoints

## Prometheus Metrics

The service exposes the following metrics:

### `service_up`

**Type:** Gauge
**Description:** Service availability status (1 = up, 0 = down)
**Labels:** `service_name`, `endpoint`

### `service_response_time_seconds`

**Type:** Histogram
**Description:** Service response time in seconds
**Labels:** `service_name`, `endpoint`
**Buckets:** 0.001, 0.01, 0.1, 0.5, 1, 2, 5, 10

### `service_last_probe_timestamp`

**Type:** Gauge
**Description:** Unix timestamp of the last probe attempt
**Labels:** `service_name`, `endpoint`
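A scrape of `/metrics` returns these samples in the Prometheus text exposition format. The service presumably uses a client library for this; a hand-rolled sketch of one gauge line, for illustration only:

```javascript
// Illustrative formatter for a single gauge sample in Prometheus text format.
// A real service would typically use a library such as prom-client instead.
function renderGauge(name, labels, value) {
  const labelStr = Object.entries(labels)
    .map(([k, v]) => `${k}="${v}"`)
    .join(",");
  return `${name}{${labelStr}} ${value}`;
}

console.log(
  renderGauge(
    "service_up",
    { service_name: "api-service", endpoint: "http://api.example.com/health" },
    1,
  ),
);
// service_up{service_name="api-service",endpoint="http://api.example.com/health"} 1
```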

## Grafana Dashboard

### Example Queries

**Current Uptime Status:**

```promql
service_up
```

**Uptime Percentage (last 24h):**

```promql
avg_over_time(service_up[24h]) * 100
```

**Average Response Time:**

```promql
rate(service_response_time_seconds_sum[5m]) / rate(service_response_time_seconds_count[5m])
```

### Alert Rules

**Service Down Alert:**

```yaml
groups:
  - name: uptime_alerts
    rules:
      - alert: ServiceDown
        expr: service_up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.service_name }} is down"
          description: "{{ $labels.service_name }} has been down for more than 2 minutes"
```

**High Response Time Alert:**

```yaml
      - alert: HighResponseTime
        expr: rate(service_response_time_seconds_sum[5m]) / rate(service_response_time_seconds_count[5m]) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High response time for {{ $labels.service_name }}"
          description: "{{ $labels.service_name }} response time is above 1 second"
```