diff --git a/.github/workflows/client-linters.yml b/.github/workflows/client-linters.yml index 7fab009f..203a7934 100644 --- a/.github/workflows/client-linters.yml +++ b/.github/workflows/client-linters.yml @@ -34,6 +34,3 @@ jobs: - name: Run Type Check run: npx --package=typescript@latest -- tsc --build . working-directory: client - - - diff --git a/.github/workflows/deploy-to-k8s.yml b/.github/workflows/deploy-to-k8s.yml index 05bc1571..3d40c0de 100644 --- a/.github/workflows/deploy-to-k8s.yml +++ b/.github/workflows/deploy-to-k8s.yml @@ -217,7 +217,6 @@ jobs: build-args: API_URL=${{ needs.setup.outputs.api_url }} platforms: linux/amd64 - deploy: needs: - build-client @@ -248,6 +247,7 @@ jobs: AUTH_URL=auth.whiteboard.student.k8s.aet.cit.tum.de GENAI_URL=genai.whiteboard.student.k8s.aet.cit.tum.de REALTIME_URL=realtime.whiteboard.student.k8s.aet.cit.tum.de + METRICS_URL=metrics.whiteboard.student.k8s.aet.cit.tum.de echo "NAMESPACE=production" >> $GITHUB_ENV echo "IMAGE_TAG=latest" >> $GITHUB_ENV echo "VALUES_FILE=./infrastructure/whiteboard-app/production.values.yaml" >> $GITHUB_ENV @@ -261,6 +261,7 @@ jobs: AUTH_URL=staging.auth.whiteboard.student.k8s.aet.cit.tum.de GENAI_URL=staging.genai.whiteboard.student.k8s.aet.cit.tum.de REALTIME_URL=staging.realtime.whiteboard.student.k8s.aet.cit.tum.de + METRICS_URL=staging.metrics.whiteboard.student.k8s.aet.cit.tum.de echo "NAMESPACE=staging" >> $GITHUB_ENV echo "IMAGE_TAG=develop" >> $GITHUB_ENV echo "VALUES_FILE=./infrastructure/whiteboard-app/staging.values.yaml" >> $GITHUB_ENV @@ -275,6 +276,7 @@ jobs: AUTH_URL=$BRANCH_SAFE.auth.whiteboard.student.k8s.aet.cit.tum.de GENAI_URL=$BRANCH_SAFE.genai.whiteboard.student.k8s.aet.cit.tum.de REALTIME_URL=$BRANCH_SAFE.realtime.whiteboard.student.k8s.aet.cit.tum.de + METRICS_URL=$BRANCH_SAFE.metrics.whiteboard.student.k8s.aet.cit.tum.de echo "NAMESPACE=$BRANCH_SAFE" >> $GITHUB_ENV echo "IMAGE_TAG=$BRANCH_SAFE" >> $GITHUB_ENV echo "VALUES_FILE=./infrastructure/whiteboard-app/pullrequest.values.yaml" >> $GITHUB_ENV @@ -290,6 +292,7 @@ jobs: echo "GENAI_URL=$GENAI_URL" >> $GITHUB_ENV echo "OPEN_WEB_UI_API_KEY=${{ secrets.OPEN_WEB_UI_API_KEY }}" >> $GITHUB_ENV echo "REALTIME_URL=$REALTIME_URL" >> $GITHUB_ENV + echo "METRICS_URL=$METRICS_URL" >> $GITHUB_ENV echo "KEYCLOAK_CLIENT_SECRET=$KEYCLOAK_CLIENT_SECRET" >> $GITHUB_ENV echo "NEXTAUTH_SECRET=$NEXTAUTH_SECRET" >> $GITHUB_ENV echo "POSTGRESQL_SECRET=$POSTGRESQL_SECRET" >> $GITHUB_ENV @@ -313,15 +316,17 @@ jobs: if [[ "$BRANCH" == "main" ]]; then RELEASE_NAME="whiteboard-production" + OBSERVABILITY_RELEASE_NAME="whiteboard-observability-production" elif [[ "$BRANCH" == "develop" ]]; then RELEASE_NAME="whiteboard-staging" + OBSERVABILITY_RELEASE_NAME="whiteboard-observability-staging" else PR_NUMBER=${{ github.event.pull_request.number }} RELEASE_NAME="whiteboard-pr-${PR_NUMBER}" fi echo "RELEASE_NAME=${RELEASE_NAME}" >> $GITHUB_ENV - echo "release-name=${RELEASE_NAME}" >> $GITHUB_OUTPUT + echo "OBSERVABILITY_RELEASE_NAME=${OBSERVABILITY_RELEASE_NAME}" >> $GITHUB_ENV - name: Install Helm uses: azure/setup-helm@v3 @@ -352,6 +357,23 @@ jobs: --set keycloak.externalDatabase.password="${{ env.POSTGRESQL_SECRET }}" \ --set keycloak.auth.adminPassword="${{ env.KEYCLOAK_SECRET }}" \ + - name: Deploy Observability Stack with Helm + if: github.ref == 'refs/heads/develop' || github.ref == 'refs/heads/main' + run: | + helm upgrade ${{ env.OBSERVABILITY_RELEASE_NAME }} ./infrastructure/whiteboard-observability/ \ + -f ${{ env.VALUES_FILE }} \ + -n tsd-${{ env.NAMESPACE }} \ + --create-namespace \ + --install \ + --atomic \ + --kubeconfig ${{ env.KUBECONFIG }} \ + --set namespace="${{ env.NAMESPACE }}" \ + --set client.url="${{ env.CLIENT_URL }}" \ + --set server.url="${{ env.SERVER_URL }}" \ + --set genai.url="${{ env.GENAI_URL }}" \ + --set realtime.url="${{ env.REALTIME_URL }}" \ + --set metrics.url="${{ env.METRICS_URL }}" + comment-pr: needs: deploy runs-on: ubuntu-latest diff --git a/.github/workflows/genai-linters.yml b/.github/workflows/genai-linters.yml index 209e78e7..0f871a48 100644 --- a/.github/workflows/genai-linters.yml +++ b/.github/workflows/genai-linters.yml @@ -29,7 +29,7 @@ jobs: run: | cd genai ruff check . - + - name: GenAI format (auto-fix) run: | cd genai diff --git a/.github/workflows/genai-tests.yml b/.github/workflows/genai-tests.yml index 0578a198..2ea3caa7 100644 --- a/.github/workflows/genai-tests.yml +++ b/.github/workflows/genai-tests.yml @@ -26,7 +26,7 @@ jobs: uv pip install -r ./genai/requirements.txt --system - name: GenAI tests - env: + env: OPEN_WEB_UI_API_KEY: ${{ secrets.OPEN_WEB_UI_API_KEY }} API_URL: ${{ vars.API_URL }} run: | diff --git a/client/src/api/genai/generated/api.ts b/client/src/api/genai/generated/api.ts index fb44ba26..74d24aba 100644 --- a/client/src/api/genai/generated/api.ts +++ b/client/src/api/genai/generated/api.ts @@ -60,10 +60,10 @@ export interface HTTPValidationError { export interface TextRequest { /** * - * @type {Array} + * @type {string} * @memberof TextRequest */ - user_text: Array; + user_text: string; } /** * @@ -211,6 +211,45 @@ export const DefaultApiAxiosParamCreator = function ( options: localVarRequestOptions, }; }, + /** + * Endpoint that serves Prometheus metrics. + * @summary Metrics + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + metricsMetricsGet: async ( + options: RawAxiosRequestConfig = {}, + ): Promise => { + const localVarPath = `/metrics`; + // use dummy base URL string because the URL constructor only accepts absolute URLs. + const localVarUrlObj = new URL(localVarPath, DUMMY_BASE_URL); + let baseOptions; + if (configuration) { + baseOptions = configuration.baseOptions; + } + + const localVarRequestOptions = { + method: "GET", + ...baseOptions, + ...options, + }; + const localVarHeaderParameter = {} as any; + const localVarQueryParameter = {} as any; + + setSearchParams(localVarUrlObj, localVarQueryParameter); + let headersFromBaseOptions = + baseOptions && baseOptions.headers ? baseOptions.headers : {}; + localVarRequestOptions.headers = { + ...localVarHeaderParameter, + ...headersFromBaseOptions, + ...options.headers, + }; + + return { + url: toPathString(localVarUrlObj), + options: localVarRequestOptions, + }; + }, /** * * @summary Rephrase Text @@ -382,6 +421,32 @@ export const DefaultApiFp = function (configuration?: Configuration) { configuration, )(axios, localVarOperationServerBasePath || basePath); }, + /** + * Endpoint that serves Prometheus metrics. + * @summary Metrics + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + async metricsMetricsGet( + options?: RawAxiosRequestConfig, + ): Promise< + (axios?: AxiosInstance, basePath?: string) => AxiosPromise + > { + const localVarAxiosArgs = + await localVarAxiosParamCreator.metricsMetricsGet(options); + const localVarOperationServerIndex = configuration?.serverIndex ?? 0; + const localVarOperationServerBasePath = + operationServerMap["DefaultApi.metricsMetricsGet"]?.[ + localVarOperationServerIndex + ]?.url; + return (axios, basePath) => + createRequestFunction( + localVarAxiosArgs, + globalAxios, + BASE_PATH, + configuration, + )(axios, localVarOperationServerBasePath || basePath); + }, /** * * @summary Rephrase Text @@ -484,6 +549,17 @@ export const DefaultApiFactory = function ( .healthCheckHealthGet(options) .then((request) => request(axios, basePath)); }, + /** + * Endpoint that serves Prometheus metrics. + * @summary Metrics + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + metricsMetricsGet(options?: RawAxiosRequestConfig): AxiosPromise { + return localVarFp + .metricsMetricsGet(options) + .then((request) => request(axios, basePath)); + }, /** * * @summary Rephrase Text @@ -554,6 +630,19 @@ export class DefaultApi extends BaseAPI { .then((request) => request(this.axios, this.basePath)); } + /** + * Endpoint that serves Prometheus metrics. + * @summary Metrics + * @param {*} [options] Override http request option. + * @throws {RequiredError} + * @memberof DefaultApi + */ + public metricsMetricsGet(options?: RawAxiosRequestConfig) { + return DefaultApiFp(this.configuration) + .metricsMetricsGet(options) + .then((request) => request(this.axios, this.basePath)); + } + /** * * @summary Rephrase Text diff --git a/client/src/api/genai/generated/docs/DefaultApi.md b/client/src/api/genai/generated/docs/DefaultApi.md index 6024228d..f5b18353 100644 --- a/client/src/api/genai/generated/docs/DefaultApi.md +++ b/client/src/api/genai/generated/docs/DefaultApi.md @@ -6,6 +6,7 @@ All URIs are relative to *http://localhost:8000* |------------- | ------------- | -------------| |[**completeTextCompletionPost**](#completetextcompletionpost) | **POST** /completion | Complete Text| |[**healthCheckHealthGet**](#healthcheckhealthget) | **GET** /health | Health Check| +|[**metricsMetricsGet**](#metricsmetricsget) | **GET** /metrics | Metrics| |[**rephraseTextRephrasePost**](#rephrasetextrephrasepost) | **POST** /rephrase | Rephrase Text| |[**summarizeTextSummarizationPost**](#summarizetextsummarizationpost) | **POST** /summarization | Summarize Text| @@ -83,6 +84,50 @@ const { status, data } = await apiInstance.healthCheckHealthGet(); This endpoint does not have any parameters. +### Return type + +**any** + +### Authorization + +No authorization required + +### HTTP request headers + + - **Content-Type**: Not defined + - **Accept**: application/json + + +### HTTP response details +| Status code | Description | Response headers | +|-------------|-------------|------------------| +|**200** | Successful Response | - | + +[[Back to top]](#) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to Model list]](../README.md#documentation-for-models) [[Back to README]](../README.md) + +# **metricsMetricsGet** +> any metricsMetricsGet() + +Endpoint that serves Prometheus metrics. + +### Example + +```typescript +import { + DefaultApi, + Configuration +} from './api'; + +const configuration = new Configuration(); +const apiInstance = new DefaultApi(configuration); + +const { status, data } = await apiInstance.metricsMetricsGet(); +``` + +### Parameters +This endpoint does not have any parameters. + + ### Return type **any** diff --git a/client/src/api/genai/generated/docs/TextRequest.md b/client/src/api/genai/generated/docs/TextRequest.md index 4501c09f..98c5e74a 100644 --- a/client/src/api/genai/generated/docs/TextRequest.md +++ b/client/src/api/genai/generated/docs/TextRequest.md @@ -5,7 +5,7 @@ Name | Type | Description | Notes ------------ | ------------- | ------------- | ------------- -**user_text** | **Array<string>** | | [default to undefined] +**user_text** | **string** | | [default to undefined] ## Example diff --git a/client/src/components/style-bar/StyleBar.tsx b/client/src/components/style-bar/StyleBar.tsx index 68a6832a..955dea3d 100644 --- a/client/src/components/style-bar/StyleBar.tsx +++ b/client/src/components/style-bar/StyleBar.tsx @@ -100,11 +100,11 @@ const StyleBar = ({ let data; if (action === "rephrase") { - data = await rephraseText({ user_text: [selectedNodeLabel] }); + data = await rephraseText({ user_text: selectedNodeLabel }); } else if (action === "complete") { - data = await completeText({ user_text: [selectedNodeLabel] }); + data = await completeText({ user_text: selectedNodeLabel }); } else { - data = await summarizedText({ user_text: [selectedNodeLabel] }); + data = await summarizedText({ user_text: selectedNodeLabel }); } const llmResponse = data.llm_response; diff --git a/client/src/components/text-node/TextNode.tsx b/client/src/components/text-node/TextNode.tsx index 0b7d7aca..c275efb9 100644 --- a/client/src/components/text-node/TextNode.tsx +++ b/client/src/components/text-node/TextNode.tsx @@ -7,8 +7,6 @@ import { Position, useReactFlow, } from "@xyflow/react"; -import { Eye, EyeOff } from "lucide-react"; -import { Button } from "@/components/ui/button"; import { getFontStyle, handleStyle, @@ -16,6 +14,9 @@ import { } from "@/types/NodeProperties"; import StyleBar from "@/components/style-bar/StyleBar"; import { updateNode } from "@/util/updateNode"; +import { useAmIOwner } from "@/hooks/api/whiteboard.api"; +import { useGetMe } from "@/hooks/api/account.api"; +import { useParams } from "next/navigation"; interface TextNodeProps extends NodeProps { id: string; @@ -40,9 +41,14 @@ function hexToRgb(hex: string) { export default function TextNode({ id, data, selected }: TextNodeProps) { const [isEditing, setIsEditing] = useState(false); const [text, setText] = useState(data.label as string); - const [showStyleBar, setShowStyleBar] = useState(true); const { setNodes } = useReactFlow(); + const params = useParams(); + const whiteboardId = Number(params.id); + + const { data: user } = useGetMe(); + const { data: isOwner } = useAmIOwner(whiteboardId, user?.id); + const { nodeProperties, label } = data; const bgRgb = hexToRgb(nodeProperties.color); const borderRgb = hexToRgb(nodeProperties.borderColor); @@ -68,7 +74,7 @@ export default function TextNode({ id, data, selected }: TextNodeProps) { return ( <> - {showStyleBar && ( + {isOwner && ( )} - -
- -
-
- +
0.1", + "interval":"", + "refId":"Anno" + }, + "textFormat":"High rate of client errors detected, exceeding 0.1 requests per second", + "titleFormat":"Client Error Spike" + }, + { + "datasource":{ + "type":"prometheus", + "uid":"PBFA97CFB590B2093" + }, + "enable":true, + "hide":false, + "iconColor":"red", + "name":"Server Error Spike", + "tagKeys":"server, error", + "target":{ + "expr":"sum(rate(http_requests_total{status=~\"5xx\"}[5m])) > 0.1", + "interval":"", + "refId":"Anno" + }, + "textFormat":"Server error rate exceeded threshold (>0.1 req/s)", + "titleFormat":"Server Error Spike" + } + ] + }, + "description":"Dashboard showing system metrics including request count, latency, and error rate.", + "editable":true, + "fiscalYearStartMonth":0, + "graphTooltip":0, + "id":3, + "links":[ + + ], + "panels":[ + { + "datasource":{ + "type":"prometheus", + "uid":"PBFA97CFB590B2093" + }, + "fieldConfig":{ + "defaults":{ + "color":{ + "mode":"palette-classic" + }, + "custom":{ + "axisBorderShow":false, + "axisCenteredZero":false, + "axisColorMode":"text", + "axisLabel":"", + "axisPlacement":"auto", + "barAlignment":0, + "barWidthFactor":0.6, + "drawStyle":"line", + "fillOpacity":20, + "gradientMode":"none", + "hideFrom":{ + "legend":false, + "tooltip":false, + "viz":false + }, + "insertNulls":false, + "lineInterpolation":"smooth", + "lineWidth":2, + "pointSize":5, + "scaleDistribution":{ + "type":"linear" + }, + "showPoints":"auto", + "spanNulls":false, + "stacking":{ + "group":"A", + "mode":"none" + }, + "thresholdsStyle":{ + "mode":"area" + } + }, + "mappings":[ + + ], + "thresholds":{ + "mode":"absolute", + "steps":[ + { + "color":"green" + }, + { + "color":"orange", + "value":70 + }, + { + "color":"red", + "value":80 + } + ] + } + }, + "overrides":[ + + ] + }, + "gridPos":{ + "h":17, + "w":12, + "x":0, + "y":0 + }, + "id":1, + "options":{ + "legend":{ + "calcs":[ + "mean", + "max" + ], + "displayMode":"table", + "placement":"bottom", + "showLegend":true + }, + "tooltip":{ + "hideZeros":false, + "mode":"multi", + "sort":"none" + } + }, + "pluginVersion":"12.0.2", + "targets":[ + { + "datasource":{ + "type":"prometheus", + "uid":"PBFA97CFB590B2093" + }, + "editorMode":"code", + "expr":"sum by (handler, method, status) (increase(http_requests_total{job=\"genai_job\"}[5m]))", + "legendFormat":"{{method}} {{status}} {{uri}}", + "range":true, + "refId":"A" + } + ], + "title":"Request Count", + "type":"timeseries" + }, + { + "datasource":{ + "type":"prometheus", + "uid":"PBFA97CFB590B2093" + }, + "fieldConfig":{ + "defaults":{ + "color":{ + "mode":"palette-classic" + }, + "custom":{ + "axisBorderShow":false, + "axisCenteredZero":false, + "axisColorMode":"text", + "axisLabel":"Average Latency (seconds)", + "axisPlacement":"auto", + "barAlignment":0, + "barWidthFactor":0.6, + "drawStyle":"line", + "fillOpacity":20, + "gradientMode":"none", + "hideFrom":{ + "legend":false, + "tooltip":false, + "viz":false + }, + "insertNulls":false, + "lineInterpolation":"smooth", + "lineWidth":2, + "pointSize":5, + "scaleDistribution":{ + "type":"linear" + }, + "showPoints":"auto", + "spanNulls":false, + "stacking":{ + "group":"A", + "mode":"none" + }, + "thresholdsStyle":{ + "mode":"area" + } + }, + "mappings":[ + + ], + "thresholds":{ + "mode":"absolute", + "steps":[ + { + "color":"green" + }, + { + "color":"red", + "value":80 + } + ] + } + }, + "overrides":[ + + ] + }, + "gridPos":{ + "h":17, + "w":12, + "x":12, + "y":0 + }, + "id":2, + "options":{ + "legend":{ + "calcs":[ + "mean", + "max" + ], + "displayMode":"table", + "placement":"bottom", + "showLegend":true + }, + "tooltip":{ + "hideZeros":false, + "mode":"multi", + "sort":"none" + } + }, + "pluginVersion":"12.0.2", + "targets":[ + { + "datasource":{ + "type":"prometheus", + "uid":"PBFA97CFB590B2093" + }, + "editorMode":"code", + "expr":"sum(rate(http_request_duration_seconds_sum{job=\"genai_job\"}[5m])) by (method, handler) /\nsum(rate(http_request_duration_seconds_count{job=\"genai_job\"}[5m])) by (method, handler)", + "legendFormat":"{{method}} {{handler}}", + "range":true, + "refId":"A" + } + ], + "title":"Latency", + "type":"timeseries" + }, + { + "datasource":{ + "type":"prometheus", + "uid":"PBFA97CFB590B2093" + }, + "fieldConfig":{ + "defaults":{ + "color":{ + "mode":"palette-classic" + }, + "custom":{ + "axisBorderShow":false, + "axisCenteredZero":false, + "axisColorMode":"text", + "axisLabel":"", + "axisPlacement":"auto", + "barAlignment":0, + "barWidthFactor":0.6, + "drawStyle":"line", + "fillOpacity":0, + "gradientMode":"none", + "hideFrom":{ + "legend":false, + "tooltip":false, + "viz":false + }, + "insertNulls":false, + "lineInterpolation":"linear", + "lineWidth":1, + "pointSize":5, + "scaleDistribution":{ + "type":"linear" + }, + "showPoints":"auto", + "spanNulls":false, + "stacking":{ + "group":"A", + "mode":"none" + }, + "thresholdsStyle":{ + "mode":"off" + } + }, + "mappings":[ + + ], + "thresholds":{ + "mode":"absolute", + "steps":[ + { + "color":"green" + }, + { + "color":"red", + "value":80 + } + ] + } + }, + "overrides":[ + + ] + }, + "gridPos":{ + "h":12, + "w":12, + "x":0, + "y":17 + }, + "id":4, + "options":{ + "legend":{ + "calcs":[ + + ], + "displayMode":"list", + "placement":"bottom", + "showLegend":true + }, + "tooltip":{ + "hideZeros":false, + "mode":"single", + "sort":"none" + } + }, + "pluginVersion":"12.0.2", + "targets":[ + { + "editorMode":"code", + "expr":"sum by (handler, method) (increase(http_requests_total{status=~\"4..\", job=\"genai_job\"}[5m]))", + "legendFormat":"{{label_name}}", + "range":true, + "refId":"A" + }, + { + "datasource":{ + "type":"prometheus", + "uid":"PBFA97CFB590B2093" + }, + "editorMode":"code", + "expr":"sum by (handler, method) (increase(http_requests_total{status=~\"4..\", job=\"genai_job\"}[5m]))", + "hide":false, + "instant":false, + "legendFormat":"__auto", + "range":true, + "refId":"B" + } + ], + "title":"Errors", + "type":"timeseries" + }, + { + "datasource":{ + "type":"prometheus", + "uid":"PBFA97CFB590B2093" + }, + "description":"This dashboard displays the average number of input and output tokens generated per request for each operation (completion, summarization, rephrase_text) over time. The values represent the mean input and output token count, calculated every 5 minutes, grouped by operation.", + "fieldConfig":{ + "defaults":{ + "color":{ + "mode":"palette-classic" + }, + "custom":{ + "axisBorderShow":false, + "axisCenteredZero":false, + "axisColorMode":"text", + "axisLabel":"", + "axisPlacement":"auto", + "barAlignment":0, + "barWidthFactor":0.6, + "drawStyle":"line", + "fillOpacity":0, + "gradientMode":"none", + "hideFrom":{ + "legend":false, + "tooltip":false, + "viz":false + }, + "insertNulls":false, + "lineInterpolation":"linear", + "lineWidth":1, + "pointSize":5, + "scaleDistribution":{ + "type":"linear" + }, + "showPoints":"auto", + "spanNulls":false, + "stacking":{ + "group":"A", + "mode":"none" + }, + "thresholdsStyle":{ + "mode":"off" + } + }, + "mappings":[ + + ], + "thresholds":{ + "mode":"absolute", + "steps":[ + { + "color":"green" + }, + { + "color":"red", + "value":80 + } + ] + } + }, + "overrides":[ + + ] + }, + "gridPos":{ + "h":12, + "w":12, + "x":12, + "y":17 + }, + "id":5, + "options":{ + "legend":{ + "calcs":[ + + ], + "displayMode":"list", + "placement":"bottom", + "showLegend":true + }, + "tooltip":{ + "hideZeros":false, + "mode":"single", + "sort":"none" + } + }, + "pluginVersion":"12.0.2", + "targets":[ + { + "editorMode":"code", + "exemplar":false, + "expr":"rate(llm_token_count_sum{type=\"output\"}[5m]) \n/\nrate(llm_token_count_count{type=\"output\"}[5m])", + "format":"time_series", + "instant":false, + "legendFormat":"output - {{operation}}", + "range":true, + "refId":"A" + }, + { + "datasource":{ + "type":"prometheus", + "uid":"PBFA97CFB590B2093" + }, + "editorMode":"code", + "expr":"rate(llm_token_count_sum{type=\"input\"}[5m]) \n/\nrate(llm_token_count_count{type=\"input\"}[5m])", + "hide":false, + "instant":false, + "legendFormat":"input - {{operation}}", + "range":true, + "refId":"B" + } + ], + "title":"LLM API Token Analytics", + "type":"timeseries" + } + ], + "preload":false, + "refresh":"10s", + "schemaVersion":41, + "tags":[ + "monitoring", + "alerts" + ], + "templating":{ + "list":[ + + ] + }, + "time":{ + "from":"now-6h", + "to":"now" + }, + "timepicker":{ + + }, + "timezone":"browser", + "title":"GenAi System Metrics Dashboard", + "uid":"genai-metrics-dashboard1111", + "version":2 +} diff --git a/docker/grafana/provisioning/dashboards/realtime-dashboard.json b/docker/grafana/provisioning/dashboards/realtime-dashboard.json new file mode 100644 index 00000000..fac0edbf --- /dev/null +++ b/docker/grafana/provisioning/dashboards/realtime-dashboard.json @@ -0,0 +1,593 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "enable": true, + "hide": false, + "iconColor": "red", + "name": "WebSocket error", + "tagKeys": "websocket_errors", + "target": { + "expr": "(\n rate(websocket_read_errors[5m]) +\n rate(websocket_write_errors[5m]) +\n rate(websocket_upgrade_errors[5m])\n) > 0.1", + "interval": "", + "refId": "Anno" + }, + "textFormat": "High WebSocket error rate detected - exceeding 0.1 errors per second", + "titleFormat": "WebSocket Error Rate" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 3, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 10, + "x": 0, + "y": 0 + }, + "id": 5, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "editorMode": "code", + "expr": "websocket_connections_active", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Websocket Connections Active", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 15, + "w": 14, + "x": 10, + "y": 0 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "editorMode": "code", + "expr": "rate(websocket_connection_duration_sum[5m]) / rate(websocket_connection_duration_count[5m])", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Connection Duration Summary", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "histogram_quantile(0.95, sum(rate(websocket_connection_duration_bucket[5m])) by (le))" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [] + } + ] + }, + "gridPos": { + "h": 11, + "w": 10, + "x": 0, + "y": 8 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(websocket_sent_messages[1m])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "Messages Sent", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "rate(websocket_received_messages[1m])", + "hide": false, + "instant": false, + "legendFormat": "Messages Received", + "range": true, + "refId": "B" + } + ], + "title": "Messages Sent/Received Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "description": "Distribution of WebSocket connection durations across different time buckets. Each bar represents a cumulative count of connections that lasted less than or equal to the specified duration (in seconds). For example, '60s' shows connections lasting up to 60 seconds, '120s' shows connections up to 120 seconds, and so on up to '+Inf' (unlimited duration).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "noValue" + } + ] + } + ] + }, + "gridPos": { + "h": 17, + "w": 14, + "x": 10, + "y": 15 + }, + "id": 4, + "options": { + "displayMode": "gradient", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "websocket_connection_duration_bucket{le=~\".+\"}", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "{{le}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Websocket Connection Duration", + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": [ + "Time", + "60.0", + "120.0", + "300.0", + "600.0", + "900.0", + "1200.0", + "1800.0", + "+Inf" + ] + } + } + } + ], + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "websocket_read_errors" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 13, + "w": 10, + "x": 0, + "y": 19 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "editorMode": "code", + "expr": "rate(websocket_read_errors[1m])", + "legendFormat": "websocket_read_errors", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "rate(websocket_write_errors[1m])", + "hide": false, + "instant": false, + "legendFormat": "websocket_write_errors", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "rate(websocket_upgrade_errors[1m])", + "hide": false, + "instant": false, + "legendFormat": "websocket_upgrade_errors", + "range": true, + "refId": "C" + } + ], + "title": "Websocket Error Rates", + "type": "timeseries" + } + ], + "preload": false, + "schemaVersion": 41, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Realtime System Metrics", + "uid": "275dd5ce-c9b5-4ed9-a6d0-cbac478716a1", + "version": 4 +} \ No newline at end of file diff --git a/docker/grafana/provisioning/dashboards/server-dashboard.json b/docker/grafana/provisioning/dashboards/server-dashboard.json new file mode 100644 index 00000000..caa6f15f --- /dev/null +++ b/docker/grafana/provisioning/dashboards/server-dashboard.json @@ -0,0 +1,394 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "enable": true, + "hide": false, + "iconColor": "orange", + "name": "Client Error Spike", + "tagKeys": "client-error, http-4xx", + "target": { + "expr": "sum(rate(http_server_requests_seconds_count{status=~\"4..\"}[5m])) > 0.1", + "interval": "", + "refId": "Anno" + }, + "textFormat": "High rate of client errors detected, exceeding 0.1 requests per second", + "titleFormat": "Client Error Spike" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "enable": true, + "hide": false, + "iconColor": "red", + "name": "Server Error Spike", + "tagKeys": "server, error", + "target": { + "expr": "sum(rate(http_server_requests_seconds_count{status=~\"5..\"}[5m])) > 0.1", + "interval": "", + "refId": "Anno" + }, + "textFormat": "Server error rate exceeded threshold (>0.1 req/s)", + "titleFormat": "Server Error Spike" + } + ] + }, + "description": "Dashboard showing system metrics including request count, latency, and error rate.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 1, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 17, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(rate(http_server_requests_seconds_count[5m])) by (method, uri)", + "legendFormat": "{{method}} {{status}} {{uri}}", + "range": true, + "refId": "A" + } + ], + "title": "Request Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Average Latency (seconds)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 17, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": " sum(rate(http_server_requests_seconds_sum[5m])) by (method, uri) /\nsum(rate(http_server_requests_seconds_count[5m])) by (method, uri)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(http_server_requests_seconds_count{outcome=\"CLIENT_ERROR\"}[5m])) by (method, uri, status)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(rate(http_server_requests_seconds_count{status=~\"5..\"}[5m])) by (method, uri, status)", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "B" + } + ], + "title": "Errors", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "10s", + "schemaVersion": 41, + "tags": [ + "monitoring", + "alerts" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Server System Metrics Dashboard", + "uid": "system-metrics-dashboard111112", + "version": 1 +} \ No newline at end of file diff --git a/docker/grafana/provisioning/datasources/prometheus.yml b/docker/grafana/provisioning/datasources/prometheus.yml new file mode 100644 index 00000000..8049912b --- /dev/null +++ b/docker/grafana/provisioning/datasources/prometheus.yml @@ -0,0 +1,8 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true \ No newline at end of file diff --git a/docker/mailhog/Dockerfile b/docker/mailhog/Dockerfile new file mode 100644 index 00000000..51730005 --- /dev/null +++ b/docker/mailhog/Dockerfile @@ -0,0 +1,25 @@ +FROM golang:1.18-alpine as builder + +# Install MailHog: +RUN apk --no-cache add --virtual build-dependencies \ + git \ + && mkdir -p /root/gocode \ + && export GOPATH=/root/gocode \ + && go install github.com/mailhog/MailHog@latest + +FROM alpine:3 +# Add mailhog user/group with uid/gid 1000. +# This is a workaround for boot2docker issue #581, see +# https://github.com/boot2docker/boot2docker/issues/581 +RUN adduser -D -u 1000 mailhog + +COPY --from=builder /root/gocode/bin/MailHog /usr/local/bin/ + +USER mailhog + +WORKDIR /home/mailhog + +ENTRYPOINT ["MailHog"] + +# Expose the SMTP and HTTP ports: +EXPOSE 1025 8025 \ No newline at end of file diff --git a/docker/prometheus/Dockerfile b/docker/prometheus/Dockerfile new file mode 100644 index 00000000..9c042ad1 --- /dev/null +++ b/docker/prometheus/Dockerfile @@ -0,0 +1 @@ +FROM prom/prometheus:v3.5.0 \ No newline at end of file diff --git a/docker/prometheus/config/alert.rules.yml b/docker/prometheus/config/alert.rules.yml new file mode 100644 index 00000000..e7aa7f87 --- /dev/null +++ b/docker/prometheus/config/alert.rules.yml @@ -0,0 +1,11 @@ +groups: + - name: service-availability + rules: + - alert: Service Down + expr: up{job=~"server_job|genai_job"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Service {{ $labels.job }} is down" + description: "Service {{ $labels.job }} on {{ $labels.instance }} has been down for more than 1 minute" \ No newline at end of file diff --git a/docker/prometheus/config/prometheus.yml b/docker/prometheus/config/prometheus.yml new file mode 100644 index 00000000..ee883ff0 --- /dev/null +++ b/docker/prometheus/config/prometheus.yml @@ -0,0 +1,31 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +alerting: + alertmanagers: + - static_configs: + - targets: + - 'alertmanager:9093' + +rule_files: + - "/etc/prometheus/alert.rules.yml" + +scrape_configs: + - job_name: 'server_job' + metrics_path: '/actuator/prometheus' + static_configs: + - targets: + - 'server:9091' + + - job_name: 'genai_job' + metrics_path: '/metrics' + static_configs: + - targets: + - 'genai:8000' + + - job_name: 'realtime_job' + metrics_path: '/metrics' + static_configs: + - targets: + - 'realtime:9090' \ No newline at end of file diff --git a/genai/app/core/__init__.py b/genai/app/core/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/genai/app/main.py b/genai/app/main.py index dc7b521a..01fbc3db 100644 --- a/genai/app/main.py +++ b/genai/app/main.py @@ -1,7 +1,10 @@ +from fastapi import FastAPI +from prometheus_fastapi_instrumentator import Instrumentator +from prometheus_client import Histogram import os import requests from typing import Any, List, Optional -from fastapi import FastAPI, HTTPException, APIRouter +from fastapi import HTTPException, APIRouter from pydantic import BaseModel from langchain.llms.base import LLM from langchain.callbacks.manager import CallbackManagerForLLMRun @@ -11,6 +14,13 @@ from fastapi.openapi.utils import get_openapi from dotenv import load_dotenv +# Initialize FastAPI app +app = FastAPI( + title="LLM Service", + description="OpenWebUI powered LLM service for text operations", + version="1.0.0", +) + # Setup logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -19,6 +29,8 @@ router = APIRouter() +Instrumentator().instrument(app).expose(app) + # Environment configuration OPEN_WEB_UI_API_KEY = os.getenv("OPEN_WEB_UI_API_KEY") API_URL = os.getenv("API_URL") @@ -26,6 +38,14 @@ CLIENT_URL = os.getenv("CLIENT_URL") GENAI_URL = os.getenv("GENAI_URL") + +LLM_TOKEN_COUNT = Histogram( + "llm_token_count", + "Number of tokens in requests/responses", + labelnames=["operation", "type"], +) + + class OpenWebUILLM(LLM): api_url: str = API_URL api_key: str = OPEN_WEB_UI_API_KEY @@ -84,14 +104,6 @@ def _call( raise Exception(f"API request failed: {str(e)}") -# Initialize FastAPI app -app = FastAPI( - title="LLM Service", - description="OpenWebUI powered LLM service for text operations", - version="1.0.0", -) - - @app.get("/v3/api-docs", include_in_schema=False) def custom_openapi(): return JSONResponse( @@ -117,7 +129,7 @@ def custom_openapi(): class TextRequest(BaseModel): - user_text: List[str] + user_text: str class TextResponse(BaseModel): @@ -126,19 +138,29 @@ class TextResponse(BaseModel): @router.post("/completion", response_model=TextResponse) async def complete_text(request: TextRequest): + operation = "completion" + try: - input_text = " ".join(request.user_text) + input_tokens = len(request.user_text.split(" ")) + LLM_TOKEN_COUNT.labels(operation=operation, type="input").observe(input_tokens) + prompt = f"""Complete the following text with exactly one natural sentence: - {input_text} - + {request.user_text} + Rules: - ALWAYS start your response with the exact input text - Add only ONE sentence - Keep the style consistent - Make it coherent with the input """ - logger.info(f"Processing completion request for text: {input_text}") + logger.info(f"Processing completion request for text: {request.user_text}") result = llm(prompt) + + output_tokens = len(result.split()) + LLM_TOKEN_COUNT.labels(operation=operation, type="output").observe( + output_tokens + ) + logger.info(f"Generated completion: {result}") return TextResponse(llm_response=result) except Exception as e: @@ -148,11 +170,22 @@ async def complete_text(request: TextRequest): @router.post("/summarization", response_model=TextResponse) async def summarize_text(request: TextRequest): + operation = "summarization" + try: + input_tokens = len(request.user_text.split(" ")) + LLM_TOKEN_COUNT.labels(operation=operation, type="input").observe(input_tokens) + prompt = f"""Summarize the following text concisely: - {' '.join(request.user_text)} + {request.user_text} """ result = llm(prompt) + + output_tokens = len(result.split()) + LLM_TOKEN_COUNT.labels(operation=operation, type="output").observe( + output_tokens + ) + return TextResponse(llm_response=result) except Exception as e: logger.error(f"Summarization error: {str(e)}") @@ -161,21 +194,30 @@ async def summarize_text(request: TextRequest): @router.post("/rephrase", response_model=TextResponse) async def rephrase_text(request: TextRequest): + operation = "rephrase_text" logger.info(f"Received rephrase request: {request}") + try: - input_text = " ".join(request.user_text) - word_count = len(input_text.split()) + input_tokens = len(request.user_text.split(" ")) + LLM_TOKEN_COUNT.labels(operation=operation, type="input").observe(input_tokens) + + word_count = len(request.user_text.split()) prompt = f"""Rephrase the following text: - {input_text} - + {request.user_text} + Rules: - Keep EXACTLY {word_count} words - Maintain the original meaning - Use similar tone and style - Make it sound natural """ - logger.info(f"Received rephrase request: {input_text}") + logger.info(f"Received rephrase request: {request.user_text}") result = llm(prompt) + + output_tokens = len(result.split()) + LLM_TOKEN_COUNT.labels(operation=operation, type="output").observe( + output_tokens + ) # Ensure exact word count result_words = result.split() if len(result_words) > word_count: diff --git a/genai/app/services/__init__.py b/genai/app/services/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/genai/app/test.py b/genai/app/test.py index 5acc29fd..02f65146 100644 --- a/genai/app/test.py +++ b/genai/app/test.py @@ -3,27 +3,29 @@ client = TestClient(app) + def test_health_check(): - response = client.get("/health") + response = client.get("/health") assert response.status_code == 200 assert "status" in response.json() def test_completion(): - payload = {"user_text": ["This is a test input."]} + payload = {"user_text": "This is a test input."} response = client.post("/completion", json=payload) assert response.status_code == 200 assert "llm_response" in response.json() + def test_summarization(): - payload = {"user_text": ["This is a long sentence that needs summarizing."]} + payload = {"user_text": "This is a long sentence that needs summarizing."} response = client.post("/summarization", json=payload) assert response.status_code == 200 assert "llm_response" in response.json() def test_rephrase(): - payload = {"user_text": ["This is a sample sentence."]} + payload = {"user_text": "This is a sample sentence."} response = client.post("/rephrase", json=payload) assert response.status_code == 200 - assert "llm_response" in response.json() \ No newline at end of file + assert "llm_response" in response.json() diff --git a/genai/app/utils/__init__.py b/genai/app/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/genai/requirements.txt b/genai/requirements.txt index f0bf846a..a56d35c1 100644 --- a/genai/requirements.txt +++ b/genai/requirements.txt @@ -1,7 +1,9 @@ annotated-types==0.7.0 +anyio==4.9.0 authlib==1.3.1 certifi==2025.4.26 cffi==1.17.1 +charset-normalizer==3.4.2 click==8.1.8 cryptography==43.0.3 deprecation==2.1.0 @@ -9,6 +11,7 @@ dnspython==2.7.0 email-validator==2.2.0 fastapi==0.115.12 fastapi-cli==0.0.7 +greenlet==3.2.3 grpcio==1.71.0 grpcio-health-checking==1.71.0 grpcio-tools==1.71.0 @@ -19,11 +22,21 @@ httpx==0.28.1 idna==3.10 iniconfig==2.1.0 jinja2==3.1.6 +jsonpatch==1.33 +jsonpointer==3.0.0 +langchain==0.3.26 +langchain-core==0.3.69 +langchain-text-splitters==0.3.8 +langsmith==0.4.8 markdown-it-py==3.0.0 markupsafe==3.0.2 mdurl==0.1.2 +orjson==3.11.0 packaging==23.2 +pip==25.0.1 pluggy==1.5.0 +prometheus-client==0.22.1 +prometheus-fastapi-instrumentator==7.1.0 protobuf==5.29.4 pycparser==2.22 pydantic==2.11.4 @@ -35,21 +48,24 @@ python-dotenv==1.1.0 python-multipart==0.0.20 pyyaml==6.0.2 requests==2.31.0 +requests-toolbelt==1.0.0 rich==14.0.0 rich-toolkit==0.14.5 ruff==0.11.8 setuptools==80.4.0 shellingham==1.5.4 sniffio==1.3.1 +sqlalchemy==2.0.41 starlette==0.46.2 +tenacity==9.1.2 typer==0.15.3 typing-extensions==4.13.2 typing-inspection==0.4.0 +urllib3==2.5.0 uvicorn==0.34.2 uvloop==0.21.0 validators==0.34.0 watchfiles==1.0.5 weaviate-client==4.14.1 websockets==15.0.1 -langchain>=0.1.0 -langchain-core>=0.1.10 +zstandard==0.23.0 diff --git a/infrastructure/whiteboard-app/Chart.lock b/infrastructure/whiteboard-app/Chart.lock index e5dc3a08..00b613af 100644 --- a/infrastructure/whiteboard-app/Chart.lock +++ b/infrastructure/whiteboard-app/Chart.lock @@ -9,4 +9,4 @@ dependencies: repository: https://charts.bitnami.com/bitnami version: 21.2.12 digest: sha256:68d94162b9c62e8d173c984b419f1837de111fe9d75c381a1555bc09860e543b -generated: "2025-07-16T13:59:46.643364+02:00" +generated: "2025-07-19T17:14:37.691057+02:00" diff --git a/infrastructure/whiteboard-observability/.helmignore b/infrastructure/whiteboard-observability/.helmignore new file mode 100644 index 00000000..0e8a0eb3 --- /dev/null +++ b/infrastructure/whiteboard-observability/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/infrastructure/whiteboard-observability/Chart.lock b/infrastructure/whiteboard-observability/Chart.lock new file mode 100644 index 00000000..afdfd431 --- /dev/null +++ b/infrastructure/whiteboard-observability/Chart.lock @@ -0,0 +1,12 @@ +dependencies: +- name: grafana + repository: https://grafana.github.io/helm-charts + version: 9.2.10 +- name: prometheus + repository: https://prometheus-community.github.io/helm-charts + version: 27.28.0 +- name: mailhog + repository: https://codecentric.github.io/helm-charts + version: 5.8.0 +digest: sha256:b1512e01eadc92c03de33e061fdf24f4ba22e910166fbe57fd8805f0ffca1e9d +generated: "2025-07-20T13:30:29.991761+02:00" diff --git a/infrastructure/whiteboard-observability/Chart.yaml b/infrastructure/whiteboard-observability/Chart.yaml new file mode 100644 index 00000000..78717bdf --- /dev/null +++ b/infrastructure/whiteboard-observability/Chart.yaml @@ -0,0 +1,16 @@ +apiVersion: v2 +name: whiteboard-observability +description: A Helm chart for Kubernetes +type: application +version: 0.1.0 +appVersion: "1.16.0" +dependencies: + - name: grafana + version: 9.2.10 + repository: "https://grafana.github.io/helm-charts" + - name: prometheus + version: 27.28.0 + repository: "https://prometheus-community.github.io/helm-charts" + - name: mailhog + version: 5.8.0 + repository: https://codecentric.github.io/helm-charts diff --git a/infrastructure/whiteboard-observability/charts/grafana-9.2.10.tgz b/infrastructure/whiteboard-observability/charts/grafana-9.2.10.tgz new file mode 100644 index 00000000..d9cbb75e Binary files /dev/null and b/infrastructure/whiteboard-observability/charts/grafana-9.2.10.tgz differ diff --git a/infrastructure/whiteboard-observability/charts/mailhog-5.8.0.tgz b/infrastructure/whiteboard-observability/charts/mailhog-5.8.0.tgz new file mode 100644 index 00000000..9cdfd149 Binary files /dev/null and b/infrastructure/whiteboard-observability/charts/mailhog-5.8.0.tgz differ diff --git a/infrastructure/whiteboard-observability/charts/prometheus-27.28.0.tgz b/infrastructure/whiteboard-observability/charts/prometheus-27.28.0.tgz new file mode 100644 index 00000000..bf979b48 Binary files /dev/null and b/infrastructure/whiteboard-observability/charts/prometheus-27.28.0.tgz differ diff --git a/infrastructure/whiteboard-observability/files/grafana/provisioning/dashboards/dashboards.yml b/infrastructure/whiteboard-observability/files/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 00000000..0e8f7220 --- /dev/null +++ b/infrastructure/whiteboard-observability/files/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,23 @@ +apiVersion: 1 + +providers: + - name: "Server System Metrics Dashboard" + type: file + editable: true + updateIntervalSeconds: 10 + options: + path: /etc/grafana/provisioning/dashboards/server-dashboard.json + + - name: "GenAi System Metrics Dashboard" + type: file + editable: true + updateIntervalSeconds: 10 + options: + path: /etc/grafana/provisioning/dashboards/genai-dashboard.json + + - name: "Realtime System Metrics Dashboard" + type: file + editable: true + updateIntervalSeconds: 10 + options: + path: /etc/grafana/provisioning/dashboards/realtime-dashboard.json \ No newline at end of file diff --git a/infrastructure/whiteboard-observability/files/grafana/provisioning/dashboards/genai-dashboard.json b/infrastructure/whiteboard-observability/files/grafana/provisioning/dashboards/genai-dashboard.json new file mode 100644 index 00000000..bc3a1fc6 --- /dev/null +++ b/infrastructure/whiteboard-observability/files/grafana/provisioning/dashboards/genai-dashboard.json @@ -0,0 +1,522 @@ +{ + "annotations":{ + "list":[ + { + "builtIn":1, + "datasource":{ + "type":"grafana", + "uid":"-- Grafana --" + }, + "enable":true, + "hide":true, + "iconColor":"rgba(0, 211, 255, 1)", + "name":"Annotations & Alerts", + "type":"dashboard" + }, + { + "datasource":{ + "type":"prometheus", + "uid":"PBFA97CFB590B2093" + }, + "enable":true, + "hide":false, + "iconColor":"orange", + "name":"Client Error Spike", + "target":{ + "expr":"sum(rate(http_requests_total{status=~\"4xx\"}[5m])) > 0.1", + "interval":"", + "refId":"Anno" + }, + "textFormat":"High rate of client errors detected, exceeding 0.1 requests per second", + "titleFormat":"Client Error Spike" + }, + { + "datasource":{ + "type":"prometheus", + "uid":"PBFA97CFB590B2093" + }, + "enable":true, + "hide":false, + "iconColor":"red", + "name":"Server Error Spike", + "tagKeys":"server, error", + "target":{ + "expr":"sum(rate(http_requests_total{status=~\"5xx\"}[5m])) > 0.1", + "interval":"", + "refId":"Anno" + }, + "textFormat":"Server error rate exceeded threshold (>0.1 req/s)", + "titleFormat":"Server Error Spike" + } + ] + }, + "description":"Dashboard showing system metrics including request count, latency, and error rate.", + "editable":true, + "fiscalYearStartMonth":0, + "graphTooltip":0, + "id":3, + "links":[ + + ], + "panels":[ + { + "datasource":{ + "type":"prometheus", + "uid":"PBFA97CFB590B2093" + }, + "fieldConfig":{ + "defaults":{ + "color":{ + "mode":"palette-classic" + }, + "custom":{ + "axisBorderShow":false, + "axisCenteredZero":false, + "axisColorMode":"text", + "axisLabel":"", + "axisPlacement":"auto", + "barAlignment":0, + "barWidthFactor":0.6, + "drawStyle":"line", + "fillOpacity":20, + "gradientMode":"none", + "hideFrom":{ + "legend":false, + "tooltip":false, + "viz":false + }, + "insertNulls":false, + "lineInterpolation":"smooth", + "lineWidth":2, + "pointSize":5, + "scaleDistribution":{ + "type":"linear" + }, + "showPoints":"auto", + "spanNulls":false, + "stacking":{ + "group":"A", + "mode":"none" + }, + "thresholdsStyle":{ + "mode":"area" + } + }, + "mappings":[ + + ], + "thresholds":{ + "mode":"absolute", + "steps":[ + { + "color":"green" + }, + { + "color":"orange", + "value":70 + }, + { + "color":"red", + "value":80 + } + ] + } + }, + "overrides":[ + + ] + }, + "gridPos":{ + "h":17, + "w":12, + "x":0, + "y":0 + }, + "id":1, + "options":{ + "legend":{ + "calcs":[ + "mean", + "max" + ], + "displayMode":"table", + "placement":"bottom", + "showLegend":true + }, + "tooltip":{ + "hideZeros":false, + "mode":"multi", + "sort":"none" + } + }, + "pluginVersion":"12.0.2", + "targets":[ + { + "datasource":{ + "type":"prometheus", + "uid":"PBFA97CFB590B2093" + }, + "editorMode":"code", + "expr":"sum by (handler, method, status) (increase(http_requests_total{job=\"genai_job\"}[5m]))", + "legendFormat":"{{method}} {{status}} {{uri}}", + "range":true, + "refId":"A" + } + ], + "title":"Request Count", + "type":"timeseries" + }, + { + "datasource":{ + "type":"prometheus", + "uid":"PBFA97CFB590B2093" + }, + "fieldConfig":{ + "defaults":{ + "color":{ + "mode":"palette-classic" + }, + "custom":{ + "axisBorderShow":false, + "axisCenteredZero":false, + "axisColorMode":"text", + "axisLabel":"Average Latency (seconds)", + "axisPlacement":"auto", + "barAlignment":0, + "barWidthFactor":0.6, + "drawStyle":"line", + "fillOpacity":20, + "gradientMode":"none", + "hideFrom":{ + "legend":false, + "tooltip":false, + "viz":false + }, + "insertNulls":false, + "lineInterpolation":"smooth", + "lineWidth":2, + "pointSize":5, + "scaleDistribution":{ + "type":"linear" + }, + "showPoints":"auto", + "spanNulls":false, + "stacking":{ + "group":"A", + "mode":"none" + }, + "thresholdsStyle":{ + "mode":"area" + } + }, + "mappings":[ + + ], + "thresholds":{ + "mode":"absolute", + "steps":[ + { + "color":"green" + }, + { + "color":"red", + "value":80 + } + ] + } + }, + "overrides":[ + + ] + }, + "gridPos":{ + "h":17, + "w":12, + "x":12, + "y":0 + }, + "id":2, + "options":{ + "legend":{ + "calcs":[ + "mean", + "max" + ], + "displayMode":"table", + "placement":"bottom", + "showLegend":true + }, + "tooltip":{ + "hideZeros":false, + "mode":"multi", + "sort":"none" + } + }, + "pluginVersion":"12.0.2", + "targets":[ + { + "datasource":{ + "type":"prometheus", + "uid":"PBFA97CFB590B2093" + }, + "editorMode":"code", + "expr":"sum(rate(http_request_duration_seconds_sum{job=\"genai_job\"}[5m])) by (method, handler) /\nsum(rate(http_request_duration_seconds_count{job=\"genai_job\"}[5m])) by (method, handler)", + "legendFormat":"{{method}} {{handler}}", + "range":true, + "refId":"A" + } + ], + "title":"Latency", + "type":"timeseries" + }, + { + "datasource":{ + "type":"prometheus", + "uid":"PBFA97CFB590B2093" + }, + "fieldConfig":{ + "defaults":{ + "color":{ + "mode":"palette-classic" + }, + "custom":{ + "axisBorderShow":false, + "axisCenteredZero":false, + "axisColorMode":"text", + "axisLabel":"", + "axisPlacement":"auto", + "barAlignment":0, + "barWidthFactor":0.6, + "drawStyle":"line", + "fillOpacity":0, + "gradientMode":"none", + "hideFrom":{ + "legend":false, + "tooltip":false, + "viz":false + }, + "insertNulls":false, + "lineInterpolation":"linear", + "lineWidth":1, + "pointSize":5, + "scaleDistribution":{ + "type":"linear" + }, + "showPoints":"auto", + "spanNulls":false, + "stacking":{ + "group":"A", + "mode":"none" + }, + "thresholdsStyle":{ + "mode":"off" + } + }, + "mappings":[ + + ], + "thresholds":{ + "mode":"absolute", + "steps":[ + { + "color":"green" + }, + { + "color":"red", + "value":80 + } + ] + } + }, + "overrides":[ + + ] + }, + "gridPos":{ + "h":12, + "w":12, + "x":0, + "y":17 + }, + "id":4, + "options":{ + "legend":{ + "calcs":[ + + ], + "displayMode":"list", + "placement":"bottom", + "showLegend":true + }, + "tooltip":{ + "hideZeros":false, + "mode":"single", + "sort":"none" + } + }, + "pluginVersion":"12.0.2", + "targets":[ + { + "editorMode":"code", + "expr":"sum by (handler, method) (increase(http_requests_total{status=~\"4..\", job=\"genai_job\"}[5m]))", + "legendFormat":"{{label_name}}", + "range":true, + "refId":"A" + }, + { + "datasource":{ + "type":"prometheus", + "uid":"PBFA97CFB590B2093" + }, + "editorMode":"code", + "expr":"sum by (handler, method) (increase(http_requests_total{status=~\"4..\", job=\"genai_job\"}[5m]))", + "hide":false, + "instant":false, + "legendFormat":"__auto", + "range":true, + "refId":"B" + } + ], + "title":"Errors", + "type":"timeseries" + }, + { + "datasource":{ + "type":"prometheus", + "uid":"PBFA97CFB590B2093" + }, + "description":"This dashboard displays the average number of input and output tokens generated per request for each operation (completion, summarization, rephrase_text) over time. The values represent the mean input and output token count, calculated every 5 minutes, grouped by operation.", + "fieldConfig":{ + "defaults":{ + "color":{ + "mode":"palette-classic" + }, + "custom":{ + "axisBorderShow":false, + "axisCenteredZero":false, + "axisColorMode":"text", + "axisLabel":"", + "axisPlacement":"auto", + "barAlignment":0, + "barWidthFactor":0.6, + "drawStyle":"line", + "fillOpacity":0, + "gradientMode":"none", + "hideFrom":{ + "legend":false, + "tooltip":false, + "viz":false + }, + "insertNulls":false, + "lineInterpolation":"linear", + "lineWidth":1, + "pointSize":5, + "scaleDistribution":{ + "type":"linear" + }, + "showPoints":"auto", + "spanNulls":false, + "stacking":{ + "group":"A", + "mode":"none" + }, + "thresholdsStyle":{ + "mode":"off" + } + }, + "mappings":[ + + ], + "thresholds":{ + "mode":"absolute", + "steps":[ + { + "color":"green" + }, + { + "color":"red", + "value":80 + } + ] + } + }, + "overrides":[ + + ] + }, + "gridPos":{ + "h":12, + "w":12, + "x":12, + "y":17 + }, + "id":5, + "options":{ + "legend":{ + "calcs":[ + + ], + "displayMode":"list", + "placement":"bottom", + "showLegend":true + }, + "tooltip":{ + "hideZeros":false, + "mode":"single", + "sort":"none" + } + }, + "pluginVersion":"12.0.2", + "targets":[ + { + "editorMode":"code", + "exemplar":false, + "expr":"rate(llm_token_count_sum{type=\"output\"}[5m]) \n/\nrate(llm_token_count_count{type=\"output\"}[5m])", + "format":"time_series", + "instant":false, + "legendFormat":"output - {{operation}}", + "range":true, + "refId":"A" + }, + { + "datasource":{ + "type":"prometheus", + "uid":"PBFA97CFB590B2093" + }, + "editorMode":"code", + "expr":"rate(llm_token_count_sum{type=\"input\"}[5m]) \n/\nrate(llm_token_count_count{type=\"input\"}[5m])", + "hide":false, + "instant":false, + "legendFormat":"input - {{operation}}", + "range":true, + "refId":"B" + } + ], + "title":"LLM API Token Analytics", + "type":"timeseries" + } + ], + "preload":false, + "refresh":"10s", + "schemaVersion":41, + "tags":[ + "monitoring", + "alerts" + ], + "templating":{ + "list":[ + + ] + }, + "time":{ + "from":"now-6h", + "to":"now" + }, + "timepicker":{ + + }, + "timezone":"browser", + "title":"GenAi System Metrics Dashboard", + "uid":"genai-metrics-dashboard1111", + "version":2 +} diff --git a/infrastructure/whiteboard-observability/files/grafana/provisioning/dashboards/realtime-dashboard.json b/infrastructure/whiteboard-observability/files/grafana/provisioning/dashboards/realtime-dashboard.json new file mode 100644 index 00000000..fac0edbf --- /dev/null +++ b/infrastructure/whiteboard-observability/files/grafana/provisioning/dashboards/realtime-dashboard.json @@ -0,0 +1,593 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "enable": true, + "hide": false, + "iconColor": "red", + "name": "WebSocket error", + "tagKeys": "websocket_errors", + "target": { + "expr": "(\n rate(websocket_read_errors[5m]) +\n rate(websocket_write_errors[5m]) +\n rate(websocket_upgrade_errors[5m])\n) > 0.1", + "interval": "", + "refId": "Anno" + }, + "textFormat": "High WebSocket error rate detected - exceeding 0.1 errors per second", + "titleFormat": "WebSocket Error Rate" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 3, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 10, + "x": 0, + "y": 0 + }, + "id": 5, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "editorMode": "code", + "expr": "websocket_connections_active", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Websocket Connections Active", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 15, + "w": 14, + "x": 10, + "y": 0 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "editorMode": "code", + "expr": "rate(websocket_connection_duration_sum[5m]) / rate(websocket_connection_duration_count[5m])", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Connection Duration Summary", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "histogram_quantile(0.95, sum(rate(websocket_connection_duration_bucket[5m])) by (le))" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [] + } + ] + }, + "gridPos": { + "h": 11, + "w": 10, + "x": 0, + "y": 8 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(websocket_sent_messages[1m])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "Messages Sent", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "rate(websocket_received_messages[1m])", + "hide": false, + "instant": false, + "legendFormat": "Messages Received", + "range": true, + "refId": "B" + } + ], + "title": "Messages Sent/Received Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "description": "Distribution of WebSocket connection durations across different time buckets. Each bar represents a cumulative count of connections that lasted less than or equal to the specified duration (in seconds). For example, '60s' shows connections lasting up to 60 seconds, '120s' shows connections up to 120 seconds, and so on up to '+Inf' (unlimited duration).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "noValue" + } + ] + } + ] + }, + "gridPos": { + "h": 17, + "w": 14, + "x": 10, + "y": 15 + }, + "id": 4, + "options": { + "displayMode": "gradient", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "websocket_connection_duration_bucket{le=~\".+\"}", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "{{le}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Websocket Connection Duration", + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": [ + "Time", + "60.0", + "120.0", + "300.0", + "600.0", + "900.0", + "1200.0", + "1800.0", + "+Inf" + ] + } + } + } + ], + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "websocket_read_errors" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 13, + "w": 10, + "x": 0, + "y": 19 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "editorMode": "code", + "expr": "rate(websocket_read_errors[1m])", + "legendFormat": "websocket_read_errors", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "rate(websocket_write_errors[1m])", + "hide": false, + "instant": false, + "legendFormat": "websocket_write_errors", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "rate(websocket_upgrade_errors[1m])", + "hide": false, + "instant": false, + "legendFormat": "websocket_upgrade_errors", + "range": true, + "refId": "C" + } + ], + "title": "Websocket Error Rates", + "type": "timeseries" + } + ], + "preload": false, + "schemaVersion": 41, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Realtime System Metrics", + "uid": "275dd5ce-c9b5-4ed9-a6d0-cbac478716a1", + "version": 4 +} \ No newline at end of file diff --git a/infrastructure/whiteboard-observability/files/grafana/provisioning/dashboards/server-dashboard.json b/infrastructure/whiteboard-observability/files/grafana/provisioning/dashboards/server-dashboard.json new file mode 100644 index 00000000..caa6f15f --- /dev/null +++ b/infrastructure/whiteboard-observability/files/grafana/provisioning/dashboards/server-dashboard.json @@ -0,0 +1,394 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "enable": true, + "hide": false, + "iconColor": "orange", + "name": "Client Error Spike", + "tagKeys": "client-error, http-4xx", + "target": { + "expr": "sum(rate(http_server_requests_seconds_count{status=~\"4..\"}[5m])) > 0.1", + "interval": "", + "refId": "Anno" + }, + "textFormat": "High rate of client errors detected, exceeding 0.1 requests per second", + "titleFormat": "Client Error Spike" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "enable": true, + "hide": false, + "iconColor": "red", + "name": "Server Error Spike", + "tagKeys": "server, error", + "target": { + "expr": "sum(rate(http_server_requests_seconds_count{status=~\"5..\"}[5m])) > 0.1", + "interval": "", + "refId": "Anno" + }, + "textFormat": "Server error rate exceeded threshold (>0.1 req/s)", + "titleFormat": "Server Error Spike" + } + ] + }, + "description": "Dashboard showing system metrics including request count, latency, and error rate.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 1, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 17, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(rate(http_server_requests_seconds_count[5m])) by (method, uri)", + "legendFormat": "{{method}} {{status}} {{uri}}", + "range": true, + "refId": "A" + } + ], + "title": "Request Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Average Latency (seconds)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 17, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": " sum(rate(http_server_requests_seconds_sum[5m])) by (method, uri) /\nsum(rate(http_server_requests_seconds_count[5m])) by (method, uri)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(http_server_requests_seconds_count{outcome=\"CLIENT_ERROR\"}[5m])) by (method, uri, status)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(rate(http_server_requests_seconds_count{status=~\"5..\"}[5m])) by (method, uri, status)", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "B" + } + ], + "title": "Errors", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "10s", + "schemaVersion": 41, + "tags": [ + "monitoring", + "alerts" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Server System Metrics Dashboard", + "uid": "system-metrics-dashboard111112", + "version": 1 +} \ No newline at end of file diff --git a/infrastructure/whiteboard-observability/files/grafana/provisioning/datasources/prometheus.yml b/infrastructure/whiteboard-observability/files/grafana/provisioning/datasources/prometheus.yml new file mode 100644 index 00000000..20ab3d19 --- /dev/null +++ b/infrastructure/whiteboard-observability/files/grafana/provisioning/datasources/prometheus.yml @@ -0,0 +1,8 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: '{{ printf "http://%s-prometheus-server" .Release.Name }}' + isDefault: true \ No newline at end of file diff --git a/infrastructure/whiteboard-observability/files/prometheus/alert.rules.yml b/infrastructure/whiteboard-observability/files/prometheus/alert.rules.yml new file mode 100644 index 00000000..502c3530 --- /dev/null +++ b/infrastructure/whiteboard-observability/files/prometheus/alert.rules.yml @@ -0,0 +1,11 @@ +groups: + - name: service-availability + rules: + - alert: Service Down + expr: up{job=~"server_job|genai_job|realtime_job"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Service {{ $labels.job }} is down" + description: "Service {{ $labels.job }} on {{ $labels.instance }} has been down for more than 1 minute" \ No newline at end of file diff --git a/infrastructure/whiteboard-observability/files/prometheus/prometheus.yml b/infrastructure/whiteboard-observability/files/prometheus/prometheus.yml new file mode 100644 index 00000000..c6f4c4d5 --- /dev/null +++ b/infrastructure/whiteboard-observability/files/prometheus/prometheus.yml @@ -0,0 +1,32 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +alerting: + alertmanagers: + - static_configs: + - targets: + - '{{ printf "%s-alertmanager" .Release.Name }}:9093' + +rule_files: + - "/etc/prometheus/alert.rules.yml" + +scrape_configs: + - job_name: 'server_job' + metrics_path: '/actuator/prometheus' + scheme: https + static_configs: + - targets: + - '{{ .Values.server.url }}' + + - job_name: 'genai_job' + metrics_path: '/metrics' + static_configs: + - targets: + - '{{ .Values.genai.url }}' + + - job_name: 'realtime_job' + metrics_path: '/metrics' + static_configs: + - targets: + - '{{ .Values.realtime.url }}' \ No newline at end of file diff --git a/infrastructure/whiteboard-observability/production.values.yaml b/infrastructure/whiteboard-observability/production.values.yaml new file mode 100644 index 00000000..f3ab5686 --- /dev/null +++ b/infrastructure/whiteboard-observability/production.values.yaml @@ -0,0 +1,92 @@ +grafana: + service: + port: 3000 + rbac: + create: false + namespaced: true + adminUser: admin + adminPassword: admin + extraVolumes: + - name: grafana-datasources-config + configMap: + name: grafana-datasources-configmap + - name: grafana-dashboards-config + configMap: + name: grafana-dashboards-configmap + extraVolumeMounts: + - name: grafana-datasources-config + mountPath: /etc/grafana/provisioning/datasources + - name: grafana-dashboards-config + mountPath: /etc/grafana/provisioning/dashboards + +prometheus: + rbac: + create: false + kube-state-metrics: + enabled: false + prometheus-node-exporter: + enabled: false + server: + extraVolumes: + - name: prometheus-config + configMap: + name: prometheus-configmap + extraVolumeMounts: + - name: prometheus-config + mountPath: /etc/prometheus + defaultFlagsOverride: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/etc/prometheus/console_libraries' + - '--web.console.templates=/etc/prometheus/consoles' + - '--web.enable-lifecycle' + alertmanager: + config: + global: + smtp_smarthost: 'whiteboard-observability-production-mailhog:1025' + smtp_from: 'alertmanager@whiteboard.student.k8s.aet.cit.tum.de' + smtp_require_tls: false + route: + receiver: 'mailhog-alerts' + group_by: [ 'alertname' ] + group_wait: 10s + group_interval: 1m + repeat_interval: 30m + receivers: + - name: 'mailhog-alerts' + email_configs: + - to: 'teamserverdown@whiteboard.student.k8s.aet.cit.tum.de' + from: 'alertmanager@whiteboard.student.k8s.aet.cit.tum.de' + smarthost: 'whiteboard-observability-production-mailhog:1025' + send_resolved: true + +ingress: + enabled: true + className: "nginx" + annotations: + cert-manager.io/cluster-issuer: "letsencrypt-prod" + nginx.ingress.kubernetes.io/rewrite-target: / + nginx.ingress.kubernetes.io/use-forwarded-headers: "true" + nginx.ingress.kubernetes.io/proxy-buffer-size: "8k" + tls: + hosts: + - '{{ .Values.metrics.url }}' + - "mailhog.whiteboard.student.k8s.aet.cit.tum.de" + secretName: '{{ .Values.namespace }}-whiteboard-observability-devops25-tls' + rules: + - host: '{{ .Values.metrics.url }}' + paths: + - path: / + pathType: Prefix + service: + name: '{{ printf "%s-grafana" .Release.Name }}' + port: + number: 3000 + - host: "mailhog.whiteboard.student.k8s.aet.cit.tum.de" + paths: + - path: / + pathType: Prefix + service: + name: '{{ printf "%s-mailhog" .Release.Name }}' + port: + number: 8025 \ No newline at end of file diff --git a/infrastructure/whiteboard-observability/staging.values.yaml b/infrastructure/whiteboard-observability/staging.values.yaml new file mode 100644 index 00000000..e772c1bb --- /dev/null +++ b/infrastructure/whiteboard-observability/staging.values.yaml @@ -0,0 +1,92 @@ +grafana: + service: + port: 3000 + rbac: + create: false + namespaced: true + adminUser: admin + adminPassword: admin + extraVolumes: + - name: grafana-datasources-config + configMap: + name: grafana-datasources-configmap + - name: grafana-dashboards-config + configMap: + name: grafana-dashboards-configmap + extraVolumeMounts: + - name: grafana-datasources-config + mountPath: /etc/grafana/provisioning/datasources + - name: grafana-dashboards-config + mountPath: /etc/grafana/provisioning/dashboards + +prometheus: + rbac: + create: false + kube-state-metrics: + enabled: false + prometheus-node-exporter: + enabled: false + server: + extraVolumes: + - name: prometheus-config + configMap: + name: prometheus-configmap + extraVolumeMounts: + - name: prometheus-config + mountPath: /etc/prometheus + defaultFlagsOverride: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/etc/prometheus/console_libraries' + - '--web.console.templates=/etc/prometheus/consoles' + - '--web.enable-lifecycle' + alertmanager: + config: + global: + smtp_smarthost: 'whiteboard-observability-staging-mailhog:1025' + smtp_from: 'alertmanager@staging.whiteboard.student.k8s.aet.cit.tum.de' + smtp_require_tls: false + route: + receiver: 'mailhog-alerts' + group_by: [ 'alertname' ] + group_wait: 10s + group_interval: 1m + repeat_interval: 30m + receivers: + - name: 'mailhog-alerts' + email_configs: + - to: 'teamserverdown@staging.whiteboard.student.k8s.aet.cit.tum.de' + from: 'alertmanager@staging.whiteboard.student.k8s.aet.cit.tum.de' + smarthost: 'whiteboard-observability-staging-mailhog:1025' + send_resolved: true + +ingress: + enabled: true + className: "nginx" + annotations: + cert-manager.io/cluster-issuer: "letsencrypt-prod" + nginx.ingress.kubernetes.io/rewrite-target: / + nginx.ingress.kubernetes.io/use-forwarded-headers: "true" + nginx.ingress.kubernetes.io/proxy-buffer-size: "8k" + tls: + hosts: + - '{{ .Values.metrics.url }}' + - "staging.mailhog.whiteboard.student.k8s.aet.cit.tum.de" + secretName: '{{ .Values.namespace }}-whiteboard-observability-devops25-tls' + rules: + - host: '{{ .Values.metrics.url }}' + paths: + - path: / + pathType: Prefix + service: + name: '{{ printf "%s-grafana" .Release.Name }}' + port: + number: 3000 + - host: "staging.mailhog.whiteboard.student.k8s.aet.cit.tum.de" + paths: + - path: / + pathType: Prefix + service: + name: '{{ printf "%s-mailhog" .Release.Name }}' + port: + number: 8025 \ No newline at end of file diff --git a/infrastructure/whiteboard-observability/templates/grafana-configmap.yaml b/infrastructure/whiteboard-observability/templates/grafana-configmap.yaml new file mode 100644 index 00000000..3ce9f9b0 --- /dev/null +++ b/infrastructure/whiteboard-observability/templates/grafana-configmap.yaml @@ -0,0 +1,24 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-datasources-configmap +data: + prometheus.yml: |- +{{- $dataSourceConfig := .Files.Get "files/grafana/provisioning/datasources/prometheus.yml" }} +{{ tpl $dataSourceConfig . | indent 4 }} + +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboards-configmap +data: + dashboards.yml: |- +{{- $dashboardsConfig := .Files.Get "files/grafana/provisioning/dashboards/dashboards.yml" }} +{{ tpl $dashboardsConfig . | indent 4 }} + server-dashboard.json: |- +{{ .Files.Get "files/grafana/provisioning/dashboards/server-dashboard.json" | indent 4 }} + genai-dashboard.json: |- +{{ .Files.Get "files/grafana/provisioning/dashboards/genai-dashboard.json" | indent 4 }} + realtime-dashboard.json: |- +{{ .Files.Get "files/grafana/provisioning/dashboards/realtime-dashboard.json" | indent 4 }} \ No newline at end of file diff --git a/infrastructure/whiteboard-observability/templates/ingress.yaml b/infrastructure/whiteboard-observability/templates/ingress.yaml new file mode 100644 index 00000000..f7616cd6 --- /dev/null +++ b/infrastructure/whiteboard-observability/templates/ingress.yaml @@ -0,0 +1,34 @@ +{{- if .Values.ingress.enabled -}} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: "whiteboard-observability-ingress" + {{- $annotations := .Values.ingress.annotations | default dict }} + {{- if $annotations }} + annotations: + {{- toYaml $annotations | nindent 4 }} + {{- end }} +spec: + tls: + - hosts: + {{- range .Values.ingress.tls.hosts }} + - {{ tpl . $ }} + {{- end }} + secretName: {{ tpl .Values.ingress.tls.secretName $ }} + ingressClassName: nginx + rules: + {{- range .Values.ingress.rules }} + - host: {{ tpl .host $ }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + pathType: {{ .pathType }} + backend: + service: + name: {{ tpl .service.name $ | quote }} + port: + number: {{ .service.port.number }} + {{- end}} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/infrastructure/whiteboard-observability/templates/prometheus-configmap.yaml b/infrastructure/whiteboard-observability/templates/prometheus-configmap.yaml new file mode 100644 index 00000000..bd09c3ca --- /dev/null +++ b/infrastructure/whiteboard-observability/templates/prometheus-configmap.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-configmap +data: + prometheus.yml: |- +{{- $config := .Files.Get "files/prometheus/prometheus.yml" }} +{{ tpl $config . | indent 4 }} + alert.rules.yml: |- +{{ .Files.Get "files/prometheus/alert.rules.yml" | indent 4 }} \ No newline at end of file diff --git a/realtime/go.mod b/realtime/go.mod index 2beec722..dc18154a 100644 --- a/realtime/go.mod +++ b/realtime/go.mod @@ -17,6 +17,7 @@ require ( require ( github.com/KyleBanks/depth v1.2.1 // indirect + github.com/beorn7/perks v1.0.1 // indirect github.com/bytedance/sonic v1.13.3 // indirect github.com/bytedance/sonic/loader v0.2.4 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect @@ -42,8 +43,13 @@ require ( github.com/mattn/go-isatty v0.0.20 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/prometheus/client_golang v1.22.0 // indirect + github.com/prometheus/client_model v0.6.1 // indirect + github.com/prometheus/common v0.62.0 // indirect + github.com/prometheus/procfs v0.15.1 // indirect github.com/rogpeppe/go-internal v1.14.1 // indirect github.com/sagikazarmark/locafero v0.7.0 // indirect github.com/sourcegraph/conc v0.3.0 // indirect diff --git a/realtime/go.sum b/realtime/go.sum index 4c1b29a8..5c224bc6 100644 --- a/realtime/go.sum +++ b/realtime/go.sum @@ -1,5 +1,7 @@ github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc= github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= @@ -83,10 +85,20 @@ github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= +github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= +github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= +github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= +github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io= +github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I= +github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= +github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/redis/go-redis/v9 v9.11.0 h1:E3S08Gl/nJNn5vkxd2i78wZxWAPNZgUNTp8WIJUAiIs= github.com/redis/go-redis/v9 v9.11.0/go.mod h1:huWgSWd8mW6+m0VPhJjSSQ+d6Nh1VICQ6Q5lHuCH/Iw= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= diff --git a/realtime/pkg/api/handler/whiteboard.go b/realtime/pkg/api/handler/whiteboard.go index f72240ce..ac5fafb1 100644 --- a/realtime/pkg/api/handler/whiteboard.go +++ b/realtime/pkg/api/handler/whiteboard.go @@ -2,20 +2,24 @@ package handler import ( "context" + "github.com/AET-DevOps25/team-server-down/pkg/api/metrics" "github.com/AET-DevOps25/team-server-down/pkg/mq" "github.com/gin-gonic/gin" "github.com/gorilla/websocket" "log" "net/http" + "time" ) type WhiteboardHandler struct { - mq *mq.RedisMQ + mq *mq.RedisMQ + metrics *metrics.Metrics } -func NewWhiteboardHandler(redisMQ *mq.RedisMQ) *WhiteboardHandler { +func NewWhiteboardHandler(redisMQ *mq.RedisMQ, metrics *metrics.Metrics) *WhiteboardHandler { return &WhiteboardHandler{ redisMQ, + metrics, } } @@ -33,10 +37,20 @@ func (wh *WhiteboardHandler) GetWhiteboardEvents(c *gin.Context) { conn, err := upgrader.Upgrade(c.Writer, c.Request, nil) if err != nil { log.Printf("WebSocket upgrade failed: %v", err) + wh.metrics.WebsocketUpgradeErrors.Inc() return } defer conn.Close() + wh.metrics.WebsocketConnectionsActive.Inc() + defer wh.metrics.WebsocketConnectionsActive.Dec() + + start := time.Now() + defer func() { + duration := time.Since(start).Seconds() + wh.metrics.WebsocketConnectionDuration.Observe(duration) + }() + ctx, cancel := context.WithCancel(c.Request.Context()) defer cancel() @@ -79,9 +93,11 @@ func (wh *WhiteboardHandler) GetWhiteboardEvents(c *gin.Context) { return } if err := conn.WriteMessage(websocket.TextMessage, msg); err != nil { + wh.metrics.WebsocketWriteErrors.Inc() cancel() return } + wh.metrics.WebsocketSentMessages.Inc() } } } @@ -91,15 +107,29 @@ func (wh *WhiteboardHandler) PublishWhiteboardEvents(c *gin.Context) { conn, err := upgrader.Upgrade(c.Writer, c.Request, nil) if err != nil { + log.Printf("WebSocket upgrade failed: %v", err) + wh.metrics.WebsocketUpgradeErrors.Inc() return } defer conn.Close() + wh.metrics.WebsocketConnectionsActive.Inc() + defer wh.metrics.WebsocketConnectionsActive.Dec() + + start := time.Now() + defer func() { + duration := time.Since(start).Seconds() + wh.metrics.WebsocketConnectionDuration.Observe(duration) + }() + for { _, message, err := conn.ReadMessage() if err != nil { + wh.metrics.WebsocketReadErrors.Inc() break } + wh.metrics.WebsocketReceivedMessages.Inc() + err = wh.mq.Publish(whiteboardId, string(message)) if err != nil { log.Printf("Failed to publish message: %v", err) diff --git a/realtime/pkg/api/metrics/metrics.go b/realtime/pkg/api/metrics/metrics.go new file mode 100644 index 00000000..1b6c197d --- /dev/null +++ b/realtime/pkg/api/metrics/metrics.go @@ -0,0 +1,66 @@ +package metrics + +import "github.com/prometheus/client_golang/prometheus" + +type Metrics struct { + WebsocketConnectionsActive prometheus.Gauge + WebsocketConnectionDuration prometheus.Histogram + WebsocketUpgradeErrors prometheus.Counter + WebsocketReadErrors prometheus.Counter + WebsocketWriteErrors prometheus.Counter + WebsocketSentMessages prometheus.Counter + WebsocketReceivedMessages prometheus.Counter +} + +func NewMetrics(reg *prometheus.Registry) *Metrics { + m := &Metrics{ + WebsocketConnectionsActive: prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "websocket_connections_active", + Help: "Number of active websocket connections", + }), + WebsocketConnectionDuration: prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "websocket_connection_duration", + Help: "Duration of websocket connections", + Buckets: []float64{ + 60, // 1 min + 120, // 2 min + 300, // 5 min + 600, // 10 min + 900, // 15 min + 1200, // 20 min + 1800, // 30 min + }, + }), + WebsocketUpgradeErrors: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "websocket_upgrade_errors", + Help: "Number of websocket upgrade errors", + }), + WebsocketReadErrors: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "websocket_read_errors", + Help: "Number of websocket read errors", + }), + WebsocketWriteErrors: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "websocket_write_errors", + Help: "Number of websocket write errors", + }), + WebsocketSentMessages: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "websocket_sent_messages", + Help: "Number of sent websocket messages", + }), + WebsocketReceivedMessages: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "websocket_received_messages", + Help: "Number of received websocket messages", + }), + } + + reg.MustRegister( + m.WebsocketConnectionsActive, + m.WebsocketConnectionDuration, + m.WebsocketUpgradeErrors, + m.WebsocketReadErrors, + m.WebsocketWriteErrors, + m.WebsocketSentMessages, + m.WebsocketReceivedMessages, + ) + return m +} diff --git a/realtime/pkg/api/metrics/provider.go b/realtime/pkg/api/metrics/provider.go new file mode 100644 index 00000000..a9839105 --- /dev/null +++ b/realtime/pkg/api/metrics/provider.go @@ -0,0 +1,11 @@ +package metrics + +import "github.com/prometheus/client_golang/prometheus" + +func ProvideRegistry() *prometheus.Registry { + return prometheus.NewRegistry() +} + +func ProvideMetrics(reg *prometheus.Registry) *Metrics { + return NewMetrics(reg) +} diff --git a/realtime/pkg/api/server.go b/realtime/pkg/api/server.go index 6b0965e1..a78ad579 100644 --- a/realtime/pkg/api/server.go +++ b/realtime/pkg/api/server.go @@ -4,6 +4,8 @@ import ( _ "github.com/AET-DevOps25/team-server-down/cmd/api/docs" "github.com/AET-DevOps25/team-server-down/pkg/api/handler" "github.com/gin-gonic/gin" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" swaggerFiles "github.com/swaggo/files" ginSwagger "github.com/swaggo/gin-swagger" ) @@ -12,14 +14,20 @@ type Server struct { engine *gin.Engine } -func NewServer(rootHandler *handler.RootHandler, whiteboardHandler *handler.WhiteboardHandler) *Server { +func NewServer( + rootHandler *handler.RootHandler, + whiteboardHandler *handler.WhiteboardHandler, + reg *prometheus.Registry, +) *Server { engine := gin.New() - engine.Use(gin.Logger()) + engine.Use(gin.Logger(), gin.Recovery()) engine.GET("/", rootHandler.GetRoot) engine.GET("/swagger/*any", ginSwagger.WrapHandler(swaggerFiles.Handler)) + engine.GET("/metrics", gin.WrapH(promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))) + engine.GET("/ws/whiteboard/:whiteboardId/subscribe", whiteboardHandler.GetWhiteboardEvents) engine.GET("/ws/whiteboard/:whiteboardId/publish", whiteboardHandler.PublishWhiteboardEvents) return &Server{engine: engine} diff --git a/realtime/pkg/di/wire.go b/realtime/pkg/di/wire.go index cf875224..684ba831 100644 --- a/realtime/pkg/di/wire.go +++ b/realtime/pkg/di/wire.go @@ -6,6 +6,7 @@ package di import ( http "github.com/AET-DevOps25/team-server-down/pkg/api" "github.com/AET-DevOps25/team-server-down/pkg/api/handler" + "github.com/AET-DevOps25/team-server-down/pkg/api/metrics" "github.com/AET-DevOps25/team-server-down/pkg/config" "github.com/AET-DevOps25/team-server-down/pkg/mq" "github.com/google/wire" @@ -17,6 +18,8 @@ func InitializeAPI(cfg config.Config) (*http.Server, error) { handler.NewRootHandler, handler.NewWhiteboardHandler, mq.NewRedisMQ, + metrics.ProvideRegistry, + metrics.ProvideMetrics, ) return &http.Server{}, nil diff --git a/realtime/pkg/di/wire_gen.go b/realtime/pkg/di/wire_gen.go index 96a6092b..736234d1 100644 --- a/realtime/pkg/di/wire_gen.go +++ b/realtime/pkg/di/wire_gen.go @@ -9,6 +9,7 @@ package di import ( "github.com/AET-DevOps25/team-server-down/pkg/api" "github.com/AET-DevOps25/team-server-down/pkg/api/handler" + "github.com/AET-DevOps25/team-server-down/pkg/api/metrics" "github.com/AET-DevOps25/team-server-down/pkg/config" "github.com/AET-DevOps25/team-server-down/pkg/mq" ) @@ -18,7 +19,9 @@ import ( func InitializeAPI(cfg config.Config) (*http.Server, error) { rootHandler := handler.NewRootHandler() redisMQ := mq.NewRedisMQ(cfg) - whiteboardHandler := handler.NewWhiteboardHandler(redisMQ) - server := http.NewServer(rootHandler, whiteboardHandler) + registry := metrics.ProvideRegistry() + metricsMetrics := metrics.ProvideMetrics(registry) + whiteboardHandler := handler.NewWhiteboardHandler(redisMQ, metricsMetrics) + server := http.NewServer(rootHandler, whiteboardHandler, registry) return server, nil } diff --git a/server/build.gradle b/server/build.gradle index ca2cb201..0209c94f 100644 --- a/server/build.gradle +++ b/server/build.gradle @@ -32,6 +32,9 @@ dependencies { implementation 'org.springframework.boot:spring-boot-starter-security' implementation 'org.hibernate.validator:hibernate-validator' implementation 'com.auth0:java-jwt:4.5.0' + implementation 'org.springframework.boot:spring-boot-starter-actuator' + implementation 'io.micrometer:micrometer-core' + implementation 'io.micrometer:micrometer-registry-prometheus' testImplementation "org.mockito:mockito-core" testImplementation "org.mockito:mockito-junit-jupiter" testImplementation 'org.springframework.boot:spring-boot-starter-test' diff --git a/server/src/main/java/de/tum/cit/aet/devops/teamserverdown/security/JWTAuthenticationFilter.java b/server/src/main/java/de/tum/cit/aet/devops/teamserverdown/security/JWTAuthenticationFilter.java index 51e12d9b..452d6cb8 100644 --- a/server/src/main/java/de/tum/cit/aet/devops/teamserverdown/security/JWTAuthenticationFilter.java +++ b/server/src/main/java/de/tum/cit/aet/devops/teamserverdown/security/JWTAuthenticationFilter.java @@ -51,6 +51,8 @@ protected void doFilterInternal( @Override protected boolean shouldNotFilter(HttpServletRequest request) { String path = request.getServletPath(); - return path.startsWith("/v3/api-docs") || path.startsWith("/swagger-ui"); + return path.startsWith("/v3/api-docs") + || path.startsWith("/swagger-ui") + || path.startsWith("/actuator"); } } diff --git a/server/src/main/resources/application.yaml b/server/src/main/resources/application.yaml index 868c171a..b0e5bd1d 100644 --- a/server/src/main/resources/application.yaml +++ b/server/src/main/resources/application.yaml @@ -24,4 +24,13 @@ spring: dialect: org.hibernate.dialect.PostgreSQLDialect flyway: enabled: true - validate-on-migrate: true \ No newline at end of file + validate-on-migrate: true + +management: + endpoints: + web: + exposure: + include: [ "prometheus" ] + endpoint: + prometheus: + access: unrestricted \ No newline at end of file