Skip to content

Commit 0c35185

Browse files
authored
Introduced Prometheus metrics endpoint for scraping (#1466)
* Initial Python support * .NET support * Introduced Prometheus metrics endpoint for scraping * Streamline port numbers * Remove test application files and add new .NET and Python test applications with Helm charts - Deleted the TypeScript test application including package.json, server.ts, and tsconfig.json. - Added a new .NET test application with Dockerfile, project file, and source code. - Created a Helm chart for the .NET test application including deployment and service definitions. - Added a new Python test application with Dockerfile, source code, and Helm chart. - Each application includes a simple HTTP server that can call a target URL and handle errors. * Cleanup * Minor refactoring * Cleanup * Enable private preview languages in validation apps * Refactor validate-mutation script to use dynamic environment variable and init container arrays * Add support for Python and .NET targets in the test app configuration * Enhance error handling in call-target route with logging * Refactor error handling in call-target route to throw exceptions instead of writing error messages * Add build and push instructions to Dockerfiles for various test apps * Remove redundant status code setting for unexpected errors in call-target route * Ensure imagePullPolicy is set to Always for nodejs-source-app container * Add global exception handler for unhandled exceptions in the .NET application * Refactor error handling in /call-target endpoint to log exceptions and return error responses * Remove deprecated Kubernetes resource definitions and related configurations for app monitoring * Remove obsolete configuration files and change log for app monitoring * Move .NET test app to MVC * Refactor /call-target endpoint to log exceptions and improve error handling * Enhance error handling in /call-target endpoint with OpenTelemetry integration * Refactor telemetry validation in validate_ai.sh to conditionally skip exceptions for Dotnet app and simplify error handling in 
dotnet-test-app.cs * Improve health checks for app-monitoring-webhook deployment by validating pod readiness and container status * Enhance deployment health check for app-monitoring-webhook by tracking rollout duration and verifying running pods * Add validation output for MutatingWebhookConfiguration in test-cronjob.sh * Enhance test-cronjob.sh by adding validation outputs for secret store and MutatingWebhookConfiguration states after modifications * Enhance deployment health checks for app-monitoring-webhook by validating pod and container states specific to the latest ReplicaSet * Fix secret store reference in test-cronjob.sh to use correct secret name * Refactor deployment health check for app-monitoring-webhook to improve rollout status validation and error handling * Refactor message in test-cronjob.sh for clarity on secret store repair process * Fix caBundle value in MutatingWebhookConfiguration for app-monitoring-webhook * Fix casing of caBundle in test-cronjob.sh output messages for consistency * Update Helm chart from aks-rp repo * Update README.md to include R2D process completion step and clarify rollout monitoring instructions * Update README.md to clarify the R2D process and enhance instructions for monitoring AKS RP rollouts * Update DotNet SDK image tag to 1.0.0-beta5 * Better coverage for Python and DotNet in tests * Improve error logging in Mutator by including exception message in heartbeat metric * Refactor server to use http for Prom endpoint server * Expose CR count and instrumented namespace count metrics to prom scraper * Update Python agent image tag to 1.0.0b25-aks * Revert "Update Python agent image tag to 1.0.0b25-aks" This reverts commit 4793286.
1 parent fdcb2c7 commit 0c35185

File tree

6 files changed

+105
-11
lines changed

6 files changed

+105
-11
lines changed

appmonitoring/ts/src/Dockerfile

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@ ENV IMAGE_TAG=$IMAGE_TAG
99
ARG TARGETARCH
1010
ENV ARCH=$TARGETARCH
1111

12+
ARG WEBHOOK_PORT=1337
13+
ENV WEBHOOK_PORT=$WEBHOOK_PORT
14+
15+
ARG PROM_PORT=4000
16+
ENV PROM_PORT=$PROM_PORT
17+
1218
RUN tdnf upgrade -y && tdnf clean all
1319

1420
# Smoke test ensuring that NodeJS was installed.
@@ -20,5 +26,6 @@ COPY ./out/* /mutating-webhook
2026
COPY ./package.json /mutating-webhook
2127
COPY ./node_modules /mutating-webhook/node_modules
2228

23-
EXPOSE 1337
29+
EXPOSE $WEBHOOK_PORT
30+
EXPOSE $PROM_PORT
2431
CMD [ "node", "server.js" ]

appmonitoring/ts/src/K8sWatcher.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,11 @@ export class K8sWatcher {
1818
const k8sApi = kc.makeApiClient(k8s.CustomObjectsApi);
1919
const watch = new k8s.Watch(kc);
2020

21-
logger.registerWatchdog(Watchdogs.SecondsSinceLastSuccessfulCRList, () => (new Date().getTime() - K8sWatcher.lastSuccessfulListTimestamp.getTime()) / 1000.0);
21+
logger.registerWatchdog(Watchdogs.SecondsSinceLastSuccessfulCRList, () => {
22+
const secondsElapsed = (new Date().getTime() - K8sWatcher.lastSuccessfulListTimestamp.getTime()) / 1000.0;
23+
logger.SecondsSinceLastSuccessfulCRListSummary.observe(secondsElapsed);
24+
return secondsElapsed;
25+
});
2226

2327
let latestResourceVersion: string = null;
2428
while (true) { // eslint-disable-line

appmonitoring/ts/src/LoggerWrapper.ts

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ import { PodInfo } from "./RequestDefinition.js";
66
import log4js from "log4js";
77
import { InstrumentationCRsCollection } from "./InstrumentationCRsCollection.js";
88

9+
import { Registry, collectDefaultMetrics, Summary } from "prom-client";
10+
911
const { configure, getLogger } = log4js;
1012

1113
configure({
@@ -128,6 +130,11 @@ class LocalLogger {
128130
}
129131
}
130132

133+
public Register: Registry;
134+
public SecondsSinceLastSuccessfulCRListSummary: Summary;
135+
public CRCountSummary: Summary;
136+
public InstrumentedNamespaceCountSummary: Summary;
137+
131138
private static instance: LocalLogger = null;
132139

133140
private isUnitTestMode = false;
@@ -146,6 +153,28 @@ class LocalLogger {
146153
this.clusterMetadata = new ClusterMetadata(clusterArmId, clusterArmRegion, podName, imageTag, arch);
147154

148155
this.log.info(`Application Insights has been set up and started. Default telemetry client is: ${this.client}, cluster metadata: ${JSON.stringify(this.clusterMetadata)}`);
156+
157+
this.Register = new Registry();
158+
159+
collectDefaultMetrics({ register: this.Register });
160+
161+
this.SecondsSinceLastSuccessfulCRListSummary = new Summary({
162+
name: 'cr_list_seconds_since_last_successful',
163+
help: 'Seconds elapsed since the last successful CR list call'
164+
});
165+
this.CRCountSummary = new Summary({
166+
name: 'cr_count',
167+
help: 'Number of CRs in the cluster'
168+
});
169+
this.InstrumentedNamespaceCountSummary = new Summary({
170+
name: 'instrumented_namespace_count',
171+
help: 'Number of namespaces in the cluster that have at least one CR'
172+
});
173+
this.Register.registerMetric(this.SecondsSinceLastSuccessfulCRListSummary);
174+
this.Register.registerMetric(this.CRCountSummary);
175+
this.Register.registerMetric(this.InstrumentedNamespaceCountSummary);
176+
177+
this.log.info(`Prometheus client has been set up.`);
149178
}
150179

151180
public trace(message: string, operationId: string, requestMetadata: RequestMetadata) {

appmonitoring/ts/src/package-lock.json

Lines changed: 27 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

appmonitoring/ts/src/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020
"@types/npm": "^7.19.0",
2121
"applicationinsights": "^3.7.0",
2222
"log4js": "^6.9.1",
23-
"node-forge": "^1.3.1"
23+
"node-forge": "^1.3.1",
24+
"prom-client": "^15.1.3"
2425
},
2526
"engines": {
2627
"node": "^20.4.0"

appmonitoring/ts/src/server.ts

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
import * as https from "https";
1+
import * as http from "http";
2+
import * as https from "https";
23
import { Mutator } from "./Mutator.js";
34
import { Events, HeartbeatMetrics, HeartbeatLogs, logger, RequestMetadata } from "./LoggerWrapper.js";
45
import { InstrumentationCR, IAdmissionReview } from "./RequestDefinition.js";
@@ -91,8 +92,25 @@ try {
9192
throw e;
9293
}
9394

94-
const port = process.env.port || 1337;
95-
logger.info(`listening on port ${port}`, operationId, null);
95+
// Port for the Prometheus scrape endpoint. Falls back to 4000 (the Dockerfile's PROM_PORT default)
// so that an unset or empty variable does not make listen() bind an arbitrary port.
const promPort = process.env.PROM_PORT || 4000;
logger.info(`Prom endpoint is available on port ${promPort}`, operationId, null);

/**
 * Plain-HTTP server exposing the prom-client registry for scraping.
 * Serves GET /metrics (with or without a trailing slash); every other request gets a 404.
 */
const promServer = http.createServer(async (req, res) => {
    const isMetricsRequest = req.method === "GET" && (req.url === "/metrics" || req.url === "/metrics/");
    if (!isMetricsRequest || !logger?.Register) {
        res.writeHead(404);
        res.end('Not Found');
        return;
    }

    try {
        // Render the payload BEFORE writing the status line: once the 200 header has been written,
        // a rendering failure can no longer be reported as a 500.
        const metrics = await logger.Register.metrics();
        res.writeHead(200, { "Content-Type": logger.Register.contentType });
        res.end(metrics);
    } catch (e) {
        // A second writeHead() after headers were written would throw inside this async handler.
        if (!res.headersSent) {
            res.writeHead(500, { "Content-Type": "application/json" });
        }
        // JSON.stringify() on an Error yields "{}", so serialize the message explicitly.
        res.end(JSON.stringify({ error: e instanceof Error ? e.message : String(e) }));
    }
}).listen(promPort);
111+
112+
// Port for the HTTPS admission webhook. Falls back to 1337 (the Dockerfile's WEBHOOK_PORT default)
// so that an unset or empty variable does not make listen() bind an arbitrary port.
const port = process.env.WEBHOOK_PORT || 1337;
logger.info(`Webhook is listening on port ${port}`, operationId, null);
96114

97115
const server = https.createServer(options, (req, res) => {
98116
logger.info(`Received request with url: ${req.url}, method: ${req.method}, content-type: ${req.headers["content-type"]}`, operationId, null);
@@ -150,12 +168,19 @@ const server = https.createServer(options, (req, res) => {
150168
res.writeHead(404);
151169
res.end();
152170
}
153-
154171
}).listen(port);
155172

156173
logger.info(`Server created on port ${port}`, null, null);
157174

158-
function shutdownServer() {
175+
function shutdownServers() {
176+
promServer.close((err) => {
177+
if (err) {
178+
logger.error(`Error shutting down prom server: ${err}`, operationId, null);
179+
} else {
180+
logger.info("Prom server has shut down gracefully", operationId, null);
181+
}
182+
});
183+
159184
server.close((err) => {
160185
if (err) {
161186
logger.error(`Error shutting down server: ${err}`, operationId, null);
@@ -168,8 +193,8 @@ function shutdownServer() {
168193
}
169194

170195
// listen for process termination signals
171-
process.on("SIGINT", shutdownServer);
172-
process.on("SIGTERM", shutdownServer);
196+
process.on("SIGINT", shutdownServers);
197+
process.on("SIGTERM", shutdownServers);
173198

174199
const keepAlive = new Promise<void>((resolve) => {
175200
process.on('SIGINT', resolve);
@@ -184,9 +209,11 @@ logger.info("Server shut down, exiting now", operationId, null);
184209
function logCRs(crs: InstrumentationCRsCollection) {
185210
const items: InstrumentationCR[] = crs.ListCRs();
186211
logger.setHeartbeatMetric(HeartbeatMetrics.CRCount, items.length);
212+
logger.CRCountSummary.observe(items.length);
187213

188214
const uniqueNamespaces = new Set<string>(items.map(cr => cr.metadata.namespace, this));
189215
logger.setHeartbeatMetric(HeartbeatMetrics.InstrumentedNamespaceCount, uniqueNamespaces.size);
216+
logger.InstrumentedNamespaceCountSummary.observe(uniqueNamespaces.size);
190217

191218
let log = "CRs: [";
192219
for (let i = 0; i < items.length; i++) {

0 commit comments

Comments
 (0)