Skip to content

Commit 132feb9

Browse files
authored
Reduce usage service readiness threshold (#6759)
1 parent aafb8ae commit 132feb9

File tree

6 files changed

+59
-8
lines changed

6 files changed

+59
-8
lines changed

.changeset/gorgeous-news-report.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
'hive': patch
3+
---
4+
5+
Reduce usage service readiness threshold; disable Nagle's algorithm and increase keepAlive from 60s to 180s for KafkaJS

deployment/services/usage.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ export function deployUsage({
5454
readinessProbe: {
5555
initialDelaySeconds: 10,
5656
periodSeconds: 5,
57-
failureThreshold: 2,
57+
failureThreshold: 1,
5858
timeoutSeconds: 5,
5959
endpoint: '/_readiness',
6060
},

packages/services/usage/src/buffer.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,7 @@ export function createKVBuffer<T>(config: {
322322
logger.info('Stopping buffer');
323323
if (timeoutId) {
324324
clearTimeout(timeoutId);
325+
timeoutId = null;
325326
}
326327
await send({
327328
scheduleNextSend: false,

packages/services/usage/src/fallback-queue.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ export function createFallbackQueue(config: {
5757
stop() {
5858
if (timeoutId !== null) {
5959
clearTimeout(timeoutId);
60+
timeoutId = null;
6061
}
6162

6263
const limit = pLimit(10);

packages/services/usage/src/index.ts

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -135,11 +135,12 @@ async function main() {
135135
const shutdown = registerShutdown({
136136
logger: server.log,
137137
async onShutdown() {
138-
server.log.info('Stopping tracing handler...');
139-
await tracing?.shutdown();
140-
141138
server.log.info('Stopping service handler...');
142139
await Promise.all([usage.stop(), server.close()]);
140+
141+
// shut down tracing last so that traces are sent till the very end
142+
server.log.info('Stopping tracing handler...');
143+
await tracing?.shutdown();
143144
},
144145
});
145146

packages/services/usage/src/usage.ts

Lines changed: 47 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,13 @@
1-
import { CompressionTypes, Kafka, logLevel, Partitioners, RetryOptions } from 'kafkajs';
1+
import net from 'net';
2+
import tls from 'tls';
3+
import {
4+
CompressionTypes,
5+
ISocketFactory,
6+
Kafka,
7+
logLevel,
8+
Partitioners,
9+
RetryOptions,
10+
} from 'kafkajs';
211
import { traceInlineSync, type ServiceLogger } from '@hive/service-common';
312
import type { RawOperationMap, RawReport } from '@hive/usage-common';
413
import { compress } from '@hive/usage-common';
@@ -32,7 +41,7 @@ const levelMap = {
3241

3342
const retryOptions = {
3443
maxRetryTime: 30_000,
35-
initialRetryTime: 500,
44+
initialRetryTime: 300,
3645
factor: 0.2,
3746
multiplier: 2,
3847
retries: 5,
@@ -103,6 +112,25 @@ export function createUsage(config: {
103112
}) {
104113
const { logger } = config;
105114

115+
// Default KafkaJS socketFactory implementation with minor optimizations for Azure
116+
// https://github.com/tulios/kafkajs/blob/master/src/network/socketFactory.js
117+
const socketFactory: ISocketFactory = ({ host, port, ssl, onConnect }) => {
118+
const socket = ssl
119+
? tls.connect(
120+
Object.assign({ host, port }, !net.isIP(host) ? { servername: host } : {}, ssl),
121+
onConnect,
122+
)
123+
: net.connect({ host, port }, onConnect);
124+
125+
// This is equivalent to kafka's "connections.max.idle.ms"
126+
socket.setKeepAlive(true, 180_000);
127+
// disable Nagle's algorithm to have higher throughput since this logic
128+
// is already buffering messages into large payloads
129+
socket.setNoDelay(true);
130+
131+
return socket;
132+
};
133+
106134
const kafka = new Kafka({
107135
clientId: 'usage',
108136
brokers: [config.kafka.connection.broker],
@@ -140,10 +168,11 @@ export function createUsage(config: {
140168
};
141169
},
142170
// settings recommended by Azure EventHub https://docs.microsoft.com/en-us/azure/event-hubs/apache-kafka-configurations
143-
requestTimeout: 60_000, //
171+
requestTimeout: 30_000,
144172
connectionTimeout: 5_000,
145173
authenticationTimeout: 5_000,
146174
retry: retryOptions,
175+
socketFactory,
147176
});
148177

149178
const producer = kafka.producer({
@@ -268,10 +297,25 @@ export function createUsage(config: {
268297
status = newStatus;
269298
}
270299

300+
producer.on(producer.events.CONNECT, () => {
301+
logger.info(`Kafka producer: connected`);
302+
303+
if (status === Status.Unhealthy) {
304+
changeStatus(Status.Ready);
305+
}
306+
});
307+
271308
producer.on(producer.events.REQUEST_TIMEOUT, () => {
272309
logger.info('Kafka producer: request timeout');
273310
});
274311

312+
producer.on(producer.events.DISCONNECT, () => {
313+
logger.info(`Kafka producer: disconnected`);
314+
if (status === Status.Ready) {
315+
changeStatus(Status.Unhealthy);
316+
}
317+
});
318+
275319
async function stop() {
276320
logger.info('Started Usage shutdown...');
277321

@@ -281,7 +325,6 @@ export function createUsage(config: {
281325
await fallback.stop();
282326
logger.info(`Fallback stopped`);
283327
await producer.disconnect();
284-
logger.info(`Producer disconnected`);
285328

286329
logger.info('Usage stopped');
287330
}

0 commit comments

Comments
 (0)