7 changes: 6 additions & 1 deletion deployment/index.ts
@@ -20,6 +20,7 @@ import { deployPostgres } from './services/postgres';
import { deployProxy } from './services/proxy';
import { deployPublicGraphQLAPIGateway } from './services/public-graphql-api-gateway';
import { deployRedis } from './services/redis';
import { deployRedpanda } from './services/redpanda';
import { deployS3, deployS3AuditLog, deployS3Mirror } from './services/s3';
import { deploySchema } from './services/schema';
import { configureSentry } from './services/sentry';
@@ -78,6 +79,7 @@ const clickhouse = deployClickhouse();
const postgres = deployPostgres();
const redis = deployRedis({ environment });
const kafka = deployKafka();
const redpanda = deployRedpanda();
const s3 = deployS3();
const s3Mirror = deployS3Mirror();
const s3AuditLog = deployS3AuditLog();
@@ -284,6 +286,7 @@ const otelCollector = deployOTELCollector({
graphql,
dbMigrations,
clickhouse,
redpanda,
image: docker.factory.getImageId('otel-collector', imagesTag),
docker,
});
@@ -344,5 +347,7 @@ export const schemaApiServiceId = schema.service.id;
export const webhooksApiServiceId = webhooks.service.id;

export const appId = app.deployment.id;
export const otelCollectorId = otelCollector.deployment.id;
export const otelCollectorIngressId = otelCollector.ingress.deployment.id;
export const otelCollectorEgressId = otelCollector.egress.deployment.id;
export const redpandaStatefulSetId = redpanda.statefulSet.id;
export const publicIp = proxy.get()!.status.loadBalancer.ingress[0].ip;
51 changes: 47 additions & 4 deletions deployment/services/otel-collector.ts
@@ -5,6 +5,7 @@ import { DbMigrations } from './db-migrations';
import { Docker } from './docker';
import { Environment } from './environment';
import { GraphQL } from './graphql';
import { Redpanda } from './redpanda';

export type OTELCollector = ReturnType<typeof deployOTELCollector>;

@@ -15,9 +16,13 @@ export function deployOTELCollector(args: {
clickhouse: Clickhouse;
dbMigrations: DbMigrations;
graphql: GraphQL;
redpanda: Redpanda;
}) {
return new ServiceDeployment(
'otel-collector',
const kafkaBroker = args.redpanda.brokerEndpoint;

// Ingress: OTLP -> Redpanda
const ingress = new ServiceDeployment(
'otel-collector-ingress',
{
image: args.image,
imagePullSecret: args.docker.secret,
@@ -26,6 +31,7 @@ export function deployOTELCollector(args: {
HIVE_OTEL_AUTH_ENDPOINT: serviceLocalEndpoint(args.graphql.service).apply(
value => value + '/otel-auth',
),
KAFKA_BROKER: kafkaBroker,
},
/**
* We are using the healthcheck extension.
@@ -40,11 +46,40 @@ export function deployOTELCollector(args: {
pdb: true,
availabilityOnEveryNode: true,
port: 4318,
memoryLimit: args.environment.podsConfig.tracingCollector.memoryLimit,
memoryLimit: '512Mi',
Reviewer comment (Collaborator): should we keep this configurable by env?
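One way to keep it env-configurable with a fallback, sketched below (this assumes the tracingCollector pods-config field is kept as an optional value rather than removed):

// Sketch: prefer the per-environment value, fall back to the new default.
memoryLimit: args.environment.podsConfig.tracingCollector.memoryLimit ?? '512Mi',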

autoScaling: {
maxReplicas: args.environment.podsConfig.tracingCollector.maxReplicas,
cpu: {
limit: args.environment.podsConfig.tracingCollector.cpuLimit,
limit: '500m',
cpuAverageToScale: 80,
},
},
},
[args.dbMigrations],
).deploy();

// Egress: Redpanda -> ClickHouse
const egress = new ServiceDeployment(
'otel-collector-egress',
{
image: args.image,
imagePullSecret: args.docker.secret,
env: {
...args.environment.envVars,
KAFKA_BROKER: kafkaBroker,
},
probePort: 13133,
readinessProbe: '/',
livenessProbe: '/',
startupProbe: '/',
exposesMetrics: true,
replicas: args.environment.podsConfig.tracingCollector.maxReplicas,
pdb: true,
memoryLimit: '512Mi',
autoScaling: {
maxReplicas: args.environment.podsConfig.tracingCollector.maxReplicas,
cpu: {
limit: '500m',
cpuAverageToScale: 80,
},
},
@@ -57,4 +92,12 @@ export function deployOTELCollector(args: {
.withSecret('CLICKHOUSE_PASSWORD', args.clickhouse.secret, 'password')
.withSecret('CLICKHOUSE_PROTOCOL', args.clickhouse.secret, 'protocol')
.deploy();

return {
ingress,
egress,
// For backward compatibility, expose ingress as the main deployment
deployment: ingress.deployment,
service: ingress.service,
};
}
198 changes: 198 additions & 0 deletions deployment/services/redpanda.ts
@@ -0,0 +1,198 @@
import * as k8s from '@pulumi/kubernetes';
import * as pulumi from '@pulumi/pulumi';

export type Redpanda = ReturnType<typeof deployRedpanda>;

export function deployRedpanda() {
const redpandaConfig = new pulumi.Config('redpanda');
const replicas = redpandaConfig.getNumber('replicas') || 3;
const storageSize = redpandaConfig.get('storageSize') || '20Gi';
const memoryLimit = redpandaConfig.get('memoryLimit') || '1Gi';
const cpuLimit = redpandaConfig.get('cpuLimit') || '1000m';

const labels = { app: 'redpanda' };

// StatefulSet for Redpanda
const statefulSet = new k8s.apps.v1.StatefulSet('redpanda', {
metadata: {
name: 'redpanda',
},
spec: {
serviceName: 'redpanda',
replicas,
selector: {
matchLabels: labels,
},
template: {
metadata: {
labels,
},
spec: {
containers: [
{
name: 'redpanda',
image: 'redpandadata/redpanda:v25.3.1',
imagePullPolicy: 'Always',
resources: {
limits: {
cpu: cpuLimit,
memory: memoryLimit,
},
},
args: [
'redpanda',
'start',
'--overprovisioned',
Reviewer comment (Collaborator): Redpanda expects a dev system to be an overprovisioned environment. Based on a Seastar option, setting overprovisioned disables thread affinity, zeros idle polling time, and disables busy-poll for disk I/O.
https://docs.redpanda.com/current/reference/rpk/rpk-redpanda/rpk-redpanda-mode/#production-mode
We want to make sure this is disabled in production.
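A possible way to gate the flag, sketched as a suggestion (the `production` config key is hypothetical):

// Sketch: include --overprovisioned only outside production.
...(redpandaConfig.getBoolean('production') ? [] : ['--overprovisioned']),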

'--smp',
'1',
'--memory',
memoryLimit,
'--kafka-addr',
'PLAINTEXT://0.0.0.0:9092',
'--advertise-kafka-addr',
pulumi.interpolate`PLAINTEXT://\${HOSTNAME}.redpanda.default.svc.cluster.local:9092`,
],
ports: [
{ containerPort: 9092, name: 'kafka' },
{ containerPort: 8082, name: 'http' },
{ containerPort: 33145, name: 'rpc' },
{ containerPort: 9644, name: 'admin' },
],
volumeMounts: [
{
name: 'datadir',
mountPath: '/var/lib/redpanda/data',
},
],
livenessProbe: {
httpGet: {
path: '/v1/status/ready',
Reviewer comment (Collaborator): for most of our services we use _health. I'm not sure if we want to keep this consistent or if we don't mind...

port: 9644 as any,
},
initialDelaySeconds: 30,
periodSeconds: 10,
},
readinessProbe: {
httpGet: {
path: '/v1/status/ready',
port: 9644 as any,
},
initialDelaySeconds: 10,
periodSeconds: 5,
},
},
],
},
},
volumeClaimTemplates: [
{
metadata: {
name: 'datadir',
},
spec: {
accessModes: ['ReadWriteOnce'],
resources: {
requests: {
storage: storageSize,
},
},
},
},
],
},
});

// Headless Service for StatefulSet (used for internal cluster communication)
const headlessService = new k8s.core.v1.Service('redpanda-headless', {
metadata: {
name: 'redpanda',
},
spec: {
clusterIP: 'None',
selector: labels,
ports: [
{ name: 'kafka', port: 9092, targetPort: 9092 as any },
Reviewer comment (Collaborator): why do we need the any casts?
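They can likely be dropped: in @pulumi/kubernetes, IntOrString fields such as ServicePort.targetPort (and the probes' httpGet.port) accept pulumi.Input<number | string>, so a plain number should typecheck, e.g.:

{ name: 'kafka', port: 9092, targetPort: 9092 },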

{ name: 'http', port: 8082, targetPort: 8082 as any },
{ name: 'rpc', port: 33145, targetPort: 33145 as any },
{ name: 'admin', port: 9644, targetPort: 9644 as any },
],
},
});

// ClusterIP Service for clients (load balances across all pods)
const clientService = new k8s.core.v1.Service('redpanda-client-service', {
metadata: {
name: 'redpanda-client',
},
spec: {
type: 'ClusterIP',
selector: labels,
ports: [
{ name: 'kafka', port: 9092, targetPort: 9092 as any },
{ name: 'http', port: 8082, targetPort: 8082 as any },
],
},
});

// Create otel-traces topic
const topicCreationJob = new k8s.batch.v1.Job(
'redpanda-topic-creation',
{
metadata: {
name: 'redpanda-topic-creation',
},
spec: {
template: {
spec: {
restartPolicy: 'OnFailure',
containers: [
{
name: 'rpk',
image: 'redpandadata/redpanda:v25.3.1',
imagePullPolicy: 'Always',
command: [
'/bin/bash',
'-c',
`
# Wait for Redpanda to be ready
for i in {1..60}; do
if rpk cluster health --brokers redpanda-0.redpanda:9092 2>/dev/null | grep -q 'Healthy'; then
echo "Redpanda cluster is ready"
break
fi
echo "Waiting for Redpanda cluster... ($i/60)"
sleep 5
done
# Create topic with partitioning only (no replication)
rpk topic create otel-traces \\
--brokers redpanda-0.redpanda:9092 \\
--replicas 1 \\
--partitions 10 \\
--config retention.ms=2592000000 \\
--config compression.type=snappy \\
--config max.message.bytes=10485760 \\
|| echo "Topic may already exist"
# Verify topic creation
rpk topic describe otel-traces --brokers redpanda-0.redpanda:9092
`,
],
},
],
},
},
},
},
{ dependsOn: [statefulSet, headlessService] },
);

return {
statefulSet,
headlessService,
clientService,
topicCreationJob,
// Bootstrap endpoint (client Service); Kafka clients then discover the individual brokers from cluster metadata via the advertised addresses
brokerEndpoint: 'redpanda-client:9092',
};
}
21 changes: 21 additions & 0 deletions docker/configs/otel-collector/builder-config-egress.yaml
@@ -0,0 +1,21 @@
dist:
version: 0.122.0
name: otelcol-custom
description: Custom OTel Collector distribution
output_path: ./otelcol-custom

receivers:
- gomod: github.com/open-telemetry/opentelemetry-collector-contrib/receiver/kafkareceiver v0.122.0

processors:
- gomod: go.opentelemetry.io/collector/processor/batchprocessor v0.122.0

exporters:
- gomod: go.opentelemetry.io/collector/exporter/debugexporter v0.122.0
- gomod:
github.com/open-telemetry/opentelemetry-collector-contrib/exporter/clickhouseexporter v0.122.0

extensions:
- gomod:
github.com/open-telemetry/opentelemetry-collector-contrib/extension/healthcheckextension
v0.122.0
@@ -8,18 +8,14 @@ receivers:
- gomod: go.opentelemetry.io/collector/receiver/otlpreceiver v0.122.0

processors:
- gomod: go.opentelemetry.io/collector/processor/batchprocessor v0.122.0
- gomod: go.opentelemetry.io/collector/processor/memorylimiterprocessor v0.122.0
- gomod:
github.com/open-telemetry/opentelemetry-collector-contrib/processor/attributesprocessor
v0.122.0
- gomod:
github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor v0.122.0

exporters:
- gomod: go.opentelemetry.io/collector/exporter/debugexporter v0.122.0
- gomod:
github.com/open-telemetry/opentelemetry-collector-contrib/exporter/clickhouseexporter v0.122.0
- gomod: github.com/open-telemetry/opentelemetry-collector-contrib/exporter/kafkaexporter v0.122.0

extensions:
- gomod: