|
| 1 | +use crate::k8s::{K8sClient, K8sError, PodPhase}; |
1 | 2 | use async_trait::async_trait;
|
2 | 3 | use base64::{Engine, prelude::BASE64_STANDARD};
|
| 4 | +use etl_config::Environment; |
3 | 5 | use k8s_openapi::api::{
|
4 | 6 | apps::v1::StatefulSet,
|
5 | 7 | core::v1::{ConfigMap, Pod, Secret},
|
6 | 8 | };
|
7 |
| -use serde_json::json; |
8 |
| -use std::collections::BTreeMap; |
9 |
| -use thiserror::Error; |
10 |
| -use tracing::info; |
11 |
| - |
12 | 9 | use kube::{
|
13 | 10 | Client,
|
14 | 11 | api::{Api, DeleteParams, Patch, PatchParams},
|
15 | 12 | };
|
| 13 | +use serde_json::json; |
| 14 | +use std::collections::BTreeMap; |
| 15 | +use tracing::info; |
16 | 16 |
|
17 |
| -#[derive(Debug, Error)] |
18 |
| -pub enum K8sError { |
19 |
| - #[error("serde_json error: {0}")] |
20 |
| - Serde(#[from] serde_json::error::Error), |
21 |
| - |
22 |
| - #[error("kube error: {0}")] |
23 |
| - Kube(#[from] kube::Error), |
24 |
| -} |
25 |
| - |
26 |
| -pub enum PodPhase { |
27 |
| - Pending, |
28 |
| - Running, |
29 |
| - Succeeded, |
30 |
| - Failed, |
31 |
| - Unknown, |
32 |
| -} |
33 |
| - |
34 |
| -impl From<&str> for PodPhase { |
35 |
| - fn from(value: &str) -> Self { |
36 |
| - match value { |
37 |
| - "Pending" => PodPhase::Pending, |
38 |
| - "Running" => PodPhase::Running, |
39 |
| - "Succeeded" => PodPhase::Succeeded, |
40 |
| - "Failed" => PodPhase::Failed, |
41 |
| - _ => PodPhase::Unknown, |
42 |
| - } |
43 |
| - } |
44 |
| -} |
45 |
| - |
46 |
| -#[async_trait] |
47 |
| -pub trait K8sClient: Send + Sync { |
48 |
| - async fn create_or_update_postgres_secret( |
49 |
| - &self, |
50 |
| - prefix: &str, |
51 |
| - postgres_password: &str, |
52 |
| - ) -> Result<(), K8sError>; |
53 |
| - |
54 |
| - async fn create_or_update_bq_secret( |
55 |
| - &self, |
56 |
| - prefix: &str, |
57 |
| - bq_service_account_key: &str, |
58 |
| - ) -> Result<(), K8sError>; |
59 |
| - |
60 |
| - async fn delete_postgres_secret(&self, prefix: &str) -> Result<(), K8sError>; |
61 |
| - |
62 |
| - async fn delete_bq_secret(&self, prefix: &str) -> Result<(), K8sError>; |
63 |
| - |
64 |
| - async fn get_config_map(&self, config_map_name: &str) -> Result<ConfigMap, K8sError>; |
65 |
| - |
66 |
| - async fn create_or_update_config_map( |
67 |
| - &self, |
68 |
| - prefix: &str, |
69 |
| - base_config: &str, |
70 |
| - prod_config: &str, |
71 |
| - ) -> Result<(), K8sError>; |
72 |
| - |
73 |
| - async fn delete_config_map(&self, prefix: &str) -> Result<(), K8sError>; |
74 |
| - |
75 |
| - async fn create_or_update_stateful_set( |
76 |
| - &self, |
77 |
| - prefix: &str, |
78 |
| - replicator_image: &str, |
79 |
| - template_annotations: Option<BTreeMap<String, String>>, |
80 |
| - ) -> Result<(), K8sError>; |
81 |
| - |
82 |
| - async fn delete_stateful_set(&self, prefix: &str) -> Result<(), K8sError>; |
83 |
| - |
84 |
| - async fn get_pod_phase(&self, prefix: &str) -> Result<PodPhase, K8sError>; |
85 |
| - |
86 |
| - async fn has_replicator_container_error(&self, prefix: &str) -> Result<bool, K8sError>; |
87 |
| - |
88 |
| - async fn delete_pod(&self, prefix: &str) -> Result<(), K8sError>; |
89 |
| -} |
90 |
| - |
91 |
| -#[derive(Debug)] |
92 |
| -pub struct HttpK8sClient { |
93 |
| - secrets_api: Api<Secret>, |
94 |
| - config_maps_api: Api<ConfigMap>, |
95 |
| - stateful_sets_api: Api<StatefulSet>, |
96 |
| - pods_api: Api<Pod>, |
97 |
| -} |
98 |
| - |
| 17 | +/// Secret name suffix for the BigQuery service account key. |
99 | 18 | const BQ_SECRET_NAME_SUFFIX: &str = "bq-service-account-key";
|
| 19 | +/// Secret name suffix for the Postgres password. |
100 | 20 | const POSTGRES_SECRET_NAME_SUFFIX: &str = "postgres-password";
|
| 21 | +/// ConfigMap name suffix for the replicator configuration files. |
101 | 22 | const REPLICATOR_CONFIG_MAP_NAME_SUFFIX: &str = "replicator-config";
|
| 23 | +/// StatefulSet name suffix for the replicator workload. |
102 | 24 | const REPLICATOR_STATEFUL_SET_SUFFIX: &str = "replicator-stateful-set";
|
| 25 | +/// Application label suffix used to group resources. |
103 | 26 | const REPLICATOR_APP_SUFFIX: &str = "replicator-app";
|
| 27 | +/// Container name suffix for the replicator container. |
104 | 28 | const REPLICATOR_CONTAINER_NAME_SUFFIX: &str = "replicator";
|
| 29 | +/// Container name suffix for the Vector sidecar. |
105 | 30 | const VECTOR_CONTAINER_NAME_SUFFIX: &str = "vector";
|
| 31 | +/// Namespace where data-plane resources are created. |
106 | 32 | const DATA_PLANE_NAMESPACE: &str = "etl-data-plane";
|
| 33 | +/// Secret storing the Logflare API key. |
107 | 34 | const LOGFLARE_SECRET_NAME: &str = "replicator-logflare-api-key";
|
| 35 | +/// Docker image used for the Vector sidecar. |
108 | 36 | const VECTOR_IMAGE_NAME: &str = "timberio/vector:0.46.1-distroless-libc";
|
| 37 | +/// ConfigMap name containing the Vector configuration. |
109 | 38 | const VECTOR_CONFIG_MAP_NAME: &str = "replicator-vector-config";
|
| 39 | +/// Volume name for the replicator config file. |
110 | 40 | const REPLICATOR_CONFIG_FILE_VOLUME_NAME: &str = "replicator-config-file";
|
| 41 | +/// Volume name for the Vector config file. |
111 | 42 | const VECTOR_CONFIG_FILE_VOLUME_NAME: &str = "vector-config-file";
|
| 43 | +/// Secret storing the Sentry DSN. |
112 | 44 | const SENTRY_DSN_SECRET_NAME: &str = "replicator-sentry-dsn";
|
| 45 | +/// EmptyDir volume name used to share logs. |
113 | 46 | const LOGS_VOLUME_NAME: &str = "logs";
|
| 47 | +/// ConfigMap name providing trusted root certificates. |
114 | 48 | pub const TRUSTED_ROOT_CERT_CONFIG_MAP_NAME: &str = "trusted-root-certs-config";
|
| 49 | +/// Key inside the trusted root certificates ConfigMap. |
115 | 50 | pub const TRUSTED_ROOT_CERT_KEY_NAME: &str = "trusted_root_certs";
|
| 51 | +/// Environment variable for the Postgres password. |
116 | 52 | const PG_PASSWORD_ENV_VAR_NAME: &str = "APP_PIPELINE__PG_CONNECTION__PASSWORD";
|
| 53 | +/// Environment variable for the BigQuery service account key. |
117 | 54 | const BIG_QUERY_SA_KEY_ENV_VAR_NAME: &str = "APP_DESTINATION__BIG_QUERY__SERVICE_ACCOUNT_KEY";
|
| 55 | +/// Pod template annotation used to trigger rolling restarts. |
118 | 56 | pub const RESTARTED_AT_ANNOTATION_KEY: &str = "etl.supabase.com/restarted-at";
|
| 57 | +/// Label used to identify replicator pods. |
119 | 58 | const REPLICATOR_APP_LABEL: &str = "etl-replicator-app";
|
120 | 59 |
|
| 60 | +/// Replicator memory limit tuned for `c6in.4xlarge` instances. |
| 61 | +const REPLICATOR_MAX_MEMORY_PROD: &str = "500Mi"; |
| 62 | +/// Replicator CPU limit tuned for `c6in.4xlarge` instances. |
| 63 | +const REPLICATOR_MAX_CPU_PROD: &str = "100m"; |
| 64 | +/// Replicator memory limit tuned for `t3.small` instances. |
| 65 | +const REPLICATOR_MAX_MEMORY_STAGING: &str = "100Mi"; |
| 66 | +/// Replicator CPU limit tuned for `t3.small` instances. |
| 67 | +const REPLICATOR_MAX_CPU_STAGING: &str = "100m"; |
| 68 | + |
| 69 | +/// Runtime limits derived from the current environment. |
| 70 | +struct DynamicReplicatorConfig { |
| 71 | + max_memory: &'static str, |
| 72 | + max_cpu: &'static str, |
| 73 | +} |
| 74 | + |
| 75 | +impl DynamicReplicatorConfig { |
| 76 | + /// Loads the runtime limits for the current environment. |
| 77 | + fn load() -> Result<Self, K8sError> { |
| 78 | + let environment = Environment::load().map_err(|_| K8sError::ReplicatorConfiguration)?; |
| 79 | + |
| 80 | + let config = match environment { |
| 81 | + Environment::Prod => Self { |
| 82 | + max_memory: REPLICATOR_MAX_MEMORY_PROD, |
| 83 | + max_cpu: REPLICATOR_MAX_CPU_PROD, |
| 84 | + }, |
| 85 | + _ => Self { |
| 86 | + max_memory: REPLICATOR_MAX_MEMORY_STAGING, |
| 87 | + max_cpu: REPLICATOR_MAX_CPU_STAGING, |
| 88 | + }, |
| 89 | + }; |
| 90 | + |
| 91 | + Ok(config) |
| 92 | + } |
| 93 | +} |
| 94 | + |
| 95 | +/// HTTP-based implementation of [`K8sClient`]. |
| 96 | +/// |
| 97 | +/// The client is namespaced to the data-plane namespace and uses server-side |
| 98 | +/// apply to keep resources in sync. |
| 99 | +#[derive(Debug)] |
| 100 | +pub struct HttpK8sClient { |
| 101 | + secrets_api: Api<Secret>, |
| 102 | + config_maps_api: Api<ConfigMap>, |
| 103 | + stateful_sets_api: Api<StatefulSet>, |
| 104 | + pods_api: Api<Pod>, |
| 105 | +} |
| 106 | + |
121 | 107 | impl HttpK8sClient {
|
| 108 | + /// Creates a new [`HttpK8sClient`] using the ambient Kubernetes config. |
| 109 | + /// |
| 110 | + /// Prefers in-cluster configuration and falls back to the local kubeconfig |
| 111 | + /// when running outside the cluster. |
122 | 112 | pub async fn new() -> Result<HttpK8sClient, K8sError> {
|
123 | 113 | let client = Client::try_default().await?;
|
124 | 114 |
|
@@ -320,6 +310,8 @@ impl K8sClient for HttpK8sClient {
|
320 | 310 | let bq_secret_name = format!("{prefix}-{BQ_SECRET_NAME_SUFFIX}");
|
321 | 311 | let replicator_config_map_name = format!("{prefix}-{REPLICATOR_CONFIG_MAP_NAME_SUFFIX}");
|
322 | 312 |
|
| 313 | + let config = DynamicReplicatorConfig::load()?; |
| 314 | + |
323 | 315 | let mut stateful_set_json = json!({
|
324 | 316 | "apiVersion": "apps/v1",
|
325 | 317 | "kind": "StatefulSet",
|
@@ -393,11 +385,12 @@ impl K8sClient for HttpK8sClient {
|
393 | 385 | ],
|
394 | 386 | "resources": {
|
395 | 387 | "limits": {
|
396 |
| - "memory": "200Mi", |
| 388 | + "memory": config.max_memory, |
| 389 | + "cpu": config.max_cpu, |
397 | 390 | },
|
398 | 391 | "requests": {
|
399 |
| - "memory": "200Mi", |
400 |
| - "cpu": "100m" |
| 392 | + "memory": config.max_memory, |
| 393 | + "cpu": config.max_cpu, |
401 | 394 | }
|
402 | 395 | },
|
403 | 396 | "volumeMounts": [
|
|
0 commit comments