diff --git a/services/jaeger/docker-compose.yml.j2 b/services/jaeger/docker-compose.yml.j2 index 7de7b7ceb..4398652ff 100644 --- a/services/jaeger/docker-compose.yml.j2 +++ b/services/jaeger/docker-compose.yml.j2 @@ -33,7 +33,7 @@ services: memory: 64M cpus: "0.1" otel-collector: - image: otel/opentelemetry-collector-contrib:0.123.0 + image: otel/opentelemetry-collector-contrib:0.135.0 configs: - source: opentelemetry-collector-config target: /etc/otel/config.yaml diff --git a/services/jaeger/opentelemetry-collector-config.yaml b/services/jaeger/opentelemetry-collector-config.yaml index e8398d7e7..b311fa148 100644 --- a/services/jaeger/opentelemetry-collector-config.yaml +++ b/services/jaeger/opentelemetry-collector-config.yaml @@ -31,6 +31,11 @@ processors: error_mode: ignore traces: span: + - attributes["http.route"] == "/health" + - attributes["http.target"] == "/socket.io/" + - attributes["url.path"] == "/socket.io/" + - IsMatch(attributes["server.address"], "^monitoring..*") + - attributes["net.peer.name"] == "/var/run/docker.sock" - attributes["http.route"] == "healthcheck_readiness_probe" - attributes["http.route"] == "healthcheck_liveness_probe" - attributes["http.target"] == "/metrics" and IsMatch(attributes["http.user_agent"], ".*Prometheus.*") == true diff --git a/services/logging/docker-compose.yml.j2 b/services/logging/docker-compose.yml.j2 index c6b2b030d..7b1052649 100644 --- a/services/logging/docker-compose.yml.j2 +++ b/services/logging/docker-compose.yml.j2 @@ -138,7 +138,7 @@ services: start_period: 40s loki: - image: grafana/loki:3.5.0 + image: grafana/loki:3.5.4 configs: - source: loki_config target: /etc/loki/loki.yaml @@ -160,8 +160,8 @@ services: delay: 5s resources: limits: - cpus: '1.0' - memory: 2G + cpus: '2.0' + memory: 4G reservations: cpus: '0.5' memory: 1G diff --git a/services/logging/loki.yaml b/services/logging/loki.yaml index 5100e6f7d..8e43ec4ff 100644 --- a/services/logging/loki.yaml +++ b/services/logging/loki.yaml @@ -1,8
+1,10 @@ auth_enabled: false server: + log_level: info http_listen_port: 3100 - + grpc_server_max_recv_msg_size: 16777216 # 16MB (increase from 4MB default) + grpc_server_max_send_msg_size: 16777216 # 16MB common: path_prefix: /tmp/loki # Required for internal directories @@ -29,16 +31,30 @@ storage_config: active_index_directory: /tmp/loki/tsdb-index # Local cache for index metadata cache_location: /tmp/loki/tsdb-cache # Local cache for tsdb aws: - s3: s3://${S3_BUCKET_NAME_LOKI} + bucketnames: ${S3_BUCKET_NAME_LOKI} region: ${S3_REGION_LOKI} access_key_id: ${S3_ACCESS_KEY_LOKI} secret_access_key: ${S3_SECRET_KEY_LOKI} - s3forcepathstyle: ${S3_FORCE_PATH_STYLE_LOKI} # Set to true if using MinIO or S3-compatible API; optional for AWS - endpoint: ${S3_ENDPOINT_LOKI} # Optional; use for non-default endpoints + endpoint: ${S3_ENDPOINT_LOKI} + s3forcepathstyle: true compactor: working_directory: /tmp/loki/compactor retention_enabled: false limits_config: + reject_old_samples: true + reject_old_samples_max_age: 4h + max_cache_freshness_per_query: 10m + split_queries_by_interval: 15m + # Tuned for high-volume log streams. NOTE(review): ingestion_burst_size_mb (1000) is lower than ingestion_rate_mb (10000) - confirm the two values are not swapped + per_stream_rate_limit: 512M + per_stream_rate_limit_burst: 1024M + cardinality_limit: 200000 + ingestion_burst_size_mb: 1000 + ingestion_rate_mb: 10000 + max_entries_limit_per_query: 1000000 + max_label_value_length: 20480 + max_label_name_length: 10240 + max_label_names_per_series: 300 retention_period: ${LOKI_RETENTION_PERIOD} # must be >= 24h and multiple of index period (24h) diff --git a/services/monitoring/docker-compose.yml.j2 b/services/monitoring/docker-compose.yml.j2 index c5b6ee757..3c350fb09 100644 --- a/services/monitoring/docker-compose.yml.j2 +++ b/services/monitoring/docker-compose.yml.j2 @@ -372,7 +372,7 @@ services: memory: 32M cpus: "0.1" tempo: - image: grafana/tempo:2.7.2 + image: grafana/tempo:2.8.2 command: "-target=scalable-single-binary -config.file=/etc/tempo.yaml" configs: - source: tempo_config @@ -393,8 +393,8 @@ services: -
traefik.http.routers.tempo.middlewares=ops_whitelist_ips@swarm, ops_gzip@swarm, tempo_replace_regex resources: limits: - memory: 2000M - cpus: "2.0" + memory: 4G + cpus: "5.0" configs: alertmanager_config: diff --git a/services/monitoring/tempo_config.yaml.j2 b/services/monitoring/tempo_config.yaml.j2 index 1ea39d90c..c28c41425 100644 --- a/services/monitoring/tempo_config.yaml.j2 +++ b/services/monitoring/tempo_config.yaml.j2 @@ -12,9 +12,9 @@ distributor: log_discarded_spans: enabled: true include_all_attributes: false -#ingester: -# max_block_duration: 5m # cut the headblock when this much time passes. this should probably be left alone normally - +ingester: + max_block_duration: 5m # cut the headblock when this much time passes. this should probably be left alone normally + trace_idle_period: 301s compactor: compaction: block_retention: 96h # overall Tempo trace retention. @@ -59,7 +59,7 @@ storage: querier: frontend_worker: frontend_address: 0.0.0.0:9095 - +stream_over_http_enabled: true overrides: defaults: metrics_generator: