diff --git a/scripts/deployments/deploy_everything_locally.bash b/scripts/deployments/deploy_everything_locally.bash index e1c0c1f1..a86c911c 100755 --- a/scripts/deployments/deploy_everything_locally.bash +++ b/scripts/deployments/deploy_everything_locally.bash @@ -243,9 +243,9 @@ if [ "$start_opsstack" -eq 0 ]; then call_make "." up-"$stack_target"; popd - # -------------------------------- GRAYLOG ------------------------------- - log_info "starting graylog..." - service_dir="${repo_basedir}"/services/graylog + # -------------------------------- LOGGING ------------------------------- + log_info "starting logging..." + service_dir="${repo_basedir}"/services/logging pushd "${service_dir}" call_make "." up-"$stack_target" sleep 1 diff --git a/services/graylog/GraylogWorkflow.png b/services/graylog/GraylogWorkflow.png deleted file mode 100644 index 0f795fa6..00000000 Binary files a/services/graylog/GraylogWorkflow.png and /dev/null differ diff --git a/services/graylog/docker-compose.aws.yml b/services/graylog/docker-compose.aws.yml deleted file mode 100644 index b360b20c..00000000 --- a/services/graylog/docker-compose.aws.yml +++ /dev/null @@ -1,18 +0,0 @@ -services: - mongodb: - deploy: - placement: - constraints: - - node.labels.graylog==true - elasticsearch: - deploy: - placement: - constraints: - - node.labels.graylog==true - graylog: - dns: # Add this always for AWS, otherwise we get "No such image: " for docker services - 8.8.8.8 - deploy: - placement: - constraints: - - node.labels.graylog==true diff --git a/services/graylog/docker-compose.dalco.yml b/services/graylog/docker-compose.dalco.yml deleted file mode 100644 index ad187885..00000000 --- a/services/graylog/docker-compose.dalco.yml +++ /dev/null @@ -1,18 +0,0 @@ -services: - mongodb: - deploy: - placement: - constraints: - - node.labels.graylog==true - - elasticsearch: - deploy: - placement: - constraints: - - node.labels.graylog==true - - graylog: - deploy: - placement: - constraints: - - 
node.labels.graylog==true diff --git a/services/graylog/docker-compose.master.yml b/services/graylog/docker-compose.master.yml deleted file mode 100644 index ad187885..00000000 --- a/services/graylog/docker-compose.master.yml +++ /dev/null @@ -1,18 +0,0 @@ -services: - mongodb: - deploy: - placement: - constraints: - - node.labels.graylog==true - - elasticsearch: - deploy: - placement: - constraints: - - node.labels.graylog==true - - graylog: - deploy: - placement: - constraints: - - node.labels.graylog==true diff --git a/services/graylog/.gitignore b/services/logging/.gitignore similarity index 100% rename from services/graylog/.gitignore rename to services/logging/.gitignore diff --git a/services/graylog/Makefile b/services/logging/Makefile similarity index 100% rename from services/graylog/Makefile rename to services/logging/Makefile diff --git a/services/graylog/README.md b/services/logging/README.md similarity index 100% rename from services/graylog/README.md rename to services/logging/README.md diff --git a/services/graylog/data/contentpacks/osparc-custom-content-pack-v2.json b/services/logging/data/contentpacks/osparc-custom-content-pack-v2.json similarity index 100% rename from services/graylog/data/contentpacks/osparc-custom-content-pack-v2.json rename to services/logging/data/contentpacks/osparc-custom-content-pack-v2.json diff --git a/services/logging/docker-compose.aws.yml b/services/logging/docker-compose.aws.yml new file mode 100644 index 00000000..cae555cc --- /dev/null +++ b/services/logging/docker-compose.aws.yml @@ -0,0 +1,28 @@ +services: + mongodb: + deploy: + placement: + constraints: + - node.labels.logging==true + elasticsearch: + deploy: + placement: + constraints: + - node.labels.logging==true + graylog: + deploy: + placement: + constraints: + - node.labels.logging==true + + fluentd: + deploy: + placement: + constraints: + - node.labels.logging==true + + loki: + deploy: + placement: + constraints: + - node.labels.logging==true diff --git 
a/services/logging/docker-compose.dalco.yml b/services/logging/docker-compose.dalco.yml new file mode 100644 index 00000000..c8c7f863 --- /dev/null +++ b/services/logging/docker-compose.dalco.yml @@ -0,0 +1,30 @@ +services: + mongodb: + deploy: + placement: + constraints: + - node.labels.logging==true + + elasticsearch: + deploy: + placement: + constraints: + - node.labels.logging==true + + graylog: + deploy: + placement: + constraints: + - node.labels.logging==true + + fluentd: + deploy: + placement: + constraints: + - node.labels.logging==true + + loki: + deploy: + placement: + constraints: + - node.labels.logging==true diff --git a/services/graylog/docker-compose.letsencrypt.dns.yml b/services/logging/docker-compose.letsencrypt.dns.yml similarity index 100% rename from services/graylog/docker-compose.letsencrypt.dns.yml rename to services/logging/docker-compose.letsencrypt.dns.yml diff --git a/services/graylog/docker-compose.letsencrypt.http.yml b/services/logging/docker-compose.letsencrypt.http.yml similarity index 100% rename from services/graylog/docker-compose.letsencrypt.http.yml rename to services/logging/docker-compose.letsencrypt.http.yml diff --git a/services/graylog/docker-compose.local.yml b/services/logging/docker-compose.local.yml similarity index 76% rename from services/graylog/docker-compose.local.yml rename to services/logging/docker-compose.local.yml index 036d0dee..69bc86ad 100644 --- a/services/graylog/docker-compose.local.yml +++ b/services/logging/docker-compose.local.yml @@ -13,3 +13,8 @@ services: deploy: placement: constraints: [] + + fluentd: + deploy: + placement: + constraints: [] diff --git a/services/logging/docker-compose.master.yml b/services/logging/docker-compose.master.yml new file mode 100644 index 00000000..c8c7f863 --- /dev/null +++ b/services/logging/docker-compose.master.yml @@ -0,0 +1,30 @@ +services: + mongodb: + deploy: + placement: + constraints: + - node.labels.logging==true + + elasticsearch: + deploy: + placement: + 
constraints: + - node.labels.logging==true + + graylog: + deploy: + placement: + constraints: + - node.labels.logging==true + + fluentd: + deploy: + placement: + constraints: + - node.labels.logging==true + + loki: + deploy: + placement: + constraints: + - node.labels.logging==true diff --git a/services/logging/docker-compose.yml.j2 b/services/logging/docker-compose.yml.j2 new file mode 100644 index 00000000..1f822ee8 --- /dev/null +++ b/services/logging/docker-compose.yml.j2 @@ -0,0 +1,196 @@ +services: + # MongoDB: https://hub.docker.com/_/mongo/ + mongodb: + image: mongo:6.0.6 + init: true + volumes: + # data persistency + - mongo_data:/data/db + deploy: + replicas: 1 + restart_policy: + condition: on-failure + resources: + limits: + memory: 1.2G + cpus: "1" + reservations: + memory: 300M + cpus: "0.1" + networks: + graylog: + aliases: + - mongo # needed because of graylog configuration + + # Elasticsearch: https://www.elastic.co/guide/en/elasticsearch/reference/6.6/docker.html + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch-oss:7.10.2 + init: true + volumes: + # data persistency + - elasticsearch_data:/usr/share/elasticsearch/data + environment: + - http.host=0.0.0.0 + - transport.host=localhost + - network.host=0.0.0.0 + - "ES_JAVA_OPTS=-Xms512m -Xmx512m" + deploy: + replicas: 1 + restart_policy: + condition: on-failure + resources: + limits: + memory: 2G + cpus: "2" + reservations: + memory: 1G + cpus: "0.1" + networks: + graylog: + # Graylog: https://hub.docker.com/r/graylog/graylog/ + graylog: + image: graylog/graylog:6.0.5 + init: true + # user: "1000:1001" + configs: + - source: graylog_config + target: /files/osparc-custom-content-pack-v2.json + volumes: + # Mount local configuration directory into Docker container + # - graylog_config:/usr/share/graylog/data/config + # data persistency + - graylog_journal:/usr/share/graylog/data/journal + env_file: + - .env + environment: + # CHANGE ME (must be at least 16 characters)! 
+ - GRAYLOG_PASSWORD_SECRET=${GRAYLOG_PASSWORD_SECRET} + # Username: admin + - GRAYLOG_ROOT_PASSWORD_SHA2=${GRAYLOG_ROOT_PASSWORD_SHA2} + - GRAYLOG_HTTP_EXTERNAL_URI=${GRAYLOG_HTTP_EXTERNAL_URI} + - GRAYLOG_ELASTICSEARCH_HOSTS=http://elasticsearch:9200, + networks: + public: + monitoring: + graylog: + aliases: + - graylog + ports: + - 12201:12201/udp + - 12202:12202/udp + deploy: + replicas: 1 + restart_policy: + condition: on-failure + resources: + limits: + cpus: "2.00" + memory: 5G + reservations: + cpus: "0.1" + memory: 1G + labels: + - traefik.enable=true + - traefik.docker.network=${PUBLIC_NETWORK} + # direct access through port + - traefik.http.services.graylog.loadbalancer.server.port=9000 + - traefik.http.routers.graylog.rule=Host(`${MONITORING_DOMAIN}`) && PathPrefix(`/graylog`) + - traefik.http.routers.graylog.entrypoints=https + - traefik.http.routers.graylog.tls=true + - traefik.http.middlewares.graylog_replace_regex.replacepathregex.regex=^/graylog/?(.*)$$ + - traefik.http.middlewares.graylog_replace_regex.replacepathregex.replacement=/$${1} + - traefik.http.routers.graylog.middlewares=ops_whitelist_ips@swarm, ops_gzip@swarm, graylog_replace_regex + fluentd: + image: itisfoundation/fluentd:v1.16.9-1.0 + configs: + - source: fluentd_config + target: /fluentd/etc/fluent.conf + environment: + - GRAYLOG_HOST=graylog + - GRAYLOG_PORT=12201 + - LOKI_URL=http://loki:3100 + - FLUENTD_HOSTNAME={% raw %}{{.Node.Hostname}}{% endraw %} + ports: + - "24224:24224/tcp" + deploy: + #mode: global # Run on all nodes + restart_policy: + condition: on-failure + resources: + limits: + cpus: '1.0' + memory: 1G + reservations: + cpus: '0.5' + memory: 512M + update_config: + parallelism: 1 + delay: 10s + order: start-first + networks: + - monitoring + - graylog + healthcheck: + test: ["CMD", "curl", "-f", "http://0.0.0.0:24220/api/plugins"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + + loki: + image: grafana/loki:3.5.0 + configs: + - source: loki_config 
+ target: /etc/loki/loki.yaml + command: -config.file=/etc/loki/loki.yaml + deploy: + placement: + constraints: [] + replicas: 1 + restart_policy: + condition: any + delay: 5s + resources: + limits: + cpus: '1.0' + memory: 2G + reservations: + cpus: '0.5' + memory: 1G + update_config: + parallelism: 1 + delay: 10s + order: start-first + networks: + - monitoring + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://0.0.0.0:3100/ready"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + + +volumes: + mongo_data: + elasticsearch_data: + graylog_journal: + +networks: + graylog: + public: + external: true + name: ${PUBLIC_NETWORK} + monitoring: + external: true + name: ${MONITORED_NETWORK} +configs: + graylog_config: + name: ${STACK_NAME}_graylog_config_{{ "./data/contentpacks/osparc-custom-content-pack-v2.json" | sha256file | substring(0,10) }} + file: ./data/contentpacks/osparc-custom-content-pack-v2.json + fluentd_config: + name: ${STACK_NAME}_fluentd_config_{{ "./fluentd/fluent.conf" | sha256file | substring(0,10) }} + file: ./fluentd/fluent.conf + loki_config: + name: ${STACK_NAME}_loki_config_{{ "./loki.yaml" | sha256file | substring(0,10) }} + file: ./loki.yaml diff --git a/services/logging/fluentd/Dockerfile b/services/logging/fluentd/Dockerfile new file mode 100644 index 00000000..ee66adff --- /dev/null +++ b/services/logging/fluentd/Dockerfile @@ -0,0 +1,26 @@ +FROM fluent/fluentd:v1.16.9-1.0 + +USER root + +# Install dependencies and plugins +RUN apk add --no-cache --update --virtual .build-deps \ + sudo build-base ruby-dev curl \ + && sudo gem install fluent-plugin-grafana-loki \ + && sudo gem install fluent-plugin-gelf-best \ + && sudo gem install fluent-plugin-prometheus \ + && apk del .build-deps \ + && apk add --no-cache curl jq \ + && rm -rf /var/cache/apk/* \ + && rm -rf /tmp/* /var/tmp/* /usr/lib/ruby/gems/*/cache/*.gem + +# Create directories with appropriate permissions +RUN mkdir -p /fluentd/buffer 
/fluentd/log \ + && chown -R fluent:fluent /fluentd/buffer /fluentd/log + +# Health check +HEALTHCHECK --interval=30s --timeout=30s --retries=3 \ + CMD curl -s http://localhost:24220/api/plugins | jq -e '.plugins | length > 0' || exit 1 + +USER fluent + +ENTRYPOINT ["fluentd", "-c", "/fluentd/etc/fluent.conf"] diff --git a/services/logging/fluentd/Makefile b/services/logging/fluentd/Makefile new file mode 100644 index 00000000..c353f2f3 --- /dev/null +++ b/services/logging/fluentd/Makefile @@ -0,0 +1,9 @@ +.DEFAULT_GOAL := help +STACK_NAME = $(notdir $(shell pwd)/..) +REPO_BASE_DIR := $(shell git rev-parse --show-toplevel) + +# TARGETS -------------------------------------------------- +include ${REPO_BASE_DIR}/scripts/common.Makefile + +build: + @docker build -t itisfoundation/fluentd:v1.16.9-1.0 . diff --git a/services/logging/fluentd/README.md b/services/logging/fluentd/README.md new file mode 100644 index 00000000..40979a86 --- /dev/null +++ b/services/logging/fluentd/README.md @@ -0,0 +1,4 @@ +There is currently no CD for building the fluentd image. +It has to be built and pushed manually: + +Run e.g. `docker build -t itisfoundation/fluentd:v1.16.9-1.0 .` in this folder, then push the image to dockerhub. 
diff --git a/services/logging/fluentd/fluent.conf b/services/logging/fluentd/fluent.conf new file mode 100644 index 00000000..e562e59c --- /dev/null +++ b/services/logging/fluentd/fluent.conf @@ -0,0 +1,111 @@ +# Monitoring +<source> + @type monitor_agent + bind 0.0.0.0 + port 24220 +</source> + +# Prometheus metrics +<source> + @type prometheus + bind 0.0.0.0 + port 24231 + metrics_path metrics +</source> + +<system> + workers 1 +</system> + +<source> + @type prometheus_output_monitor + interval 10 + <labels> + hostname ${hostname} + </labels> +</source> + +# Input: Receive logs from Docker containers +<source> + @type forward + port 24224 + bind 0.0.0.0 +</source> + +# Add additional metadata +<filter **> + @type record_transformer + <record> + hostname "#{Socket.gethostname}" + fluentd_hostname "#{ENV['FLUENTD_HOSTNAME']}" + tag ${tag} + </record> +</filter> + +# Output to both Graylog (GELF) and Loki +<match **> + @type copy + + # Output to Graylog using GELF + <store> + @type gelf + host logging_graylog + port 12201 + protocol udp + add_msec_time true + flush_interval 5s + <buffer> + @type file + path /fluentd/buffer/graylog + flush_thread_count 8 + flush_interval 5s + retry_forever false + retry_timeout 1h + retry_max_times 30 + retry_randomize true + chunk_limit_size 8M + total_limit_size 2G + overflow_action drop_oldest_chunk + flush_mode interval + </buffer> + <secondary> + @type file + path /fluentd/log/graylog-error + append true + <format> + @type json + </format> + </secondary> + </store> + + # Output to Loki + <store> + @type loki + url "#{ENV['LOKI_URL']}" + extra_labels {"job": "docker"} + line_format json + username "" + password "" + flush_interval 5s + <buffer> + @type file + path /fluentd/buffer/loki + flush_thread_count 8 + flush_interval 5s + retry_forever false + retry_max_interval 30 + retry_max_times 30 + retry_randomize true + chunk_limit_size 8M + total_limit_size 2G + </buffer> + <secondary> + @type file + path /fluentd/log/loki-error + append true + <format> + @type json + </format> + </secondary> + </store> +</match> diff --git a/services/logging/loki.yaml b/services/logging/loki.yaml new file mode 100644 index 00000000..5100e6f7 --- /dev/null +++ b/services/logging/loki.yaml @@ -0,0 +1,44 @@ +auth_enabled: false + +server: + http_listen_port: 3100 +
+common: + path_prefix: /tmp/loki # Required for internal directories + +ingester: + lifecycler: + address: 0.0.0.0 + ring: + kvstore: + store: inmemory + replication_factor: 1 + +schema_config: + configs: + - from: 2020-10-15 + store: tsdb + object_store: s3 + schema: v13 + index: + prefix: index_ + period: 24h + +storage_config: + tsdb_shipper: + active_index_directory: /tmp/loki/tsdb-index # Local cache for index metadata + cache_location: /tmp/loki/tsdb-cache # Local cache for tsdb + aws: + s3: s3://${S3_BUCKET_NAME_LOKI} + region: ${S3_REGION_LOKI} + access_key_id: ${S3_ACCESS_KEY_LOKI} + secret_access_key: ${S3_SECRET_KEY_LOKI} + s3forcepathstyle: ${S3_FORCE_PATH_STYLE_LOKI} # Set to true if using MinIO or S3-compatible API; optional for AWS + endpoint: ${S3_ENDPOINT_LOKI} # Optional; use for non-default endpoints + +compactor: + working_directory: /tmp/loki/compactor + retention_enabled: false + +limits_config: + retention_period: ${LOKI_RETENTION_PERIOD} # must be >= 24h and multiple of index period (24h) diff --git a/services/graylog/scripts/.gitignore b/services/logging/scripts/.gitignore similarity index 100% rename from services/graylog/scripts/.gitignore rename to services/logging/scripts/.gitignore diff --git a/services/graylog/scripts/README.md b/services/logging/scripts/README.md similarity index 100% rename from services/graylog/scripts/README.md rename to services/logging/scripts/README.md diff --git a/services/graylog/scripts/alerts.template.yaml b/services/logging/scripts/alerts.template.yaml similarity index 95% rename from services/graylog/scripts/alerts.template.yaml rename to services/logging/scripts/alerts.template.yaml index fcadc6cf..8568215e 100644 --- a/services/graylog/scripts/alerts.template.yaml +++ b/services/logging/scripts/alerts.template.yaml @@ -3,7 +3,7 @@ priority: 3 config: query: > - container_name: /.*director-v2.*/ AND "could not find an available, non-overlapping IPv4 address pool among the defaults to assign to the 
network" AND NOT container_name:/.*graylog_graylog.*/ + container_name: /.*director-v2.*/ AND "could not find an available, non-overlapping IPv4 address pool among the defaults to assign to the network" AND NOT container_name:/.*logging_graylog.*/ query_parameters: [] search_within_ms: 600000 event_limit: 1000 @@ -43,7 +43,7 @@ priority: 2 config: query: > - "lock is no longer owned. This is unexpected and requires investigation" AND NOT container_name:/.*graylog_graylog.*/ + "lock is no longer owned. This is unexpected and requires investigation" AND NOT container_name:/.*logging_graylog.*/ query_parameters: [] search_within_ms: 3600000 event_limit: 1000 @@ -82,7 +82,7 @@ priority: 2 config: query: > - "LockNotOwnedError" AND NOT container_name:/.*graylog_graylog.*/ + "LockNotOwnedError" AND NOT container_name:/.*logging_graylog.*/ query_parameters: [] search_within_ms: 3600000 event_limit: 1000 diff --git a/services/graylog/scripts/configure.py b/services/logging/scripts/configure.py similarity index 100% rename from services/graylog/scripts/configure.py rename to services/logging/scripts/configure.py diff --git a/services/graylog/scripts/requirements.txt b/services/logging/scripts/requirements.txt similarity index 100% rename from services/graylog/scripts/requirements.txt rename to services/logging/scripts/requirements.txt diff --git a/services/graylog/template.env b/services/logging/template.env similarity index 72% rename from services/graylog/template.env rename to services/logging/template.env index 6d8bd056..d660de18 100644 --- a/services/graylog/template.env +++ b/services/logging/template.env @@ -15,3 +15,11 @@ GRAYLOG_WAIT_ONLINE_TIMEOUT_SEC=${GRAYLOG_WAIT_ONLINE_TIMEOUT_SEC} GRAYLOG_LOG_MAX_DAYS_IN_STORAGE=${GRAYLOG_LOG_MAX_DAYS_IN_STORAGE} GRAYLOG_LOG_MIN_DAYS_IN_STORAGE=${GRAYLOG_LOG_MIN_DAYS_IN_STORAGE} PUBLIC_NETWORK=${PUBLIC_NETWORK} +MONITORED_NETWORK=${MONITORED_NETWORK} +LOKI_RETENTION_PERIOD=${LOKI_RETENTION_PERIOD}
+S3_ENDPOINT_LOKI=${S3_ENDPOINT_LOKI} +S3_ACCESS_KEY_LOKI=${S3_ACCESS_KEY_LOKI} +S3_BUCKET_NAME_LOKI=${S3_BUCKET_NAME_LOKI} +S3_FORCE_PATH_STYLE_LOKI=${S3_FORCE_PATH_STYLE_LOKI} +S3_REGION_LOKI=${S3_REGION_LOKI} +S3_SECRET_KEY_LOKI=${S3_SECRET_KEY_LOKI} diff --git a/services/monitoring/grafana/terraform/datasources.tf b/services/monitoring/grafana/terraform/datasources.tf index c5fcefc6..ee299f6a 100644 --- a/services/monitoring/grafana/terraform/datasources.tf +++ b/services/monitoring/grafana/terraform/datasources.tf @@ -35,6 +35,13 @@ resource "grafana_data_source" "tempo" { uid = "delr011tpeupsc" } +resource "grafana_data_source" "loki" { + type = "loki" + name = "loki" + url = "http://loki:3100" + basic_auth_enabled = false + is_default = false +} resource "grafana_data_source" "cloudwatch" { # This resource is only created if the AWS Deployments count = var.IS_AWS_DEPLOYMENT ? 1 : 0