diff --git a/scripts/common.Makefile b/scripts/common.Makefile index a5e36327..294dfc3c 100644 --- a/scripts/common.Makefile +++ b/scripts/common.Makefile @@ -274,7 +274,7 @@ $(REPO_BASE_DIR)/.venv/bin/activate: $(REPO_BASE_DIR)/.venv/bin/pip3 install --upgrade pip wheel setuptools $(REPO_BASE_DIR)/.venv/bin/pip3 install jinja2 j2cli[yaml] typer @echo "To activate the venv, execute 'source $(REPO_BASE_DIR)/.venv/bin/activate'" -PHONY: .venv +.PHONY: .venv .venv: $(REPO_BASE_DIR)/.venv/bin/activate ## Creates a python virtual environment with dev tools (pip, pylint, ...) .PHONY: venv venv: $(REPO_BASE_DIR)/.venv/bin/activate ## Creates a python virtual environment with dev tools (pip, pylint, ...) diff --git a/services/graylog/data/contentpacks/osparc-custom-content-pack-v2.json b/services/graylog/data/contentpacks/osparc-custom-content-pack-v2.json index 3b5627f2..ad948470 100644 --- a/services/graylog/data/contentpacks/osparc-custom-content-pack-v2.json +++ b/services/graylog/data/contentpacks/osparc-custom-content-pack-v2.json @@ -1,7 +1,7 @@ { "v": "1", "id": "dfaeea11-bde6-4203-9cfe-6ca2a23ca22e", - "rev": 42, + "rev": 43, "name": "osparc-custom-content-pack-v2", "summary": "osparc-custom-content-pack-v2", "description": "", diff --git a/services/graylog/scripts/alerts.template.yaml b/services/graylog/scripts/alerts.template.yaml index 992375ca..9338b57e 100644 --- a/services/graylog/scripts/alerts.template.yaml +++ b/services/graylog/scripts/alerts.template.yaml @@ -1,237 +1,3 @@ -- title: "${MACHINE_FQDN}: Interactive Study Data too large for AWS S3 and hanging" - description: "${MACHINE_FQDN}: Study Hanging" - priority: 2 - config: - query: > - "EntityTooLarge" AND NOT container_name:/.*graylog_graylog.*/ - query_parameters: [] - search_within_ms: 600000 - execute_every_ms: 600000 - group_by: [] - series: [] - conditions: {} - type: aggregation-v1 - field_spec: - source: - data_type: string - providers: - - type: template-v1 - template: "${source.source}" - 
require_values: false - container_name: - data_type: string - providers: - - type: template-v1 - template: "${source.container_name}" - require_values: false - full_message: - data_type: string - providers: - - type: template-v1 - template: "${source.full_message}" - key_spec: - - source - - container_name - - full_message - notification_settings: - grace_period_ms: 0 - backlog_size: 99 - alert: true -- title: "${MACHINE_FQDN}: Writer Is None Error in Webserver" - description: "${MACHINE_FQDN}: Alert if \"writer is None\" pops up. Communication with rabbitMQ is disrupted and this will make simcore go crazy" - priority: 2 - config: - query: > - "writer is None" AND NOT container_name:/.*graylog_graylog.*/ - query_parameters: [] - search_within_ms: 600000 - execute_every_ms: 600000 - group_by: [] - series: [] - conditions: {} - type: aggregation-v1 - field_spec: - source: - data_type: string - providers: - - type: template-v1 - template: "${source.source}" - require_values: false - container_name: - data_type: string - providers: - - type: template-v1 - template: "${source.container_name}" - require_values: false - full_message: - data_type: string - providers: - - type: template-v1 - template: "${source.full_message}" - key_spec: - - source - - container_name - - full_message - notification_settings: - grace_period_ms: 0 - backlog_size: 99 - alert: true -- title: "${MACHINE_FQDN}: Dynamic Sidecar failed to save with S3TransferError" - description: "${MACHINE_FQDN}: Alert if Dynamic Sidecar failed to save with S3TransferError" - priority: 2 - config: - query: > - "simcore_sdk.node_ports_common.exceptions.S3TransferError: Could not upload file" AND NOT container_name:/.*graylog_graylog.*/ - query_parameters: [] - search_within_ms: 600000 - execute_every_ms: 600000 - group_by: [] - series: [] - conditions: {} - type: aggregation-v1 - field_spec: - source: - data_type: string - providers: - - type: template-v1 - template: "${source.source}" - require_values: false - 
container_name: - data_type: string - providers: - - type: template-v1 - template: "${source.container_name}" - require_values: false - full_message: - data_type: string - providers: - - type: template-v1 - template: "${source.full_message}" - key_spec: - - source - - container_name - - full_message - notification_settings: - grace_period_ms: 0 - backlog_size: 99 - alert: true -- title: "${MACHINE_FQDN}: Dynamic Sidecar failed to save - 2" - description: "${MACHINE_FQDN}: Alert if Dynamic Sidecar failed to save - 2" - priority: 2 - config: - query: > - "Could not contact dynamic-sidecar to save service" AND NOT container_name:/.*graylog_graylog.*/ - query_parameters: [] - search_within_ms: 60000 - execute_every_ms: 60000 - group_by: [] - series: [] - conditions: {} - type: aggregation-v1 - field_spec: - source: - data_type: string - providers: - - type: template-v1 - template: "${source.source}" - require_values: false - container_name: - data_type: string - providers: - - type: template-v1 - template: "${source.container_name}" - require_values: false - full_message: - data_type: string - providers: - - type: template-v1 - template: "${source.full_message}" - key_spec: - - source - - container_name - - full_message - notification_settings: - grace_period_ms: 0 - backlog_size: 99 - alert: true -- title: "${MACHINE_FQDN}: simcore-agent failed pushing docker volume data to backup S3 bucket" - description: "${MACHINE_FQDN}: simcore-agent failed pushing docker volume data to backup S3 bucket" - priority: 2 - config: - query: > - container_name: /.*agent.*/ AND "Shell subprocesses yielded nonzero error code" AND NOT container_name:/.*graylog_graylog.*/ - query_parameters: [] - search_within_ms: 600000 - execute_every_ms: 600000 - group_by: [] - series: [] - conditions: {} - type: aggregation-v1 - field_spec: - source: - data_type: string - providers: - - type: template-v1 - template: "${source.source}" - require_values: false - container_name: - data_type: string - 
providers: - - type: template-v1 - template: "${source.container_name}" - require_values: false - full_message: - data_type: string - providers: - - type: template-v1 - template: "${source.full_message}" - key_spec: - - source - - container_name - - full_message - notification_settings: - grace_period_ms: 0 - backlog_size: 99 - alert: true -- title: "${MACHINE_FQDN}: faulty env-var setup" - description: "${MACHINE_FQDN}: Look e.g. here https://git.speag.com/oSparc/osparc-ops-environments/-/issues/564" - priority: 2 - config: - query: > - "unresolved, defaulting to None" AND NOT container_name:/.*graylog_graylog.*/ - query_parameters: [] - search_within_ms: 600000 - execute_every_ms: 600000 - group_by: [] - series: [] - conditions: {} - type: aggregation-v1 - field_spec: - source: - data_type: string - providers: - - type: template-v1 - template: "${source.source}" - require_values: false - container_name: - data_type: string - providers: - - type: template-v1 - template: "${source.container_name}" - require_values: false - full_message: - data_type: string - providers: - - type: template-v1 - template: "${source.full_message}" - key_spec: - - source - - container_name - - full_message - notification_settings: - grace_period_ms: 0 - backlog_size: 99 - alert: true - title: "${MACHINE_FQDN}: DOCKER IP POOL EXHAUSTED, no service can start" description: "${MACHINE_FQDN}: DOCKER IP POOL EXHAUSTED, no service can start. See: https://github.com/moby/moby/issues/30820" priority: 3 @@ -240,6 +6,7 @@ container_name: /.*director-v2.*/ AND "could not find an available, non-overlapping IPv4 address pool among the defaults to assign to the network" AND NOT container_name:/.*graylog_graylog.*/ query_parameters: [] search_within_ms: 600000 + event_limit: 1 execute_every_ms: 600000 group_by: [] series: [] @@ -271,82 +38,6 @@ grace_period_ms: 0 backlog_size: 99 alert: true -- title: "${MACHINE_FQDN}: Potential hanging dy-sidecar service detected." 
- description: "${MACHINE_FQDN}: Potential hanging dy-sidecar service detected. Human intervention required. Please investigate." - priority: 3 - config: - query: > - "waiting for manual intervention" AND container_name:/.*director-v2.*/ - query_parameters: [] - search_within_ms: 3600000 - execute_every_ms: 3600000 - group_by: [] - series: [] - conditions: {} - type: aggregation-v1 - field_spec: - source: - data_type: string - providers: - - type: template-v1 - template: "${source.source}" - require_values: false - container_name: - data_type: string - providers: - - type: template-v1 - template: "${source.container_name}" - require_values: false - full_message: - data_type: string - providers: - - type: template-v1 - template: "${source.full_message}" - key_spec: - - source - - container_name - notification_settings: - grace_period_ms: 0 - backlog_size: 99 - alert: true -- title: "${MACHINE_FQDN}: Syslogs indicate OOM-Event" - description: "${MACHINE_FQDN}: Likely the oom-killer has reaped a container Please investigate and adjust service limitations." 
- priority: 2 - config: - query: > - "Memory cgroup out of memory:" - query_parameters: [] - search_within_ms: 3600000 - execute_every_ms: 3600000 - group_by: [] - series: [] - conditions: {} - type: aggregation-v1 - field_spec: - source: - data_type: string - providers: - - type: template-v1 - template: "${source.source}" - require_values: false - container_name: - data_type: string - providers: - - type: template-v1 - template: "${source.container_name}" - require_values: false - full_message: - data_type: string - providers: - - type: template-v1 - template: "${source.full_message}" - key_spec: - - source - - container_name - notification_settings: - grace_period_ms: 0 - backlog_size: 99 - alert: true - title: "${MACHINE_FQDN}: Unexpected error with redis lock detected" description: "${MACHINE_FQDN}: This error should only occur in unit tests due to very low timings, maybe something happene here" priority: 2 @@ -355,6 +46,7 @@ "lock is no longer owned. This is unexpected and requires investigation" AND NOT container_name:/.*graylog_graylog.*/ query_parameters: [] search_within_ms: 3600000 + event_limit: 1 execute_every_ms: 3600000 group_by: [] series: [] @@ -393,6 +85,7 @@ "LockNotOwnedError" AND NOT container_name:/.*graylog_graylog.*/ query_parameters: [] search_within_ms: 3600000 + event_limit: 1 execute_every_ms: 3600000 group_by: [] series: [] @@ -432,6 +125,7 @@ query_parameters: [] search_within_ms: 86400000 execute_every_ms: 86400000 + event_limit: 1 group_by: [] series: [] conditions: {} @@ -463,6 +157,7 @@ query: log_service:/.+payments/ AND (log_level:ERROR OR log_level:WARNING) query_parameters: [] search_within_ms: 600000 + event_limit: 1 execute_every_ms: 600000 group_by: [] series: [] diff --git a/services/graylog/scripts/configure.py b/services/graylog/scripts/configure.py index 37a415d4..788e318c 100644 --- a/services/graylog/scripts/configure.py +++ b/services/graylog/scripts/configure.py @@ -1,15 +1,19 @@ +# pylint: disable=invalid-name # 
pylint: disable=logging-fstring-interpolation + import json import logging import os import sys import warnings from time import sleep +from typing import Any, Dict, Optional, Tuple import requests import yaml from environs import Env, EnvError from requests.exceptions import HTTPError +from requests.sessions import Session from tenacity import ( before_log, retry, @@ -21,8 +25,7 @@ from yaml.loader import SafeLoader logging.basicConfig(level="INFO") -logger = logging.getLogger() - +logger: logging.Logger = logging.getLogger() warnings.filterwarnings( "ignore", ".*Adding certificate verification is strongly advised.*", @@ -31,46 +34,58 @@ env = Env() env.read_env("./../.env", recurse=False) -SUPPORTED_GRAYLOG_MAJOR_VERSION = 6 - -GRAYLOG_BASE_DOMAIN = "https://monitoring." + env.str("MACHINE_FQDN") + "/graylog" -GRAYLOG_WAIT_ONLINE_TIMEOUT_SEC = env.int("GRAYLOG_WAIT_ONLINE_TIMEOUT_SEC") -REQUESTS_AUTH = (env.str("SERVICES_USER"), env.str("SERVICES_PASSWORD")) +SUPPORTED_GRAYLOG_MAJOR_VERSION: int = 6 +MACHINE_FQDN: str = env.str("MACHINE_FQDN") +GRAYLOG_BASE_DOMAIN: str = f"https://monitoring.{MACHINE_FQDN}/graylog" +GRAYLOG_WAIT_ONLINE_TIMEOUT_SEC: int = env.int("GRAYLOG_WAIT_ONLINE_TIMEOUT_SEC", 30) +REQUESTS_AUTH: Tuple[str, str] = ( + env.str("SERVICES_USER"), + env.str("SERVICES_PASSWORD"), +) +GRAYLOG_SYSLOG_CAPTURE_PORT: int = env.int("GRAYLOG_SYSLOG_CAPTURE_PORT") +GRAYLOG_LOG_MAX_DAYS_IN_STORAGE: int = env.int("GRAYLOG_LOG_MAX_DAYS_IN_STORAGE") +GRAYLOG_LOG_MIN_DAYS_IN_STORAGE: int = env.int("GRAYLOG_LOG_MIN_DAYS_IN_STORAGE") +GRAYLOG_SLACK_WEBHOOK_URL: str = env.str("GRAYLOG_SLACK_WEBHOOK_URL") +GRAYLOG_ALERT_MAIL_ADDRESS: str = env.str("GRAYLOG_ALERT_MAIL_ADDRESS") +GRAYLOG_SLACK_WEBHOOK_ICON_URL: str = env.str("GRAYLOG_SLACK_WEBHOOK_ICON_URL") +GRAYLOG_SLACK_WEBHOOK_CHANNEL: str = env.str("GRAYLOG_SLACK_WEBHOOK_CHANNEL") -GRAYLOG_LOG_MAX_DAYS_IN_STORAGE = env.int("GRAYLOG_LOG_MAX_DAYS_IN_STORAGE") -GRAYLOG_LOG_MIN_DAYS_IN_STORAGE = 
env.int("GRAYLOG_LOG_MIN_DAYS_IN_STORAGE") +assert MACHINE_FQDN +assert REQUESTS_AUTH +assert GRAYLOG_SYSLOG_CAPTURE_PORT +assert GRAYLOG_LOG_MAX_DAYS_IN_STORAGE +assert GRAYLOG_LOG_MIN_DAYS_IN_STORAGE @retry( - stop=stop_after_attempt(GRAYLOG_WAIT_ONLINE_TIMEOUT_SEC / 5), + stop=stop_after_attempt(GRAYLOG_WAIT_ONLINE_TIMEOUT_SEC // 5), wait=wait_fixed(5), retry=retry_if_exception_type(HTTPError), before=before_log(logger, logging.INFO), ) -def wait_graylog_is_online(): - _r = requests.get( +def wait_graylog_is_online() -> None: + _r: requests.Response = requests.get( GRAYLOG_BASE_DOMAIN + "/api/system", auth=REQUESTS_AUTH, verify=False, timeout=10, ) - if _r.status_code == 401: raise TypeError(f"Graylog unauthorized HTTP response: {_r}") - _r.raise_for_status() logger.info("Graylog is online") -def validate_graylog_version_is_supported(): - _r = requests.get( - GRAYLOG_BASE_DOMAIN + "/api/system", auth=REQUESTS_AUTH, verify=False +def validate_graylog_version_is_supported() -> None: + _r: requests.Response = requests.get( + GRAYLOG_BASE_DOMAIN + "/api/system", + auth=REQUESTS_AUTH, + verify=False, + timeout=30, ) _r.raise_for_status() - - graylog_version = _r.json()["version"] - major_version = int(graylog_version.split(".")[0]) - + graylog_version: str = _r.json()["version"] + major_version: int = int(graylog_version.split(".")[0]) if major_version != SUPPORTED_GRAYLOG_MAJOR_VERSION: raise TypeError( f"Graylog major version {major_version} is not supported by this script. 
" @@ -79,14 +94,16 @@ def validate_graylog_version_is_supported(): @retry(stop=stop_after_attempt(5), wait=wait_random(min=1, max=10)) -def get_graylog_inputs(_session, _headers, _url): +def get_graylog_inputs( + _session: Session, _headers: Dict[str, str], _url: str +) -> requests.Response: # We check if graylog has inputs, if not we add a new one - _r = _session.get(_url, headers=_headers, verify=False) + _r: requests.Response = _session.get(_url, headers=_headers, verify=False) # DEBUG if _r.status_code == 200: print("Successfully send GET /api/system/inputs") return _r - error_message = ( + error_message: str = ( "Error while sending GET /api/system/inputs. Status code of the request : " + str(_r.status_code) + " " @@ -96,29 +113,24 @@ def get_graylog_inputs(_session, _headers, _url): raise RuntimeError(error_message) -def configure_email_notifications(_session, _headers): - _url = ( - "https://monitoring." - + env.str("MACHINE_FQDN") - + "/graylog/api/events/notifications" - ) +def configure_email_notifications(_session: requests.Session, _headers: dict) -> str: + _url = GRAYLOG_BASE_DOMAIN + "/api/events/notifications" _r = _session.get(_url, headers=_headers, verify=False) if ( len( [ noti for noti in r.json()["notifications"] - if noti["title"] - == "Graylog " + env.str("MACHINE_FQDN") + " mail notification" + if noti["title"] == "Graylog " + MACHINE_FQDN + " mail notification" ] ) == 0 ): raw_data = ( '{"title":"Graylog ' - + env.str("MACHINE_FQDN") + + MACHINE_FQDN + ' mail notification","description":"","config":{"sender":"","subject":"Graylog event notification: ${event_definition_title}","user_recipients":[],"email_recipients":["' - + env.str("GRAYLOG_ALERT_MAIL_ADDRESS") + + GRAYLOG_ALERT_MAIL_ADDRESS + '"],"type":"email-notification-v1"}}' ) _r = _session.post(_url, headers=_headers, data=raw_data, verify=False) @@ -131,7 +143,7 @@ def configure_email_notifications(_session, _headers): + " " + _r.text ) - sys.exit(1) + sys.exit(os.EX_USAGE) else: 
print("Graylog Mail Notification already present - skipping...") # Keeping notification ID @@ -139,17 +151,96 @@ _mail_notification_id = [ noti for noti in _r.json()["notifications"] - if noti["title"] == "Graylog " + env.str("MACHINE_FQDN") + " mail notification" + if noti["title"] == "Graylog " + MACHINE_FQDN + " mail notification" ][0]["id"] return _mail_notification_id -def configure_log_retention(_session, _headers): +def configure_slack_notification_channel(_session: requests.Session, _hed: dict) -> str: + # Configure sending Slack notifications + if GRAYLOG_SLACK_WEBHOOK_URL != "": + assert GRAYLOG_SLACK_WEBHOOK_CHANNEL + assert GRAYLOG_SLACK_WEBHOOK_ICON_URL + print( + f"Starting Graylog Slack Channel Setup. Assuming:\nGRAYLOG_SLACK_WEBHOOK_URL={GRAYLOG_SLACK_WEBHOOK_URL}\nGRAYLOG_SLACK_WEBHOOK_CHANNEL={GRAYLOG_SLACK_WEBHOOK_CHANNEL}\nGRAYLOG_SLACK_WEBHOOK_ICON_URL={GRAYLOG_SLACK_WEBHOOK_ICON_URL}" + ) + _url = ( + "https://monitoring." 
+ MACHINE_FQDN + "/graylog/api/events/notifications" + ) + _r = _session.get(_url, headers=_hed, verify=False) + if ( + len( + [ + noti + for noti in _r.json()["notifications"] + if noti["title"] + == "Graylog " + MACHINE_FQDN + " Slack notification" + ] + ) + == 0 + ): + raw_data = ( + '{"title":"Graylog ' + + MACHINE_FQDN + + """ Slack notification","description":"Slack notification","config": { + "color": "#FF0000", + "webhook_url": \"""" + + GRAYLOG_SLACK_WEBHOOK_URL + + """\", + "channel": "#""" + + GRAYLOG_SLACK_WEBHOOK_CHANNEL + + """\", + "custom_message":"--- [Event Definition] ---------------------------\\nTitle: ${event_definition_title}\\nType: ${event_definition_type}\\n--- [Event] --------------------------------------\\nTimestamp: ${event.timestamp}\\nMessage: ${event.message}\\nSource: ${event.source}\\nKey: ${event.key}\\nPriority: ${event.priority}\\nAlert: ${event.alert}\\nTimestamp Processing: ${event.timestamp}\\nTimerange Start: ${event.timerange_start}\\nTimerange End: ${event.timerange_end}\\nEvent Fields:\\n${foreach event.fields field}\\n${field.key}: ${field.value}\\n${end}\\n${if backlog}\\n--- [Backlog] ------------------------------------\\nLast messages accounting for this alert:\\n${foreach backlog message}\\n""" + + "https://monitoring." 
+ + MACHINE_FQDN + + "/graylog/messages" + + """/${message.index}/${message.id}\\n${end}${end}\\n", + "user_name": "Graylog", + "notify_channel": true, + "link_names": false, + "icon_url": \"""" + + GRAYLOG_SLACK_WEBHOOK_ICON_URL + + """\", + "icon_emoji": "", + "backlog_size": 5, + "type": "slack-notification-v1"}}""" + ) + _r = _session.post( + _url, headers=_hed, verify=False, data=raw_data.encode("utf-8") + ) + if _r.status_code == 200: + print("Slack Notification added with success !") + # Keeping notification ID + _r = _session.get(_url, headers=_hed, verify=False) + _slack_notification_id = [ + noti + for noti in _r.json()["notifications"] + if noti["title"] + == "Graylog " + MACHINE_FQDN + " Slack notification" + ][0]["id"] + return _slack_notification_id + print( + "Error while adding the Slack Notification. Status code of the request : " + + str(_r.status_code) + + " " + + _r.text + ) + sys.exit(os.EX_USAGE) + print("Graylog Slack Notification already present - skipping...") + _r = _session.get(_url, headers=_hed, verify=False) + _slack_notification_id = [ + noti + for noti in _r.json()["notifications"] + if noti["title"] == "Graylog " + MACHINE_FQDN + " Slack notification" + ][0]["id"] + return _slack_notification_id + return "" + + +def configure_log_retention(_session: requests.Session, _headers: dict) -> None: _url = ( - "https://monitoring." - + env.str("MACHINE_FQDN") - + "/graylog/api/system/indices/index_sets" + "https://monitoring." + MACHINE_FQDN + "/graylog/api/system/indices/index_sets" ) _r = _session.get(_url, headers=_headers, verify=False) index_of_interest = [ @@ -167,9 +258,7 @@ "index_lifetime_max": f"P{GRAYLOG_LOG_MAX_DAYS_IN_STORAGE}D", } _url = ( - "https://monitoring." - + env.str("MACHINE_FQDN") - + "/graylog/api/system/indices/index_sets" + "https://monitoring." 
+ MACHINE_FQDN + "/graylog/api/system/indices/index_sets" ) raw_data = json.dumps(index_of_interest) _r = _session.put( @@ -188,26 +277,22 @@ + r.text ) + +def configure_syslog_capture(_session: requests.Session, _headers: dict) -> None: try: _url = ( - "https://monitoring." - + env.str("MACHINE_FQDN") - + "/graylog/api/system/cluster/nodes" + "https://monitoring." + MACHINE_FQDN + "/graylog/api/system/cluster/nodes" ) _r = _session.get(_url, headers=_headers, verify=False).json() assert len(_r["nodes"]) == 1 node_uuid = _r["nodes"][0]["node_id"] # - _url = ( - "https://monitoring." - + env.str("MACHINE_FQDN") - + "/graylog/api/system/inputs" - ) + _url = "https://monitoring." + MACHINE_FQDN + "/graylog/api/system/inputs" r2 = _session.get(_url, headers=_headers, verify=False).json() if len([i for i in r2["inputs"] if i["title"] == "Syslog"]) == 0: raw_data = ( '{"title":"Syslog","type":"org.graylog2.inputs.syslog.udp.SyslogUDPInput","configuration":{"bind_address":"0.0.0.0","port":' - + env.str("GRAYLOG_SYSLOG_CAPTURE_PORT") + + str(GRAYLOG_SYSLOG_CAPTURE_PORT) + ',"recv_buffer_size":262144,"number_worker_threads":8,"override_source":null,"force_rdns":false,"allow_override_date":true,"store_full_message":true,"expand_structured_data":false},"global":true,"node":"' + node_uuid + '"}' @@ -219,7 +304,7 @@ sleep(0.3) _url = ( "https://monitoring." 
- + env.str("MACHINE_FQDN") + + MACHINE_FQDN + "/graylog/api/system/inputs/" + input_id + "/extractors" @@ -233,11 +318,16 @@ def configure_log_retention(_session, _headers): print("Error setting up graylog syslog capturing.") -def configure_alerts(_mail_notification_id, _session, _headers): +def configure_alerts( + _session: requests.Session, + _headers: dict, + _mail_notification_id: Optional[str] = None, + _slack_notification_id: Optional[str] = None, +) -> None: print("Configuring Graylog Alerts...") with open("alerts.yaml") as f: data = yaml.load(f, Loader=SafeLoader) - _url = "https://monitoring." + env.str("MACHINE_FQDN") + "/graylog/api/streams" + _url = "https://monitoring." + MACHINE_FQDN + "/graylog/api/streams" _r = _session.get(_url, headers=_headers, verify=False) if _r.status_code == 200: streams_list = _r.json()["streams"] @@ -252,12 +342,9 @@ def configure_alerts(_mail_notification_id, _session, _headers): print( "Could not determine ID of stream containing all events. Is graylog misconfigured? Exiting with error!" ) - sys.exit(1) - _url = ( - "https://monitoring." - + env.str("MACHINE_FQDN") - + "/graylog/api/events/definitions" - ) + sys.exit(os.EX_USAGE) + _url = "https://monitoring." 
+ MACHINE_FQDN + "/graylog/api/events/definitions" + # Deleting existing alerts - this ensures idempotency _r = _session.get( _url, headers=_headers, params={"per_page": 2500}, verify=False ) @@ -282,15 +369,17 @@ ) print(resp.status_code) print(resp.json()) - sys.exit(1) + sys.exit(os.EX_USAGE) for i in data: i["notifications"] = [] - if env.str("GRAYLOG_ALERT_MAIL_ADDRESS"): + if GRAYLOG_ALERT_MAIL_ADDRESS != "" and _mail_notification_id: i["notifications"] += [{"notification_id": str(_mail_notification_id)}] + if GRAYLOG_SLACK_WEBHOOK_URL != "" and _slack_notification_id: + i["notifications"] += [{"notification_id": str(_slack_notification_id)}] i["config"]["streams"] = [str(stream_id_for_all_messages)] _url = ( "https://monitoring." - + env.str("MACHINE_FQDN") + + MACHINE_FQDN + "/graylog/api/events/definitions?schedule=true" ) resp = _session.post(_url, headers=_headers, json=i, verify=False) @@ -299,11 +388,13 @@ else: print("Could not add alert. 
Failure:", resp.status_code) print(resp.json()) - sys.exit(1) + sys.exit(os.EX_USAGE) -def configure_content_packs(_session, _headers, base_url): - def get_installation(content_pack): +def configure_content_packs( + _session: Session, _headers: Dict[str, str], base_url: str +) -> None: + def get_installation(content_pack: Dict[str, Any]) -> Optional[Dict[str, Any]]: logger.debug(f"Getting installations for content pack {content_pack['id']}") resp = _session.get( base_url + "/system/content_packs/" + content_pack["id"] + "/installations" @@ -321,7 +412,9 @@ def get_installation(content_pack): return installations[0] if installations else None - def delete_installation(content_pack, installation): + def delete_installation( + content_pack: Dict[str, Any], installation: Dict[str, Any] + ) -> None: logger.debug(f"Deleting installation {installation['_id']}") resp = _session.delete( @@ -338,7 +431,7 @@ def delete_installation(content_pack, installation): f"Error while deleting installation {installation['_id']} for content pack {content_pack['id']}" ) - def create_content_pack_revision(content_pack): + def create_content_pack_revision(content_pack: Dict[str, Any]) -> None: logger.debug( f"Uploading content pack {content_pack['id']} revision {content_pack['rev']}" ) @@ -357,7 +450,7 @@ def create_content_pack_revision(content_pack): f"Unexpected {resp.status_code=} while uploading content pack {content_pack['id']} revision {content_pack['rev']}. 
Error: {resp.text}" ) - def install_content_pack_revision(content_pack): + def install_content_pack_revision(content_pack: Dict[str, Any]) -> None: logger.debug( f"Installing content pack {content_pack['id']} revision {content_pack['rev']}" ) @@ -376,7 +469,7 @@ def install_content_pack_revision(content_pack): for file in os.listdir("../data/contentpacks"): with open(f"../data/contentpacks/{file}") as f: - logger.debug(f"Configuring content pack {f.name}") + logger.debug(f"Configuring content pack {file}") content_pack = json.loads(f.read()) create_content_pack_revision(content_pack) @@ -393,7 +486,7 @@ def install_content_pack_revision(content_pack): "This revision of content pack is already installed. Nothing to do..." ) - logging.info(f"{f.name} content pack has been configured") + logging.info(f"{file} content pack has been configured") if __name__ == "__main__": @@ -402,26 +495,32 @@ def install_content_pack_revision(content_pack): session = requests.Session() session.verify = False - session.auth = ( - env.str("SERVICES_USER"), - env.str("SERVICES_PASSWORD"), - ) # Graylog username is always "admin" + session.auth = REQUESTS_AUTH # Graylog username is always "admin" hed = { "Content-Type": "application/json", "Accept": "application/json", "X-Requested-By": "cli", } - url = "https://monitoring." + env.str("MACHINE_FQDN") + "/graylog/api/system/inputs" + url = "https://monitoring." 
+ MACHINE_FQDN + "/graylog/api/system/inputs" r = get_graylog_inputs(session, hed, url) configure_log_retention(session, hed) - + configure_syslog_capture(session, hed) # Configure sending email notifications - if env.str("GRAYLOG_ALERT_MAIL_ADDRESS") != "": + mail_notification_id = None + slack_notification_id = None + if GRAYLOG_ALERT_MAIL_ADDRESS != "": mail_notification_id = configure_email_notifications(session, hed) + if GRAYLOG_SLACK_WEBHOOK_URL != "": + slack_notification_id = configure_slack_notification_channel(session, hed) + if mail_notification_id or slack_notification_id: # Configure Alerts - configure_alerts(mail_notification_id, session, hed) - # Configure log retention time + configure_alerts( + session, + hed, + _mail_notification_id=mail_notification_id, + _slack_notification_id=slack_notification_id, + ) # content pack will create GELF UDP Input # NOTE: When you introduce changes, revision number increase is mandatory @@ -429,7 +528,5 @@ def install_content_pack_revision(content_pack): # Autoloader is only good at loading content packs first time but not updating / adding new ones to existing. # https://community.graylog.org/t/update-content-packs-using-autoloading-functionality/6205 # https://github.com/Graylog2/graylog2-server/issues/14672 - content_pack_base_url = ( - "https://monitoring." + env.str("MACHINE_FQDN") + "/graylog/api" - ) + content_pack_base_url = "https://monitoring." 
+ MACHINE_FQDN + "/graylog/api" configure_content_packs(session, hed, content_pack_base_url) diff --git a/services/graylog/template.env b/services/graylog/template.env index 01bcf8e6..121f3e97 100644 --- a/services/graylog/template.env +++ b/services/graylog/template.env @@ -1,29 +1,16 @@ GRAYLOG_PASSWORD_SECRET=${GRAYLOG_PASSWORD_SECRET} GRAYLOG_ROOT_PASSWORD_SHA2=${GRAYLOG_ROOT_PASSWORD_SHA2} -GRAYLOG_HTTP_EXTERNAL_URI=https://${MONITORING_DOMAIN}/graylog/ MONITORING_DOMAIN=${MONITORING_DOMAIN} +GRAYLOG_HTTP_EXTERNAL_URI=https://${MONITORING_DOMAIN}/graylog/ GRAYLOG_ALERT_MAIL_ADDRESS=${GRAYLOG_ALERT_MAIL_ADDRESS} -GRAYLOG_TRANSPORT_EMAIL_ENABLED=${GRAYLOG_TRANSPORT_EMAIL_ENABLED} -GRAYLOG_TRANSPORT_EMAIL_HOSTNAME=${SMTP_HOST} -GRAYLOG_TRANSPORT_EMAIL_PORT=${SMTP_PORT} -GRAYLOG_TRANSPORT_EMAIL_USE_AUTH=${GRAYLOG_TRANSPORT_EMAIL_USE_AUTH} -GRAYLOG_TRANSPORT_EMAIL_USE_TLS=${GRAYLOG_TRANSPORT_EMAIL_USE_TLS} -GRAYLOG_TRANSPORT_EMAIL_USE_SSL=${GRAYLOG_TRANSPORT_EMAIL_USE_SSL} -GRAYLOG_TRANSPORT_EMAIL_AUTH_USERNAME=${SMTP_USERNAME} -GRAYLOG_TRANSPORT_EMAIL_AUTH_PASSWORD=${SMTP_PASSWORD} -GRAYLOG_TRANSPORT_EMAIL_SUBJECT_PREFIX=[graylog-${CLUSTER_NAME}] -GRAYLOG_TRANSPORT_EMAIL_FROM_EMAIL=${SMTP_USERNAME} GRAYLOG_SYSLOG_CAPTURE_PORT=${GRAYLOG_SYSLOG_CAPTURE_PORT} SERVICES_PASSWORD=${SERVICES_PASSWORD} SERVICES_USER=${SERVICES_USER} - MACHINE_FQDN=${MACHINE_FQDN} GRAYLOG_SLACK_WEBHOOK_URL=${GRAYLOG_SLACK_WEBHOOK_URL} GRAYLOG_SLACK_WEBHOOK_ICON_URL=${GRAYLOG_SLACK_WEBHOOK_ICON_URL} GRAYLOG_SLACK_WEBHOOK_CHANNEL=${GRAYLOG_SLACK_WEBHOOK_CHANNEL} - GRAYLOG_WAIT_ONLINE_TIMEOUT_SEC=${GRAYLOG_WAIT_ONLINE_TIMEOUT_SEC} GRAYLOG_LOG_MAX_DAYS_IN_STORAGE=${GRAYLOG_LOG_MAX_DAYS_IN_STORAGE} GRAYLOG_LOG_MIN_DAYS_IN_STORAGE=${GRAYLOG_LOG_MIN_DAYS_IN_STORAGE} - PUBLIC_NETWORK=${PUBLIC_NETWORK}