Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
f0d8cf0
wip
mrnicegyu11 Sep 19, 2024
e906b41
Merge remote-tracking branch 'upstream/main' into main
mrnicegyu11 Oct 23, 2024
14c751d
Merge remote-tracking branch 'upstream/main' into main
mrnicegyu11 Oct 23, 2024
293f63c
Add csi-s3 and have portainer use it
mrnicegyu11 Oct 24, 2024
f7f72ec
Change request @hrytsuk 1GB max portainer volume size
mrnicegyu11 Oct 25, 2024
94cfb76
t push
mrnicegyu11 Oct 28, 2024
509c717
Merge remote-tracking branch 'upstream/main'
mrnicegyu11 Oct 29, 2024
1a65ecf
Merge remote-tracking branch 'upstream/main'
mrnicegyu11 Nov 13, 2024
77ee45e
Merge remote-tracking branch 'upstream/main'
mrnicegyu11 Nov 25, 2024
c9c70d6
Arch Linux Certificates Customization
mrnicegyu11 Dec 3, 2024
7b8be53
Merge remote-tracking branch 'upstream/main'
mrnicegyu11 Dec 5, 2024
bcd61cd
Merge remote-tracking branch 'upstream/main'
mrnicegyu11 Dec 12, 2024
58e1030
Merge remote-tracking branch 'upstream/main'
mrnicegyu11 Dec 13, 2024
ed8d479
Merge remote-tracking branch 'upstream/main'
mrnicegyu11 Jan 10, 2025
dda6e01
Merge remote-tracking branch 'upstream/main'
mrnicegyu11 Feb 4, 2025
f6f4f36
Merge remote-tracking branch 'upstream/main'
mrnicegyu11 Feb 25, 2025
5dca5c3
Merge remote-tracking branch 'upstream/main'
mrnicegyu11 Mar 13, 2025
4a653ef
Merge remote-tracking branch 'upstream/main'
mrnicegyu11 Mar 20, 2025
3a21f0f
Merge remote-tracking branch 'upstream/main'
mrnicegyu11 Mar 28, 2025
94b996a
Fixes https://github.com/ITISFoundation/osparc-simcore/issues/7363
mrnicegyu11 Mar 28, 2025
018b708
fixes
mrnicegyu11 Mar 31, 2025
7e19a88
Make graylog alerts work again
mrnicegyu11 Mar 31, 2025
bff3820
Merge branch 'main' into 2025/add/graylogAlerts
mrnicegyu11 Mar 31, 2025
18385f7
Change request @YuryHrytsuk bump content pack rev
mrnicegyu11 Apr 3, 2025
23fb61a
Merge branch 'main' into 2025/add/graylogAlerts
mrnicegyu11 Apr 3, 2025
fec5e6d
graylog/configure.py add type annotations
mrnicegyu11 Apr 3, 2025
de6cc1c
Fix minor makefile bug
mrnicegyu11 Apr 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scripts/common.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ $(REPO_BASE_DIR)/.venv/bin/activate:
$(REPO_BASE_DIR)/.venv/bin/pip3 install --upgrade pip wheel setuptools
$(REPO_BASE_DIR)/.venv/bin/pip3 install jinja2 j2cli[yaml] typer
@echo "To activate the venv, execute 'source $(REPO_BASE_DIR)/.venv/bin/activate'"
PHONY: .venv
.PHONY: .venv
.venv: $(REPO_BASE_DIR)/.venv/bin/activate ## Creates a python virtual environment with dev tools (pip, pylint, ...)
.PHONY: venv
venv: $(REPO_BASE_DIR)/.venv/bin/activate ## Creates a python virtual environment with dev tools (pip, pylint, ...)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"v": "1",
"id": "dfaeea11-bde6-4203-9cfe-6ca2a23ca22e",
"rev": 42,
"rev": 43,
"name": "osparc-custom-content-pack-v2",
"summary": "osparc-custom-content-pack-v2",
"description": "",
Expand Down
315 changes: 5 additions & 310 deletions services/graylog/scripts/alerts.template.yaml
Original file line number Diff line number Diff line change
@@ -1,237 +1,3 @@
- title: "${MACHINE_FQDN}: Interactive Study Data too large for AWS S3 and hanging"
description: "${MACHINE_FQDN}: Study Hanging"
priority: 2
config:
query: >
"EntityTooLarge" AND NOT container_name:/.*graylog_graylog.*/
query_parameters: []
search_within_ms: 600000
execute_every_ms: 600000
group_by: []
series: []
conditions: {}
type: aggregation-v1
field_spec:
source:
data_type: string
providers:
- type: template-v1
template: "${source.source}"
require_values: false
container_name:
data_type: string
providers:
- type: template-v1
template: "${source.container_name}"
require_values: false
full_message:
data_type: string
providers:
- type: template-v1
template: "${source.full_message}"
key_spec:
- source
- container_name
- full_message
notification_settings:
grace_period_ms: 0
backlog_size: 99
alert: true
- title: "${MACHINE_FQDN}: Writer Is None Error in Webserver"
description: "${MACHINE_FQDN}: Alert if \"writer is None\" pops up. Communication with rabbitMQ is disrupted and this will make simcore go crazy"
priority: 2
config:
query: >
"writer is None" AND NOT container_name:/.*graylog_graylog.*/
query_parameters: []
search_within_ms: 600000
execute_every_ms: 600000
group_by: []
series: []
conditions: {}
type: aggregation-v1
field_spec:
source:
data_type: string
providers:
- type: template-v1
template: "${source.source}"
require_values: false
container_name:
data_type: string
providers:
- type: template-v1
template: "${source.container_name}"
require_values: false
full_message:
data_type: string
providers:
- type: template-v1
template: "${source.full_message}"
key_spec:
- source
- container_name
- full_message
notification_settings:
grace_period_ms: 0
backlog_size: 99
alert: true
- title: "${MACHINE_FQDN}: Dynamic Sidecar failed to save with S3TransferError"
description: "${MACHINE_FQDN}: Alert if Dynamic Sidecar failed to save with S3TransferError"
priority: 2
config:
query: >
"simcore_sdk.node_ports_common.exceptions.S3TransferError: Could not upload file" AND NOT container_name:/.*graylog_graylog.*/
query_parameters: []
search_within_ms: 600000
execute_every_ms: 600000
group_by: []
series: []
conditions: {}
type: aggregation-v1
field_spec:
source:
data_type: string
providers:
- type: template-v1
template: "${source.source}"
require_values: false
container_name:
data_type: string
providers:
- type: template-v1
template: "${source.container_name}"
require_values: false
full_message:
data_type: string
providers:
- type: template-v1
template: "${source.full_message}"
key_spec:
- source
- container_name
- full_message
notification_settings:
grace_period_ms: 0
backlog_size: 99
alert: true
- title: "${MACHINE_FQDN}: Dynamic Sidecar failed to save - 2"
description: "${MACHINE_FQDN}: Alert if Dynamic Sidecar failed to save - 2"
priority: 2
config:
query: >
"Could not contact dynamic-sidecar to save service" AND NOT container_name:/.*graylog_graylog.*/
query_parameters: []
search_within_ms: 60000
execute_every_ms: 60000
group_by: []
series: []
conditions: {}
type: aggregation-v1
field_spec:
source:
data_type: string
providers:
- type: template-v1
template: "${source.source}"
require_values: false
container_name:
data_type: string
providers:
- type: template-v1
template: "${source.container_name}"
require_values: false
full_message:
data_type: string
providers:
- type: template-v1
template: "${source.full_message}"
key_spec:
- source
- container_name
- full_message
notification_settings:
grace_period_ms: 0
backlog_size: 99
alert: true
- title: "${MACHINE_FQDN}: simcore-agent failed pushing docker volume data to backup S3 bucket"
description: "${MACHINE_FQDN}: simcore-agent failed pushing docker volume data to backup S3 bucket"
priority: 2
config:
query: >
container_name: /.*agent.*/ AND "Shell subprocesses yielded nonzero error code" AND NOT container_name:/.*graylog_graylog.*/
query_parameters: []
search_within_ms: 600000
execute_every_ms: 600000
group_by: []
series: []
conditions: {}
type: aggregation-v1
field_spec:
source:
data_type: string
providers:
- type: template-v1
template: "${source.source}"
require_values: false
container_name:
data_type: string
providers:
- type: template-v1
template: "${source.container_name}"
require_values: false
full_message:
data_type: string
providers:
- type: template-v1
template: "${source.full_message}"
key_spec:
- source
- container_name
- full_message
notification_settings:
grace_period_ms: 0
backlog_size: 99
alert: true
- title: "${MACHINE_FQDN}: faulty env-var setup"
description: "${MACHINE_FQDN}: Look e.g. here https://git.speag.com/oSparc/osparc-ops-environments/-/issues/564"
priority: 2
config:
query: >
"unresolved, defaulting to None" AND NOT container_name:/.*graylog_graylog.*/
query_parameters: []
search_within_ms: 600000
execute_every_ms: 600000
group_by: []
series: []
conditions: {}
type: aggregation-v1
field_spec:
source:
data_type: string
providers:
- type: template-v1
template: "${source.source}"
require_values: false
container_name:
data_type: string
providers:
- type: template-v1
template: "${source.container_name}"
require_values: false
full_message:
data_type: string
providers:
- type: template-v1
template: "${source.full_message}"
key_spec:
- source
- container_name
- full_message
notification_settings:
grace_period_ms: 0
backlog_size: 99
alert: true
- title: "${MACHINE_FQDN}: DOCKER IP POOL EXHAUSTED, no service can start"
description: "${MACHINE_FQDN}: DOCKER IP POOL EXHAUSTED, no service can start. See: https://github.com/moby/moby/issues/30820"
priority: 3
Expand All @@ -240,6 +6,7 @@
container_name: /.*director-v2.*/ AND "could not find an available, non-overlapping IPv4 address pool among the defaults to assign to the network" AND NOT container_name:/.*graylog_graylog.*/
query_parameters: []
search_within_ms: 600000
event_limit: 1
execute_every_ms: 600000
group_by: []
series: []
Expand Down Expand Up @@ -271,82 +38,6 @@
grace_period_ms: 0
backlog_size: 99
alert: true
- title: "${MACHINE_FQDN}: Potential hanging dy-sidecar service detected."
description: "${MACHINE_FQDN}: Potential hanging dy-sidecar service detected. Human intervention required. Please investigate."
priority: 3
config:
query: >
"waiting for manual intervention" AND container_name:/.*director-v2.*/
query_parameters: []
search_within_ms: 3600000
execute_every_ms: 3600000
group_by: []
series: []
conditions: {}
type: aggregation-v1
field_spec:
source:
data_type: string
providers:
- type: template-v1
template: "${source.source}"
require_values: false
container_name:
data_type: string
providers:
- type: template-v1
template: "${source.container_name}"
require_values: false
full_message:
data_type: string
providers:
- type: template-v1
template: "${source.full_message}"
key_spec:
- source
- container_name
notification_settings:
grace_period_ms: 0
backlog_size: 99
alert: true
- title: "${MACHINE_FQDN}: Syslogs indicate OOM-Event"
description: "${MACHINE_FQDN}: Likely the oom-killer has reaped a container Please investigate and adjust service limitations."
priority: 2
config:
query: >
"Memory cgroup out of memory:"
query_parameters: []
search_within_ms: 3600000
execute_every_ms: 3600000
group_by: []
series: []
conditions: {}
type: aggregation-v1
field_spec:
source:
data_type: string
providers:
- type: template-v1
template: "${source.source}"
require_values: false
container_name:
data_type: string
providers:
- type: template-v1
template: "${source.container_name}"
require_values: false
full_message:
data_type: string
providers:
- type: template-v1
template: "${source.full_message}"
key_spec:
- source
- container_name
notification_settings:
grace_period_ms: 0
backlog_size: 99
alert: true
- title: "${MACHINE_FQDN}: Unexpected error with redis lock detected"
description: "${MACHINE_FQDN}: This error should only occur in unit tests due to very low timings, maybe something happene here"
priority: 2
Expand All @@ -355,6 +46,7 @@
"lock is no longer owned. This is unexpected and requires investigation" AND NOT container_name:/.*graylog_graylog.*/
query_parameters: []
search_within_ms: 3600000
event_limit: 1
execute_every_ms: 3600000
group_by: []
series: []
Expand Down Expand Up @@ -393,6 +85,7 @@
"LockNotOwnedError" AND NOT container_name:/.*graylog_graylog.*/
query_parameters: []
search_within_ms: 3600000
event_limit: 1
execute_every_ms: 3600000
group_by: []
series: []
Expand Down Expand Up @@ -432,6 +125,7 @@
query_parameters: []
search_within_ms: 86400000
execute_every_ms: 86400000
event_limit: 1
group_by: []
series: []
conditions: {}
Expand Down Expand Up @@ -463,6 +157,7 @@
query: log_service:/.+payments/ AND (log_level:ERROR OR log_level:WARNING)
query_parameters: []
search_within_ms: 600000
event_limit: 1
execute_every_ms: 600000
group_by: []
series: []
Expand Down
Loading
Loading