|
3 | 3 | "alerts": { |
4 | 4 | "dataset_metrics_flink": [ |
5 | 5 | { |
6 | | - "metric": "sum(sum_over_time(flink_taskmanager_job_task_operator_ExtractorJob_dataset_id_extractor_failed_count[5m]))", |
7 | | - "alias": "Number of Failed Extraction Events", |
8 | | - "description": "This alert tracks how many events failed the extraction stage", |
| 6 | + "metric": "sum(sum_over_time(flink_taskmanager_job_task_operator_ExtractorJob_dataset_id_extractor_failed_count[5m])) + sum(sum_over_time(flink_taskmanager_job_task_operator_ExtractorJob_dataset_id_extractor_duplicate_count[5m])) + sum(sum_over_time(flink_taskmanager_job_task_operator_PipelinePreprocessorJob_dataset_id_validator_failed_count[5m]))", |
| 7 | + "alias": "[DATASET]: Detected higher rate of invalid data than expected", |
| 8 | + "category": "Processing", |
| 9 | + "severity": "critical", |
| 10 | + "code": "ALERT_1203", |
| 11 | + "description": "The dataset is unhealthy, and the query results may be incorrect", |
| 12 | + "summary": "Invalid data has been ingested in the system, preventing it from being processed. Henceforth, queries on this dataset may not return accurate data.", |
9 | 13 | "frequency": "5m", |
10 | 14 | "interval": "5m", |
11 | 15 | "operator": "gt", |
12 | 16 | "threshold": 0 |
13 | 17 | }, |
14 | 18 | { |
15 | | - "metric": "sum(sum_over_time(flink_taskmanager_job_task_operator_ExtractorJob_dataset_id_extractor_duplicate_count[5m]))", |
16 | | - "alias": "Number of Duplicate Extraction Events", |
17 | | - "description": "This alert tracks how many duplicate events were found during extraction stage", |
| 19 | + "metric": "sum(sum_over_time(flink_taskmanager_job_task_operator_PipelinePreprocessorJob_dataset_id_dedup_failed_count[5m]))", |
| 20 | + "alias": "[DATASET]: Detected higher rate of duplicate data than expected", |
| 21 | + "category": "Processing", |
| 22 | + "severity": "warning", |
| 23 | + "code": "ALERT_1204", |
| 24 | + "description": "The dataset is unhealthy, and the query results may be incorrect", |
| 25 | + "summary": "Duplicate data has been ingested in the system, preventing it from being processed. Henceforth, queries on this dataset may not return accurate data.", |
18 | 26 | "frequency": "5m", |
19 | 27 | "interval": "5m", |
20 | 28 | "operator": "gt", |
21 | 29 | "threshold": 0 |
22 | 30 | }, |
23 | 31 | { |
24 | | - "metric": "sum(sum_over_time(flink_taskmanager_job_task_operator_PipelinePreprocessorJob_dataset_id_dedup_failed_count[5m]))", |
25 | | - "alias": "Number of Duplicate Preprocessing Events", |
26 | | - "description": "This alert tracks how many duplicate events were found during preprocessing stage", |
| 32 | + "metric": "sum(sum_over_time(flink_taskmanager_job_task_operator_DenormalizerJob_dataset_id_denorm_failed[5m])) + sum(sum_over_time(flink_taskmanager_job_task_operator_DenormalizerJob_dataset_id_denorm_partial_success[5m]))", |
| 33 | + "alias": "[DATASET]: Detected higher incidence of failures during data enrichment.", |
| 34 | + "category": "Processing", |
| 35 | + "severity": "warning", |
| 36 | + "code": "ALERT_1205", |
| 37 | + "description": "The dataset is unhealthy, and the query results may be incorrect", |
| 38 | + "summary": "The data ingested into the system is failing the enrichment process, which may cause queries on this dataset to return inaccurate data.", |
27 | 39 | "frequency": "5m", |
28 | 40 | "interval": "5m", |
29 | 41 | "operator": "gt", |
30 | 42 | "threshold": 0 |
31 | 43 | }, |
32 | 44 | { |
33 | | - "metric": "sum(sum_over_time(flink_taskmanager_job_task_operator_PipelinePreprocessorJob_dataset_id_validator_failed_count[5m]))", |
34 | | - "alias": "Number of Failed Validation Events", |
35 | | - "description": "This alert tracks how many events failed the validation stage", |
| 45 | + "metric": "sum(sum_over_time(flink_taskmanager_job_task_operator_TransformerJob_dataset_id_transform_failed_count[5m])) + sum(sum_over_time(flink_taskmanager_job_task_operator_TransformerJob_dataset_id_transform_partial_count[5m]))", |
| 46 | + "alias": "[DATASET]: Detected higher incidence of failures during data transformations.", |
| 47 | + "category": "Processing", |
| 48 | + "severity": "warning", |
| 49 | + "code": "ALERT_1206", |
| 50 | + "description": "The dataset is unhealthy, and the query results may be incorrect", |
| 51 | + "summary": "The data ingested into the system is failing the data transformation process, which may cause queries on this dataset to return inaccurate data.", |
36 | 52 | "frequency": "5m", |
37 | 53 | "interval": "5m", |
38 | 54 | "operator": "gt", |
39 | 55 | "threshold": 0 |
40 | 56 | }, |
41 | 57 | { |
42 | | - "metric": "sum(sum_over_time(flink_taskmanager_job_task_operator_DenormalizerJob_dataset_id_denorm_failed[5m])) + sum(sum_over_time(flink_taskmanager_job_task_operator_DenormalizerJob_dataset_id_denorm_partial_success[5m]))", |
43 | | - "alias": "Number of Failed Denorm Events", |
44 | | - "description": "This alert tracks how many events failed the denorm stage", |
| 58 | + "metric": "sum(sum_over_time(flink_taskmanager_job_task_operator_PipelinePreprocessorJob_dataset_id_validator_total_count[$__range]))", |
| 59 | + "alias": "[DATASET]: No data has been received for the past hour.", |
| 60 | + "category": "Processing", |
| 61 | + "severity": "warning", |
| 62 | + "code": "ALERT_1209", |
| 63 | + "description": "The dataset hasn’t received any new data for the past hour, which may affect the querying of the new data.", |
| 64 | + "summary": "The dataset has not received any new data, which will impact real-time data processing", |
| 65 | + "frequency": "5m", |
| 66 | + "interval": "60m", |
| 67 | + "operator": "lt", |
| 68 | + "threshold": 1 |
| 69 | + } |
| 70 | + ], |
| 71 | + "dataset_metrics_druid": [ |
| 72 | + { |
| 73 | + "metric": "max(druid_supervisors{supervisor_name=\"dataset_id\", state=\"RUNNING\"} or (0 * absent(druid_supervisors{supervisor_name=\"dataset_id\", state=\"RUNNING\"})))", |
| 74 | + "alias": "[DATASET]: Druid supervisor is in an unhealthy state", |
| 75 | + "category": "Querying", |
| 76 | + "severity": "critical", |
| 77 | + "code": "ALERT_1309", |
| 78 | + "description": "The dataset is unhealthy, and no new data has been available for querying since the system encountered the issue.", |
| 79 | + "summary": "The associated Druid Supervisor is in an unhealthy state, preventing druid ingestion tasks from running. As a result, real-time data cannot be queried.", |
| 80 | + "frequency": "5m", |
| 81 | + "interval": "5m", |
| 82 | + "operator": "lt", |
| 83 | + "threshold": 1 |
| 84 | + }, |
| 85 | + { |
| 86 | + "metric": "druid_ingest_events_unparseable_total{dataSource=\"dataset_id\"}", |
| 87 | + "alias": "[DATASET]: Detected higher amount of unparseable data.", |
| 88 | + "flattened": true, |
| 89 | + "category": "Querying", |
| 90 | + "severity": "critical", |
| 91 | + "code": "ALERT_1308", |
| 92 | + "description": "The dataset is unhealthy, and the query results may be incorrect", |
| 93 | + "summary": "Unparseable data has been detected in the system, preventing it from being processed. Henceforth, queries on this dataset may not return accurate data until the issue is resolved.", |
45 | 94 | "frequency": "5m", |
46 | 95 | "interval": "5m", |
47 | 96 | "operator": "gt", |
48 | 97 | "threshold": 0 |
49 | 98 | }, |
50 | 99 | { |
51 | | - "metric": "sum(sum_over_time(flink_taskmanager_job_task_operator_TransformerJob_dataset_id_transform_failed_count[5m])) + sum(sum_over_time(flink_taskmanager_job_task_operator_TransformerJob_dataset_id_transform_partial_count[5m]))", |
52 | | - "alias": "Number of Failed Transformer Events", |
53 | | - "description": "This alert tracks how many events failed the transformation stage", |
| 100 | + "metric": "druid_ingest_kafka_lag{dataSource=\"dataset_id\"}", |
| 101 | + "alias": "[DATASET]: Detected higher amount of query lag than expected.", |
| 102 | + "category": "Querying", |
| 103 | + "flattened": true, |
| 104 | + "severity": "critical", |
| 105 | + "code": "ALERT_1307", |
| 106 | + "description": "A large amount of data is still waiting to be processed. This may cause delays in querying the most recent data", |
| 107 | + "summary": "High indexer lag in the dataset indicates processing of new data is delayed. Because of this delay, new data isn’t available when querying the dataset.", |
| 108 | + "frequency": "5m", |
| 109 | + "interval": "60m", |
| 110 | + "operator": "gt", |
| 111 | + "threshold": 5000000 |
| 112 | + }, |
| 113 | + { |
| 114 | + "metric": "druid_ingest_kafka_lag{dataSource=\"dataset_id\"}", |
| 115 | + "alias": "[DATASET]: Druid Supervisor Ingestion Failure Due to Offsets.", |
| 116 | + "category": "Querying", |
| 117 | + "flattened": true, |
| 118 | + "severity": "critical", |
| 119 | + "code": "ALERT_1312", |
| 120 | + "description": "The dataset is unhealthy, and no new data has been available for querying since the issue occurred", |
| 121 | + "summary": "The supervisor is experiencing a negative offset, preventing it from ingesting new data. As a result, real-time data is unavailable for querying.", |
| 122 | + "frequency": "5m", |
| 123 | + "interval": "5m", |
| 124 | + "operator": "lt", |
| 125 | + "threshold": 0 |
| 126 | + }, |
| 127 | + { |
| 128 | + "metric": "count(druid_tasks_duration{task_status='FAILED', datasource='dataset_id'}) OR on() vector(0)", |
| 129 | + "alias": "[DATASET]: Druid tasks are in an unhealthy state", |
| 130 | + "category": "Querying", |
| 131 | + "severity": "critical", |
| 132 | + "code": "ALERT_1310", |
| 133 | + "description": "The dataset is unhealthy, and no new data has been available for querying since the system encountered the issue.", |
| 134 | + "summary": "The Druid ingestion tasks are in an unhealthy state, causing data ingestion delays and failures. As a result, real-time data may not be available for querying.", |
| 135 | + "frequency": "5m", |
| 136 | + "interval": "5m", |
| 137 | + "operator": "gt", |
| 138 | + "threshold": 0 |
| 139 | + } |
| 140 | + ], |
| 141 | + "api_metric": [ |
| 142 | + { |
| 143 | + "metric": "sum(sum_over_time(node_failed_api_calls{dataset_id='<dataset_id>', id='api.data.out'}[$__range]))", |
| 144 | + "alias": "[DATASET]: The Data Query API is encountering more failures to retrieve the data", |
| 145 | + "category": "Querying", |
| 146 | + "severity": "warning", |
| 147 | + "code": "ALERT_1305", |
| 148 | + "description": "The dataset has been unavailable for querying data for an extended period", |
| 149 | + "summary": "Query failures are preventing access to the dataset, resulting in an inability to retrieve data as expected.", |
| 150 | + "frequency": "5m", |
| 151 | + "interval": "5m", |
| 152 | + "operator": "gt", |
| 153 | + "threshold": 0 |
| 154 | + }, |
| 155 | + { |
| 156 | + "metric": "avg(avg_over_time(node_query_response_time{dataset_id='<dataset_id>', id='api.data.out'}[$__range]))", |
| 157 | + "alias": "[DATASET]: The Data Query API is facing delays in retrieving data", |
| 158 | + "category": "Querying", |
| 159 | + "severity": "warning", |
| 160 | + "code": "ALERT_1306", |
| 161 | + "description": "There is a delay in querying the dataset for an extended period.", |
| 162 | + "summary": "Delays in queries are affecting access to the dataset, leading to delayed data retrieval.", |
| 163 | + "frequency": "5m", |
| 164 | + "interval": "5m", |
| 165 | + "operator": "gt", |
| 166 | + "threshold": 1000 |
| 167 | + }, |
| 168 | + { |
| 169 | + "metric": "sum(sum_over_time(node_failed_api_calls{dataset_id='<dataset_id>', id='api.data.in'}[$__range]))", |
| 170 | + "alias": "[DATASET]: Failed to ingest data into the system", |
| 171 | + "category": "Ingestion", |
| 172 | + "severity": "warning", |
| 173 | + "code": "ALERT_1101", |
| 174 | + "description": "Detected failures while adding new data to the dataset.", |
| 175 | + "summary": "Failed to add new data to the dataset, impacting real-time data availability.", |
54 | 176 | "frequency": "5m", |
55 | 177 | "interval": "5m", |
56 | 178 | "operator": "gt", |
|
0 commit comments