|
3 | 3 | "alerts": { |
4 | 4 | "dataset_metrics_flink": [ |
5 | 5 | { |
6 | | - "metric": "sum(sum_over_time(flink_taskmanager_job_task_operator_ExtractorJob_dataset_id_extractor_failed_count[5m]))", |
7 | | - "alias": "Number of Failed Extraction Events", |
8 | | - "description": "This alert tracks how many events failed the extraction stage", |
| 6 | + "metric": "sum(sum_over_time(flink_taskmanager_job_task_operator_ExtractorJob_dataset_id_extractor_failed_count[5m])) + sum(sum_over_time(flink_taskmanager_job_task_operator_ExtractorJob_dataset_id_extractor_duplicate_count[5m])) + sum(sum_over_time(flink_taskmanager_job_task_operator_PipelinePreprocessorJob_dataset_id_validator_failed_count[5m]))", |
| 7 | + "alias": "[DATASET]: Detected higher rate of invalid data than expected", |
| 8 | + "category": "Processing", |
| 9 | + "severity": "critical", |
| 10 | + "code": "ALERT_1203", |
| 11 | + "description": "The dataset is unhealthy, and the query results may be incorrect", |
| 12 | + "summary": "Invalid data has been ingested in the system, preventing it from being processed. Henceforth, queries on this dataset may not return accurate data.", |
9 | 13 | "frequency": "5m", |
10 | 14 | "interval": "5m", |
11 | 15 | "operator": "gt", |
12 | 16 | "threshold": 0 |
13 | 17 | }, |
14 | 18 | { |
15 | | - "metric": "sum(sum_over_time(flink_taskmanager_job_task_operator_ExtractorJob_dataset_id_extractor_duplicate_count[5m]))", |
16 | | - "alias": "Number of Duplicate Extraction Events", |
17 | | - "description": "This alert tracks how many duplicate events were found during extraction stage", |
| 19 | + "metric": "sum(sum_over_time(flink_taskmanager_job_task_operator_PipelinePreprocessorJob_dataset_id_dedup_failed_count[5m]))", |
| 20 | + "alias": "[DATASET]: Detected higher rate of duplicate data than expected", |
| 21 | + "category": "Processing", |
| 22 | + "severity": "warning", |
| 23 | + "code": "ALERT_1204", |
| 24 | + "description": "The dataset is unhealthy, and the query results may be incorrect", |
| 25 | + "summary": "Duplicate data has been ingested in the system, preventing it from being processed. Henceforth, queries on this dataset may not return accurate data.", |
18 | 26 | "frequency": "5m", |
19 | 27 | "interval": "5m", |
20 | 28 | "operator": "gt", |
21 | 29 | "threshold": 0 |
22 | 30 | }, |
23 | 31 | { |
24 | | - "metric": "sum(sum_over_time(flink_taskmanager_job_task_operator_PipelinePreprocessorJob_dataset_id_dedup_failed_count[5m]))", |
25 | | - "alias": "Number of Duplicate Preprocessing Events", |
26 | | - "description": "This alert tracks how many duplicate events were found during preprocessing stage", |
| 32 | + "metric": "sum(sum_over_time(flink_taskmanager_job_task_operator_DenormalizerJob_dataset_id_denorm_failed[5m])) + sum(sum_over_time(flink_taskmanager_job_task_operator_DenormalizerJob_dataset_id_denorm_partial_success[5m]))", |
| 33 | + "alias": "[DATASET]: Detected higher incidence of failures during data enrichment.", |
| 34 | + "category": "Processing", |
| 35 | + "severity": "warning", |
| 36 | + "code": "ALERT_1205", |
| 37 | + "description": "The dataset is unhealthy, and the query results may be incorrect", |
| 38 | + "summary": "The data ingested into the system is failing the enrichment process, which may cause queries on this dataset to return inaccurate data.", |
27 | 39 | "frequency": "5m", |
28 | 40 | "interval": "5m", |
29 | 41 | "operator": "gt", |
30 | 42 | "threshold": 0 |
31 | 43 | }, |
32 | 44 | { |
33 | | - "metric": "sum(sum_over_time(flink_taskmanager_job_task_operator_PipelinePreprocessorJob_dataset_id_validator_failed_count[5m]))", |
34 | | - "alias": "Number of Failed Validation Events", |
35 | | - "description": "This alert tracks how many events failed the validation stage", |
| 45 | + "metric": "sum(sum_over_time(flink_taskmanager_job_task_operator_TransformerJob_dataset_id_transform_failed_count[5m])) + sum(sum_over_time(flink_taskmanager_job_task_operator_TransformerJob_dataset_id_transform_partial_count[5m]))", |
| 46 | + "alias": "[DATASET]: Detected higher incidence of failures during data transformations.", |
| 47 | + "category": "Processing", |
| 48 | + "severity": "warning", |
| 49 | + "code": "ALERT_1206", |
| 50 | + "description": "The dataset is unhealthy, and the query results may be incorrect", |
| 51 | + "summary": "The data ingested into the system is failing the data transformation process, which may cause queries on this dataset to return inaccurate data.", |
36 | 52 | "frequency": "5m", |
37 | 53 | "interval": "5m", |
38 | 54 | "operator": "gt", |
39 | 55 | "threshold": 0 |
40 | 56 | }, |
41 | 57 | { |
42 | | - "metric": "sum(sum_over_time(flink_taskmanager_job_task_operator_DenormalizerJob_dataset_id_denorm_failed[5m])) + sum(sum_over_time(flink_taskmanager_job_task_operator_DenormalizerJob_dataset_id_denorm_partial_success[5m]))", |
43 | | - "alias": "Number of Failed Denorm Events", |
44 | | - "description": "This alert tracks how many events failed the denorm stage", |
| 58 | + "metric": "sum(sum_over_time(flink_taskmanager_job_task_operator_PipelinePreprocessorJob_dataset_id_validator_total_count[$__range]))", |
| 59 | + "alias": "[DATASET]: No data has been received for the past hour.", |
| 60 | + "category": "Processing", |
| 61 | + "severity": "warning", |
| 62 | + "code": "ALERT_1209", |
| 63 | + "description": "The dataset hasn’t received any new data for the past hour, which may affect the querying of the new data.", |
| 64 | + "summary": "The dataset has not received any new data, which will impact real-time data processing", |
| 65 | + "frequency": "5m", |
| 66 | + "interval": "60m", |
| 67 | + "operator": "lt", |
| 68 | + "threshold": 1 |
| 69 | + } |
| 70 | + ], |
| 71 | + "dataset_metrics_druid": [ |
| 72 | + { |
| 73 | + "metric": "max(druid_supervisors{supervisor_name=\"dataset_id\", state=\"RUNNING\"} or (0 * absent(druid_supervisors{supervisor_name=\"dataset_id\", state=\"RUNNING\"})))", |
| 74 | + "alias": "[DATASET]: Druid supervisor is in an unhealthy state", |
| 75 | + "category": "Querying", |
| 76 | + "severity": "critical", |
| 77 | + "code": "ALERT_1309", |
| 78 | + "description": "The dataset is unhealthy, and no new data has been available for querying since the system encountered the issue.", |
| 79 | + "summary": "The associated Druid Supervisor is in an unhealthy state, preventing druid ingestion tasks from running. As a result, real-time data cannot be queried.", |
| 80 | + "frequency": "5m", |
| 81 | + "interval": "5m", |
| 82 | + "operator": "lt", |
| 83 | + "threshold": 1 |
| 84 | + }, |
| 85 | + { |
| 86 | + "metric": "druid_ingest_events_unparseable_total{dataSource=\"dataset_id\"}", |
| 87 | + "alias": "[DATASET]: Detected higher amount of unparseable data.", |
| 88 | + "flattened": true, |
| 89 | + "category": "Querying", |
| 90 | + "severity": "critical", |
| 91 | + "code": "ALERT_1308", |
| 92 | + "description": "The dataset is unhealthy, and the query results may be incorrect", |
| 93 | + "summary": "Unparseable data has been detected in the system, preventing it from being processed. Henceforth, queries on this dataset may not return accurate data until the issue is resolved.", |
45 | 94 | "frequency": "5m", |
46 | 95 | "interval": "5m", |
47 | 96 | "operator": "gt", |
48 | 97 | "threshold": 0 |
49 | 98 | }, |
50 | 99 | { |
51 | | - "metric": "sum(sum_over_time(flink_taskmanager_job_task_operator_TransformerJob_dataset_id_transform_failed_count[5m])) + sum(sum_over_time(flink_taskmanager_job_task_operator_TransformerJob_dataset_id_transform_partial_count[5m]))", |
52 | | - "alias": "Number of Failed Transformer Events", |
53 | | - "description": "This alert tracks how many events failed the transformation stage", |
| 100 | + "metric": "druid_ingest_kafka_lag{dataSource=\"dataset_id\"}", |
| 101 | + "alias": "[DATASET]: Detected higher amount of query lag than expected.", |
| 102 | + "category": "Querying", |
| 103 | + "flattened": true, |
| 104 | + "severity": "critical", |
| 105 | + "code": "ALERT_1307", |
| 106 | + "description": "A large amount of data is still waiting to be processed. This may cause delays in querying the most recent data", |
| 107 | + "summary": "High indexer lag in the dataset indicates processing of new data is delayed. Because of this delay, new data isn’t available when querying the dataset.", |
| 108 | + "frequency": "5m", |
| 109 | + "interval": "60m", |
| 110 | + "operator": "gt", |
| 111 | + "threshold": 5000000 |
| 112 | + }, |
| 113 | + { |
| 114 | + "metric": "druid_ingest_kafka_lag{dataSource=\"dataset_id\"}", |
| 115 | + "alias": "[DATASET]: Druid Supervisor Ingestion Failure Due to Offsets.", |
| 116 | + "category": "Querying", |
| 117 | + "flattened": true, |
| 118 | + "severity": "critical", |
| 119 | + "code": "ALERT_1312", |
| 120 | + "description": "The dataset is unhealthy, and no new data has been available for querying since the issue occurred", |
| 121 | + "summary": "The supervisor is experiencing a negative offset, preventing it from ingesting new data. As a result, real-time data is unavailable for querying.", |
| 122 | + "frequency": "5m", |
| 123 | + "interval": "5m", |
| 124 | + "operator": "lt", |
| 125 | + "threshold": 0 |
| 126 | + }, |
| 127 | + { |
| 128 | + "metric": "count(druid_tasks_duration{task_status='FAILED', datasource='dataset_id'}) OR on() vector(0)", |
| 129 | + "alias": "[DATASET]: Druid tasks are in an unhealthy state", |
| 130 | + "category": "Querying", |
| 131 | + "severity": "critical", |
| 132 | + "code": "ALERT_1310", |
| 133 | + "description": "The dataset is unhealthy, and no new data has been available for querying since the system encountered the issue.", |
| 134 | + "summary": "The Druid ingestion tasks are in an unhealthy state, causing data ingestion delays and failures. As a result, real-time data may not be available for querying.", |
| 135 | + "frequency": "5m", |
| 136 | + "interval": "5m", |
| 137 | + "operator": "gt", |
| 138 | + "threshold": 0 |
| 139 | + } |
| 140 | + ], |
| 141 | + "api_metric": [ |
| 142 | + { |
| 143 | + "metric": "sum(sum_over_time(node_failed_api_calls{dataset_id='<dataset_id>', id='api.data.out'}[$__range]))", |
| 144 | + "alias": "[DATASET]: The Data Query API is encountering more failures to retrieve the data", |
| 145 | + "category": "Querying", |
| 146 | + "severity": "warning", |
| 147 | + "code": "ALERT_1305", |
| 148 | + "description": "The dataset has been unavailable for querying data for an extended period", |
| 149 | + "summary": "Query failures are preventing access to the dataset, resulting in an inability to retrieve data as expected.", |
| 150 | + "frequency": "5m", |
| 151 | + "interval": "5m", |
| 152 | + "operator": "gt", |
| 153 | + "threshold": 0 |
| 154 | + }, |
| 155 | + { |
| 156 | + "metric": "avg(avg_over_time(node_query_response_time{dataset_id='<dataset_id>', id='api.data.out'}[$__range]))", |
| 157 | + "alias": "[DATASET]: The Data Query API is facing delays in retrieving data", |
| 158 | + "category": "Querying", |
| 159 | + "severity": "warning", |
| 160 | + "code": "ALERT_1306", |
| 161 | + "description": "There is a delay in querying the dataset for an extended period.", |
| 162 | + "summary": "Delays in queries are affecting access to the dataset, leading to delayed data retrieval.", |
| 163 | + "frequency": "5m", |
| 164 | + "interval": "5m", |
| 165 | + "operator": "gt", |
| 166 | + "threshold": 1000 |
| 167 | + }, |
| 168 | + { |
| 169 | + "metric": "sum(sum_over_time(node_failed_api_calls{dataset_id='<dataset_id>', id='api.data.in'}[$__range]))", |
| 170 | + "alias": "[DATASET]: Failed to ingest data into the system", |
| 171 | + "category": "Ingestion", |
| 172 | + "severity": "warning", |
| 173 | + "code": "ALERT_1101", |
| 174 | + "description": "Detected failures while adding new data to the dataset.", |
| 175 | + "summary": "Failed to add new data to the dataset, impacting real-time data availability.", |
54 | 176 | "frequency": "5m", |
55 | 177 | "interval": "5m", |
56 | 178 | "operator": "gt", |
|
0 commit comments