@@ -523,6 +523,166 @@ func TestRulerAlertmanagerTLS(t *testing.T) {
 	require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_prometheus_notifications_alertmanagers_discovered"}, e2e.WaitMissingMetrics))
 }
 
+func TestRulerMetricsForInvalidQueries(t *testing.T) {
+	s, err := e2e.NewScenario(networkName)
+	require.NoError(t, err)
+	defer s.Close()
+
+	// Start dependencies.
+	consul := e2edb.NewConsul()
+	minio := e2edb.NewMinio(9000, bucketName, rulestoreBucketName)
+	require.NoError(t, s.StartAndWaitReady(consul, minio))
+
+	// Configure the ruler.
+	flags := mergeFlags(
+		BlocksStorageFlags(),
+		RulerFlags(false),
+		map[string]string{
+			// Since we're not going to run any rule (our only rule is invalid), we don't need the
+			// store-gateway to be configured to a valid address.
+			"-querier.store-gateway-addresses": "localhost:12345",
+			// Enable the bucket index so we can skip the initial bucket scan.
+			"-blocks-storage.bucket-store.bucket-index.enabled": "true",
+			// Evaluate rules often, so that we don't need to wait for metrics to show up.
+			"-ruler.evaluation-interval": "2s",
+			"-ruler.poll-interval":       "2s",
+			// No delay
+			"-ruler.evaluation-delay-duration": "0",
+
+			"-blocks-storage.tsdb.block-ranges-period":   "1h",
+			"-blocks-storage.bucket-store.sync-interval": "1s",
+			"-blocks-storage.tsdb.retention-period":      "2h",
+
+			// We run single ingester only, no replication.
+			"-distributor.replication-factor": "1",
+
+			// Very low limit so that ruler hits it.
+			"-querier.max-fetched-chunks-per-query": "5",
+			// We need this to make limit work.
+			"-ingester.stream-chunks-when-using-blocks": "true",
+		},
+	)
+
+	const namespace = "test"
+	const user = "user"
+
+	distributor := e2ecortex.NewDistributor("distributor", consul.NetworkHTTPEndpoint(), flags, "")
+	ruler := e2ecortex.NewRuler("ruler", consul.NetworkHTTPEndpoint(), flags, "")
+	ingester := e2ecortex.NewIngester("ingester", consul.NetworkHTTPEndpoint(), flags, "")
+	require.NoError(t, s.StartAndWaitReady(distributor, ingester, ruler))
+
+	// Wait until both the distributor and ruler have updated the ring. The querier will also watch
+	// the store-gateway ring if blocks sharding is enabled.
+	require.NoError(t, distributor.WaitSumMetrics(e2e.Equals(512), "cortex_ring_tokens_total"))
+	require.NoError(t, ruler.WaitSumMetrics(e2e.Equals(512), "cortex_ring_tokens_total"))
+
+	c, err := e2ecortex.NewClient(distributor.HTTPEndpoint(), "", "", ruler.HTTPEndpoint(), user)
+	require.NoError(t, err)
+
+	// Push some series to Cortex -- enough so that we can hit some limits.
+	for i := 0; i < 10; i++ {
+		series, _ := generateSeries("metric", time.Now(), prompb.Label{Name: "foo", Value: fmt.Sprintf("%d", i)})
+
+		res, err := c.Push(series)
+		require.NoError(t, err)
+		require.Equal(t, 200, res.StatusCode)
+	}
+
+	totalQueries, err := ruler.SumMetrics([]string{"cortex_ruler_queries_total"})
+	require.NoError(t, err)
+
+	// Verify that user failures don't increase cortex_ruler_queries_failed_total.
+	for groupName, expression := range map[string]string{
+		// Syntactically correct expression (passes check in ruler), but failing because of invalid regex. This fails in the PromQL engine.
+		"invalid_group": `label_replace(metric, "foo", "$1", "service", "[")`,
+
+		// This one fails in querier code, because of limits.
+		"too_many_chunks_group": `sum(metric)`,
+	} {
+		t.Run(groupName, func(t *testing.T) {
+			require.NoError(t, c.SetRuleGroup(ruleGroupWithRule(groupName, "rule", expression), namespace))
+			m := ruleGroupMatcher(user, namespace, groupName)
+
+			// Wait until ruler has loaded the group.
+			require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_prometheus_rule_group_rules"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+
+			// Wait until rule group has tried to evaluate the rule.
+			require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_prometheus_rule_evaluations_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+
+			// Verify that evaluation of the rule failed.
+			require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_prometheus_rule_evaluation_failures_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+
+			// But these failures were not reported as "failed queries".
+			sum, err := ruler.SumMetrics([]string{"cortex_ruler_queries_failed_total"})
+			require.NoError(t, err)
+			require.Equal(t, float64(0), sum[0])
+
+			// Delete rule before checking "cortex_ruler_queries_total", as we want to reuse the value for the next test.
+			require.NoError(t, c.DeleteRuleGroup(namespace, groupName))
+
+			// Wait until ruler has unloaded the group. We don't use any matcher, so there should be no groups (in fact, metric disappears).
+			require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_prometheus_rule_group_rules"}, e2e.SkipMissingMetrics))
+
+			// Check that cortex_ruler_queries_total went up since last test.
+			newTotalQueries, err := ruler.SumMetrics([]string{"cortex_ruler_queries_total"})
+			require.NoError(t, err)
+			require.Greater(t, newTotalQueries[0], totalQueries[0])
+
+			// Remember totalQueries for next test.
+			totalQueries = newTotalQueries
+		})
+	}
+
+	// Now let's upload a non-failing rule, and make sure that it works.
+	t.Run("real_error", func(t *testing.T) {
+		const groupName = "good_rule"
+		const expression = `sum(metric{foo=~"1|2"})`
+
+		require.NoError(t, c.SetRuleGroup(ruleGroupWithRule(groupName, "rule", expression), namespace))
+		m := ruleGroupMatcher(user, namespace, groupName)
+
+		// Wait until ruler has loaded the group.
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_prometheus_rule_group_rules"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+
+		// Wait until rule group has tried to evaluate the rule, and succeeded.
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_prometheus_rule_evaluations_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_prometheus_rule_evaluation_failures_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+
+		// Still no failures.
+		sum, err := ruler.SumMetrics([]string{"cortex_ruler_queries_failed_total"})
+		require.NoError(t, err)
+		require.Equal(t, float64(0), sum[0])
+
+		// Now let's stop ingester, and recheck metrics. This should increase cortex_ruler_queries_failed_total failures.
+		require.NoError(t, s.Stop(ingester))
+
+		// We should start getting "real" failures now.
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_ruler_queries_failed_total"}))
+	})
+}
+
+func ruleGroupMatcher(user, namespace, groupName string) *labels.Matcher {
+	return labels.MustNewMatcher(labels.MatchEqual, "rule_group", fmt.Sprintf("/rules/%s/%s;%s", user, namespace, groupName))
+}
+
+func ruleGroupWithRule(groupName string, ruleName string, expression string) rulefmt.RuleGroup {
+	// Prepare a rule group with a single recording rule using the given name and expression.
+	var recordNode = yaml.Node{}
+	var exprNode = yaml.Node{}
+
+	recordNode.SetString(ruleName)
+	exprNode.SetString(expression)
+
+	return rulefmt.RuleGroup{
+		Name:     groupName,
+		Interval: 10,
+		Rules: []rulefmt.RuleNode{{
+			Record: recordNode,
+			Expr:   exprNode,
+		}},
+	}
+}
+
 func createTestRuleGroup(t *testing.T) rulefmt.RuleGroup {
 	t.Helper()