
Commit c22cc28

narqo, codesome, and pracucci authored
blockbuilder: backport top-level service (#9068)
* blockbuilder: backport top-level service (Co-authored-by: Ganesh Vernekar <[email protected]>)
* clean up kafka config and metrics
* improve logging
* rephrase metrics help
* simplify getGroupLag
* Added comments (Signed-off-by: Marco Pracucci <[email protected]>)
* use explicit local names
* fail app on non-recoverable errors
* exclude metrics not currently used
* refactor config
* storage/ingest: expose and reuse NewKafkaLogger
* fix back off in getLagForPartition
* add missing license header
* store data in a dedicated working directory
* test nextCycleEnd calculations
* redundant code
* fixup! test nextCycleEnd calculations
* Apply suggestions from code review (Co-authored-by: Marco Pracucci <[email protected]>)
* address minor comments
* tests for cycleEnd calculation
* simplify getGroupLag

---------

Signed-off-by: Vladimir Varankin <[email protected]>
Signed-off-by: Marco Pracucci <[email protected]>
Co-authored-by: Ganesh Vernekar <[email protected]>
Co-authored-by: Marco Pracucci <[email protected]>
1 parent fdc7cd2 commit c22cc28

File tree

9 files changed: +1040 -6 lines changed


pkg/blockbuilder/blockbuilder.go

Lines changed: 340 additions & 0 deletions
@@ -0,0 +1,340 @@
// SPDX-License-Identifier: AGPL-3.0-only

package blockbuilder

import (
	"context"
	"errors"
	"fmt"
	"os"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/backoff"
	"github.com/grafana/dskit/services"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/thanos-io/objstore"
	"github.com/twmb/franz-go/pkg/kadm"
	"github.com/twmb/franz-go/pkg/kerr"
	"github.com/twmb/franz-go/pkg/kgo"
	"github.com/twmb/franz-go/plugin/kprom"

	"github.com/grafana/mimir/pkg/storage/bucket"
	"github.com/grafana/mimir/pkg/storage/ingest"
	"github.com/grafana/mimir/pkg/util/validation"
)

type BlockBuilder struct {
	services.Service

	cfg         Config
	logger      log.Logger
	register    prometheus.Registerer
	limits      *validation.Overrides
	kafkaClient *kgo.Client
	bucket      objstore.Bucket

	assignedPartitionIDs []int32
	// fallbackOffsetMillis is the milliseconds timestamp after which a partition that doesn't have a commit will be consumed from.
	fallbackOffsetMillis int64

	metrics blockBuilderMetrics
}

func New(
	cfg Config,
	logger log.Logger,
	reg prometheus.Registerer,
	limits *validation.Overrides,
) (*BlockBuilder, error) {
	b := &BlockBuilder{
		cfg:      cfg,
		logger:   logger,
		register: reg,
		limits:   limits,
		metrics:  newBlockBuilderMetrics(reg),
	}

	b.assignedPartitionIDs = b.cfg.PartitionAssignment[b.cfg.InstanceID]
	if len(b.assignedPartitionIDs) == 0 {
		// This is just an assertion check. The config validation prevents this from happening.
		return nil, fmt.Errorf("no partitions assigned to instance %s", b.cfg.InstanceID)
	}

	bucketClient, err := bucket.NewClient(context.Background(), cfg.BlocksStorageConfig.Bucket, "block-builder", logger, reg)
	if err != nil {
		return nil, fmt.Errorf("failed to create the bucket client: %w", err)
	}
	b.bucket = bucketClient

	b.Service = services.NewBasicService(b.starting, b.running, b.stopping)

	return b, nil
}

func (b *BlockBuilder) starting(ctx context.Context) (err error) {
	// Empty any previous artifacts.
	if err := os.RemoveAll(b.cfg.DataDir); err != nil {
		return fmt.Errorf("removing data dir: %w", err)
	}
	if err := os.MkdirAll(b.cfg.DataDir, os.ModePerm); err != nil {
		return fmt.Errorf("creating data dir: %w", err)
	}

	// TODO: add a test for the case where the consumption on startup happens after the
	// LookbackOnNoCommit window, with records both before and after the consumption start point.
	startAtOffsets, fallbackOffsetMillis, err := b.findOffsetsToStartAt(ctx)
	if err != nil {
		return fmt.Errorf("find offsets to start at: %w", err)
	}
	b.fallbackOffsetMillis = fallbackOffsetMillis

	const fetchMaxBytes = 100_000_000

	opts := []kgo.Opt{
		kgo.ConsumePartitions(map[string]map[int32]kgo.Offset{
			b.cfg.Kafka.Topic: startAtOffsets,
		}),
		kgo.FetchMinBytes(128),
		kgo.FetchMaxBytes(fetchMaxBytes),
		kgo.FetchMaxWait(5 * time.Second),
		kgo.FetchMaxPartitionBytes(50_000_000),
		// BrokerMaxReadBytes sets the maximum response size that can be read from
		// Kafka. This is a safety measure to avoid OOMing on invalid responses.
		// franz-go recommendation is to set it 2x FetchMaxBytes.
		kgo.BrokerMaxReadBytes(2 * fetchMaxBytes),
	}

	metrics := ingest.NewKafkaReaderClientMetrics("block-builder", b.register)
	opts = append(
		commonKafkaClientOptions(b.cfg.Kafka, b.logger, metrics),
		opts...,
	)
	b.kafkaClient, err = kgo.NewClient(opts...)
	if err != nil {
		return fmt.Errorf("creating kafka client: %w", err)
	}

	// Immediately pause fetching all assigned partitions. We control the order and the pace of partition fetching within the cycles.
	b.kafkaClient.PauseFetchPartitions(map[string][]int32{b.cfg.Kafka.Topic: b.assignedPartitionIDs})

	return nil
}

func (b *BlockBuilder) findOffsetsToStartAt(ctx context.Context) (map[int32]kgo.Offset, int64, error) {
	// We use an ephemeral client to fetch the offset and then create a new client with this offset.
	// The reason for this is that changing the offset of an existing client requires to have used this client for fetching at least once.
	// We don't want to do noop fetches just to warm up the client, so we create a new client instead.
	cl, err := kgo.NewClient(commonKafkaClientOptions(b.cfg.Kafka, b.logger, nil)...)
	if err != nil {
		return nil, -1, fmt.Errorf("unable to create bootstrap client: %w", err)
	}
	defer cl.Close()

	admClient := kadm.NewClient(cl)

	// Fallback offset is the millisecond timestamp used to look up a real offset if the partition doesn't have a commit.
	fallbackOffsetMillis := time.Now().Add(-b.cfg.LookbackOnNoCommit).UnixMilli()
	fallbackOffset := kgo.NewOffset().AfterMilli(fallbackOffsetMillis)

	fetchOffsets := func(ctx context.Context) (offsets map[int32]kgo.Offset, err error) {
		resp, err := admClient.FetchOffsets(ctx, b.cfg.Kafka.ConsumerGroup)
		if err == nil {
			err = resp.Error()
		}
		// Either success or the requested group does not exist.
		if errors.Is(err, kerr.GroupIDNotFound) || errors.Is(err, kerr.UnknownTopicOrPartition) {
			offsets = make(map[int32]kgo.Offset)
		} else if err != nil {
			return nil, fmt.Errorf("unable to fetch group offsets: %w", err)
		} else {
			offsets = resp.KOffsets()[b.cfg.Kafka.Topic]
		}

		// Look over the assigned partitions and fall back to the fallback offset if a partition doesn't have any commit for the configured group.
		for _, partition := range b.assignedPartitionIDs {
			if _, ok := offsets[partition]; ok {
				level.Info(b.logger).Log("msg", "consuming from last consumed offset", "consumer_group", b.cfg.Kafka.ConsumerGroup, "partition", partition, "offset", offsets[partition].String())
			} else {
				offsets[partition] = fallbackOffset
				level.Info(b.logger).Log("msg", "consuming from partition lookback because no offset has been found", "consumer_group", b.cfg.Kafka.ConsumerGroup, "partition", partition, "offset", offsets[partition].String())
			}
		}
		return offsets, nil
	}

	boff := backoff.New(ctx, backoff.Config{
		MinBackoff: 100 * time.Millisecond,
		MaxBackoff: 2 * time.Second,
		MaxRetries: 10,
	})
	for boff.Ongoing() {
		var offsets map[int32]kgo.Offset
		offsets, err = fetchOffsets(ctx)
		if err == nil {
			return offsets, fallbackOffsetMillis, nil
		}
		level.Warn(b.logger).Log("msg", "failed to fetch startup offsets; will retry", "err", err)
		boff.Wait()
	}
	// Handle the case where the context was canceled before the first attempt.
	if err == nil {
		err = boff.Err()
	}
	return nil, -1, err
}

// TODO(v): consider exposing storage/ingest.commonKafkaClientOptions
func commonKafkaClientOptions(cfg KafkaConfig, logger log.Logger, metrics *kprom.Metrics) []kgo.Opt {
	opts := []kgo.Opt{
		kgo.ClientID(cfg.ClientID),
		kgo.SeedBrokers(cfg.Address),
		kgo.DialTimeout(cfg.DialTimeout),
		kgo.MetadataMinAge(10 * time.Second),
		kgo.MetadataMaxAge(10 * time.Second),
		kgo.WithLogger(ingest.NewKafkaLogger(logger)),
	}
	if metrics != nil {
		opts = append(opts, kgo.WithHooks(metrics))
	}
	return opts
}

func (b *BlockBuilder) stopping(_ error) error {
	b.kafkaClient.Close()

	return nil
}

func (b *BlockBuilder) running(ctx context.Context) error {
	// Do initial consumption on start using the current time as the point up to which we are consuming.
	// To avoid small blocks at startup, we consume until the <consume interval> boundary + buffer.
	cycleEnd := cycleEndAtStartup(time.Now, b.cfg.ConsumeInterval, b.cfg.ConsumeIntervalBuffer)
	err := b.nextConsumeCycle(ctx, cycleEnd)
	if err != nil {
		if errors.Is(err, context.Canceled) {
			return nil
		}
		return err
	}

	cycleEnd, waitDur := nextCycleEnd(time.Now, b.cfg.ConsumeInterval, b.cfg.ConsumeIntervalBuffer)
	for {
		select {
		case <-time.After(waitDur):
			level.Info(b.logger).Log("msg", "triggering next consume cycle", "cycle_end", cycleEnd)
			err := b.nextConsumeCycle(ctx, cycleEnd)
			if err != nil && !errors.Is(err, context.Canceled) {
				// Fail the whole service in case of a non-recoverable error.
				return fmt.Errorf("consume next cycle until cycle_end %s: %w", cycleEnd, err)
			}

			// If we took more than ConsumeInterval to consume the records, this will immediately start the next consumption.
			// TODO(codesome): track waitDur < 0, which is the time we ran over. Should have an alert on this.
			cycleEnd = cycleEnd.Add(b.cfg.ConsumeInterval)
			waitDur = time.Until(cycleEnd)
		case <-ctx.Done():
			level.Info(b.logger).Log("msg", "context cancelled, stopping")
			return nil
		}
	}
}

func cycleEndAtStartup(now func() time.Time, interval, buffer time.Duration) time.Time {
	t := now()
	cycleEnd := t.Truncate(interval).Add(buffer)
	if cycleEnd.After(t) {
		cycleEnd = cycleEnd.Add(-interval)
	}
	return cycleEnd
}

func nextCycleEnd(now func() time.Time, interval, buffer time.Duration) (time.Time, time.Duration) {
	t := now()
	cycleEnd := t.Truncate(interval).Add(interval + buffer)
	waitTime := cycleEnd.Sub(t)
	for waitTime > interval {
		// Example - with interval=1h and buffer=15m:
		// - at now=14:12, next cycle starts at 14:15 (startup cycle ended at 13:15)
		// - at now=14:17, next cycle starts at 15:15 (startup cycle ended at 14:15)
		cycleEnd = cycleEnd.Add(-interval)
		waitTime -= interval
	}
	return cycleEnd, waitTime
}

// nextConsumeCycle manages consumption of currently assigned partitions.
// The cycleEnd argument indicates the timestamp (relative to Kafka records) up until which to consume from partitions
// in this cycle. That is, Kafka records produced after the cycleEnd mark will be consumed in the next cycle.
func (b *BlockBuilder) nextConsumeCycle(ctx context.Context, cycleEnd time.Time) error {
	defer func(t time.Time) {
		b.metrics.consumeCycleDuration.Observe(time.Since(t).Seconds())
	}(time.Now())

	for _, partition := range b.assignedPartitionIDs {
		if ctx.Err() != nil {
			return ctx.Err()
		}

		// TODO(v): calculating lag for each individual partition requests data for the whole group every time. This is redundant.
		// Since, currently, we don't expect a rebalance (re-assignment) to happen in the middle of the cycle, we could calculate
		// the lag once for all assigned partitions at the beginning of the cycle.
		// Lag is the upper bound on the number of records we'll have to consume from Kafka to build the blocks.
		// It's an upper bound because the consumption may be stopped earlier if we get records containing
		// samples with a timestamp greater than the cycleEnd timestamp.
		lag, err := b.getLagForPartition(ctx, partition)
		if err != nil {
			level.Error(b.logger).Log("msg", "failed to get partition lag", "err", err, "partition", partition, "cycle_end", cycleEnd)
			continue
		}

		b.metrics.consumerLagRecords.WithLabelValues(lag.Topic, fmt.Sprintf("%d", lag.Partition)).Set(float64(lag.Lag))

		if lag.Lag <= 0 {
			if err := lag.Err; err != nil {
				level.Error(b.logger).Log("msg", "failed to get partition lag", "err", err, "partition", partition, "cycle_end", cycleEnd)
			} else {
				level.Info(b.logger).Log(
					"msg", "nothing to consume in partition",
					"partition", partition,
					"offset", lag.Commit.At,
					"end_offset", lag.End.Offset,
					"lag", lag.Lag,
				)
			}
		}

		// TODO(v): backport the code for consuming the partition up to its lag.Lag
	}
	return nil
}

func (b *BlockBuilder) getLagForPartition(ctx context.Context, partition int32) (kadm.GroupMemberLag, error) {
	boff := backoff.New(ctx, backoff.Config{
		MinBackoff: 100 * time.Millisecond,
		MaxBackoff: time.Second,
		MaxRetries: 10,
	})
	var lastErr error
	for boff.Ongoing() {
		groupLag, err := getGroupLag(ctx, kadm.NewClient(b.kafkaClient), b.cfg.Kafka.Topic, b.cfg.Kafka.ConsumerGroup, b.fallbackOffsetMillis)
		if err != nil {
			lastErr = fmt.Errorf("get consumer group lag: %w", err)
		} else {
			lag, ok := groupLag.Lookup(b.cfg.Kafka.Topic, partition)
			if ok {
				return lag, nil
			}
			// This should not happen with the recent implementation of getGroupLag, which handles the case when the group doesn't have live participants;
			// the check is left here for completeness.
			lastErr = fmt.Errorf("partition %d not found in lag response", partition)
		}

		level.Warn(b.logger).Log("msg", "failed to get consumer group lag; will retry", "err", lastErr, "partition", partition)
		boff.Wait()
	}

	return kadm.GroupMemberLag{}, lastErr
}
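
The commit message mentions tests for the cycleEnd calculations, but that test file is not among the diffs shown here. As a rough illustration only, the following table-driven sketch exercises the two helpers above; the TestCycleEndBoundaries name and the test table are hypothetical, with expected values taken from the example in the nextCycleEnd comment (interval=1h, buffer=15m).

// SPDX-License-Identifier: AGPL-3.0-only

package blockbuilder

import (
	"testing"
	"time"
)

// TestCycleEndBoundaries is an illustrative sketch, not the test shipped in the PR.
// The expected values mirror the example in the nextCycleEnd comment (interval=1h, buffer=15m).
func TestCycleEndBoundaries(t *testing.T) {
	const interval, buffer = time.Hour, 15 * time.Minute

	day := time.Date(2024, 8, 1, 0, 0, 0, 0, time.UTC)
	for _, tc := range []struct {
		now         time.Time
		wantStartup time.Time     // expected cycleEndAtStartup
		wantNext    time.Time     // expected nextCycleEnd
		wantWait    time.Duration // expected wait until the next cycle
	}{
		// At 14:12 the startup cycle ends at 13:15; the next cycle triggers at 14:15.
		{
			now:         day.Add(14*time.Hour + 12*time.Minute),
			wantStartup: day.Add(13*time.Hour + 15*time.Minute),
			wantNext:    day.Add(14*time.Hour + 15*time.Minute),
			wantWait:    3 * time.Minute,
		},
		// At 14:17 the startup cycle ends at 14:15; the next cycle triggers at 15:15.
		{
			now:         day.Add(14*time.Hour + 17*time.Minute),
			wantStartup: day.Add(14*time.Hour + 15*time.Minute),
			wantNext:    day.Add(15*time.Hour + 15*time.Minute),
			wantWait:    58 * time.Minute,
		},
	} {
		now := func() time.Time { return tc.now }

		if got := cycleEndAtStartup(now, interval, buffer); !got.Equal(tc.wantStartup) {
			t.Errorf("cycleEndAtStartup(%v) = %v, want %v", tc.now, got, tc.wantStartup)
		}
		gotNext, gotWait := nextCycleEnd(now, interval, buffer)
		if !gotNext.Equal(tc.wantNext) || gotWait != tc.wantWait {
			t.Errorf("nextCycleEnd(%v) = (%v, %v), want (%v, %v)", tc.now, gotNext, gotWait, tc.wantNext, tc.wantWait)
		}
	}
}
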
Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
// SPDX-License-Identifier: AGPL-3.0-only

package blockbuilder

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

type blockBuilderMetrics struct {
	consumeCycleDuration prometheus.Histogram
	consumerLagRecords   *prometheus.GaugeVec
}

func newBlockBuilderMetrics(reg prometheus.Registerer) blockBuilderMetrics {
	var m blockBuilderMetrics

	m.consumeCycleDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
		Name: "cortex_blockbuilder_consume_cycle_duration_seconds",
		Help: "Time spent consuming a full cycle.",

		NativeHistogramBucketFactor: 1.1,
	})

	m.consumerLagRecords = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
		Name: "cortex_blockbuilder_consumer_lag_records",
		Help: "The per-topic-partition number of records the instance needs to work through each cycle.",
	}, []string{"topic", "partition"})

	return m
}
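
As a minimal, hypothetical usage sketch (the exampleMetricsUsage helper and the "ingest" topic label are illustrative, not part of the commit), this shows how the two metrics register and record against a standalone Prometheus registry:

// SPDX-License-Identifier: AGPL-3.0-only

package blockbuilder

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// exampleMetricsUsage wires the block-builder metrics to a fresh registry,
// records one sample per metric, and lists the registered metric families.
func exampleMetricsUsage() error {
	reg := prometheus.NewRegistry()
	m := newBlockBuilderMetrics(reg)

	// A consume cycle that took 42 seconds.
	m.consumeCycleDuration.Observe((42 * time.Second).Seconds())

	// 1500 records of lag on partition 0 of a hypothetical "ingest" topic.
	m.consumerLagRecords.WithLabelValues("ingest", "0").Set(1500)

	families, err := reg.Gather()
	if err != nil {
		return err
	}
	for _, fam := range families {
		// Prints cortex_blockbuilder_consume_cycle_duration_seconds and
		// cortex_blockbuilder_consumer_lag_records.
		fmt.Println(fam.GetName())
	}
	return nil
}
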
