Skip to content

Commit 006c3aa

Browse files
authored
TBS: Optimize performance for instances with more CPU and memory (elastic#17254)
To improve performance on larger instances (e.g. 16GB+ of memory on cloud), a few tweaks are made:
- The DB cache size is autoscaled to the memory limit.
- Concurrent compaction is enabled to increase CPU utilization.
Benchstat results on a 32GB GCP instance on ECH with a 0.01 sample rate: +60% throughput after this PR.
1 parent 42567f2 commit 006c3aa

File tree

6 files changed

+49
-13
lines changed

6 files changed

+49
-13
lines changed

internal/beater/beater.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,14 @@ func (s *Runner) Run(ctx context.Context) error {
240240
)
241241
}
242242

243+
if s.config.Sampling.Tail.Enabled && s.config.Sampling.Tail.DatabaseCacheSize == 0 {
244+
// 1GB=16MB, 2GB=24MB, 4GB=40MB, ..., 32GB=264MB, 64GB=520MB
245+
s.config.Sampling.Tail.DatabaseCacheSize = uint64(linearScaledValue(8<<20, memLimitGB, 8<<20))
246+
s.logger.Infof("Sampling.Tail.DatabaseCacheSize set to %d based on %0.1fgb of memory",
247+
s.config.Sampling.Tail.DatabaseCacheSize, memLimitGB,
248+
)
249+
}
250+
243251
// Send config to telemetry.
244252
recordAPMServerConfig(s.config)
245253

internal/beater/config/sampling.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,9 @@ type TailSamplingConfig struct {
6363

6464
DiscardOnWriteFailure bool `config:"discard_on_write_failure"`
6565

66+
// DatabaseCacheSize is the cache size in bytes for the tail-sampling database.
67+
DatabaseCacheSize uint64 `config:"database_cache_size"`
68+
6669
esConfigured bool
6770
}
6871

x-pack/apm-server/main.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ func newTailSamplingProcessor(args beater.ServerParams) (*sampling.Processor, er
108108
}
109109

110110
storageDir := paths.Resolve(paths.Data, tailSamplingStorageDir)
111-
db, err := getDB(storageDir, args.MeterProvider, args.Logger)
111+
db, err := getDB(storageDir, tailSamplingConfig.DatabaseCacheSize, args.MeterProvider, args.Logger)
112112
if err != nil {
113113
return nil, fmt.Errorf("failed to get tail-sampling database: %w", err)
114114
}
@@ -154,11 +154,13 @@ func newTailSamplingProcessor(args beater.ServerParams) (*sampling.Processor, er
154154
}, args.Logger)
155155
}
156156

157-
func getDB(storageDir string, mp metric.MeterProvider, logger *logp.Logger) (*eventstorage.StorageManager, error) {
157+
func getDB(storageDir string, cacheSize uint64, mp metric.MeterProvider, logger *logp.Logger) (*eventstorage.StorageManager, error) {
158158
dbMu.Lock()
159159
defer dbMu.Unlock()
160160
if db == nil {
161-
var opts []eventstorage.StorageManagerOptions
161+
opts := []eventstorage.StorageManagerOptions{
162+
eventstorage.WithDBCacheSize(cacheSize),
163+
}
162164
if mp != nil {
163165
opts = append(opts, eventstorage.WithMeterProvider(mp))
164166
}

x-pack/apm-server/sampling/eventstorage/pebble.go

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,10 @@ func eventComparer() *pebble.Comparer {
3838
return &comparer
3939
}
4040

41-
func OpenEventPebble(storageDir string, logger *logp.Logger) (*pebble.DB, error) {
41+
func OpenEventPebble(storageDir string, cacheSize uint64, logger *logp.Logger) (*pebble.DB, error) {
4242
// Option values are picked and validated in https://github.com/elastic/apm-server/issues/15568
43+
cache := pebble.NewCache(int64(cacheSize))
44+
defer cache.Unref()
4345
opts := &pebble.Options{
4446
FormatMajorVersion: pebble.FormatColumnarBlocks,
4547
Logger: logger.Named(logs.Sampling),
@@ -53,13 +55,19 @@ func OpenEventPebble(storageDir string, logger *logp.Logger) (*pebble.DB, error)
5355
},
5456
},
5557
Comparer: eventComparer(),
58+
Cache: cache,
59+
MaxConcurrentCompactions: func() int {
60+
return 2
61+
}, // Better utilizes CPU on larger instances
5662
}
5763
return pebble.Open(filepath.Join(storageDir, "event"), opts)
5864
}
5965

60-
func OpenDecisionPebble(storageDir string, logger *logp.Logger) (*pebble.DB, error) {
66+
func OpenDecisionPebble(storageDir string, cacheSize uint64, logger *logp.Logger) (*pebble.DB, error) {
6167
// Option values are picked and validated in https://github.com/elastic/apm-server/issues/15568
62-
return pebble.Open(filepath.Join(storageDir, "decision"), &pebble.Options{
68+
cache := pebble.NewCache(int64(cacheSize))
69+
defer cache.Unref()
70+
opts := &pebble.Options{
6371
FormatMajorVersion: pebble.FormatColumnarBlocks,
6472
Logger: logger.Named(logs.Sampling),
6573
MemTableSize: 2 << 20, // big memtables are slow to scan, and significantly slow the hot path
@@ -71,5 +79,10 @@ func OpenDecisionPebble(storageDir string, logger *logp.Logger) (*pebble.DB, err
7179
FilterType: pebble.TableFilter,
7280
},
7381
},
74-
})
82+
Cache: cache,
83+
MaxConcurrentCompactions: func() int {
84+
return 2
85+
}, // Better utilizes CPU on larger instances
86+
}
87+
return pebble.Open(filepath.Join(storageDir, "decision"), opts)
7588
}

x-pack/apm-server/sampling/eventstorage/prefix_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ import (
1919
)
2020

2121
func newEventPebble(t *testing.T) *pebble.DB {
22-
db, err := eventstorage.OpenEventPebble(t.TempDir(), logptest.NewTestingLogger(t, ""))
22+
db, err := eventstorage.OpenEventPebble(t.TempDir(), 8<<20, logptest.NewTestingLogger(t, ""))
2323
require.NoError(t, err)
2424
t.Cleanup(func() {
2525
db.Close()
@@ -28,7 +28,7 @@ func newEventPebble(t *testing.T) *pebble.DB {
2828
}
2929

3030
func newDecisionPebble(t *testing.T) *pebble.DB {
31-
db, err := eventstorage.OpenDecisionPebble(t.TempDir(), logptest.NewTestingLogger(t, ""))
31+
db, err := eventstorage.OpenDecisionPebble(t.TempDir(), 8<<20, logptest.NewTestingLogger(t, ""))
3232
require.NoError(t, err)
3333
t.Cleanup(func() {
3434
db.Close()

x-pack/apm-server/sampling/eventstorage/storage_manager.go

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,13 @@ func WithGetDiskUsage(getDiskUsage func() (DiskUsage, error)) StorageManagerOpti
8888
}
8989
}
9090

91+
// WithDBCacheSize sets the total size in bytes of the in-memory cache for all databases managed by StorageManager.
92+
func WithDBCacheSize(size uint64) StorageManagerOptions {
93+
return func(sm *StorageManager) {
94+
sm.dbCacheSize = size
95+
}
96+
}
97+
9198
// DiskUsage is the struct returned by getDiskUsage.
9299
type DiskUsage struct {
93100
UsedBytes, TotalBytes uint64
@@ -96,8 +103,9 @@ type DiskUsage struct {
96103
// StorageManager encapsulates pebble.DB.
97104
// It assumes exclusive access to pebble DB at storageDir.
98105
type StorageManager struct {
99-
storageDir string
100-
logger *logp.Logger
106+
storageDir string
107+
dbCacheSize uint64
108+
logger *logp.Logger
101109

102110
eventDB *pebble.DB
103111
decisionDB *pebble.DB
@@ -155,6 +163,7 @@ func NewStorageManager(storageDir string, logger *logp.Logger, opts ...StorageMa
155163
TotalBytes: usage.TotalBytes,
156164
}, err
157165
},
166+
dbCacheSize: 16 << 20, // default to 16MB cache shared between event and decision DB
158167
}
159168
sm.getDBSize = func() uint64 {
160169
return sm.eventDB.Metrics().DiskSpaceUsage() + sm.decisionDB.Metrics().DiskSpaceUsage()
@@ -179,13 +188,14 @@ func NewStorageManager(storageDir string, logger *logp.Logger, opts ...StorageMa
179188

180189
// reset initializes db and storage.
181190
func (sm *StorageManager) reset() error {
182-
eventDB, err := OpenEventPebble(sm.storageDir, sm.logger)
191+
// Configured db cache size is split between event DB and decision DB
192+
eventDB, err := OpenEventPebble(sm.storageDir, sm.dbCacheSize/2, sm.logger)
183193
if err != nil {
184194
return fmt.Errorf("open event db error: %w", err)
185195
}
186196
sm.eventDB = eventDB
187197

188-
decisionDB, err := OpenDecisionPebble(sm.storageDir, sm.logger)
198+
decisionDB, err := OpenDecisionPebble(sm.storageDir, sm.dbCacheSize/2, sm.logger)
189199
if err != nil {
190200
return fmt.Errorf("open decision db error: %w", err)
191201
}

0 commit comments

Comments
 (0)