Skip to content

Commit eae8d71

Browse files
[extension/storage/filestorage] Fix recreate from panic (#41802)
<!--Ex. Fixing a bug - Describe the bug and how this fixes the issue. Ex. Adding a feature - Explain what this achieves.--> #### Description * This enables recovery from a panic when the bbolt db is corrupted and renames the file when a panic occurs. * This changes the `recreate` behavior to not rename the file upon every start of the collector. <!-- Issue number (e.g. #1234) or full URL to issue, if applicable. --> #### Link to tracking issue Resolves #36840 Resolves #35899 <!--Describe what testing was performed and which tests were added.--> #### Testing 1. Start collector with file extension configured. This should create the file with some content within it. 2. Stop the collector. 3. Manually edit or "break" the bbolt db file by adding characters in random places, forcefully causing a panic 4. Start the collector 5. See logs: ```json {"level":"info","ts":"2025-07-31T23:26:34.218-0400","msg":"Setting up own telemetry...","resource":{"service.instance.id":"d830aeff-7993-49f7-9817-a0c96af3498d","service.name":"otelcontribcol","service.version":"0.131.0-dev"}} {"level":"info","ts":"2025-07-31T23:26:34.242-0400","msg":"Starting otelcontribcol...","resource":{"service.instance.id":"d830aeff-7993-49f7-9817-a0c96af3498d","service.name":"otelcontribcol","service.version":"0.131.0-dev"},"Version":"0.131.0-dev","NumCPU":6} {"level":"info","ts":"2025-07-31T23:26:34.242-0400","msg":"Starting extensions...","resource":{"service.instance.id":"d830aeff-7993-49f7-9817-a0c96af3498d","service.name":"otelcontribcol","service.version":"0.131.0-dev"}} {"level":"info","ts":"2025-07-31T23:26:34.242-0400","msg":"Extension is starting...","resource":{"service.instance.id":"d830aeff-7993-49f7-9817-a0c96af3498d","service.name":"otelcontribcol","service.version":"0.131.0-dev"},"otelcol.component.id":"file_storage/persistent_queue_storage","otelcol.component.kind":"extension"} {"level":"info","ts":"2025-07-31T23:26:34.242-0400","msg":"Extension 
started.","resource":{"service.instance.id":"d830aeff-7993-49f7-9817-a0c96af3498d","service.name":"otelcontribcol","service.version":"0.131.0-dev"},"otelcol.component.id":"file_storage/persistent_queue_storage","otelcol.component.kind":"extension"} #### This line #### {"level":"warn","ts":"2025-07-31T23:26:34.242-0400","msg":"Database corruption detected, recreating database file","resource":{"service.instance.id":"d830aeff-7993-49f7-9817-a0c96af3498d","service.name":"otelcontribcol","service.version":"0.131.0-dev"},"otelcol.component.id":"file_storage/persistent_queue_storage","otelcol.component.kind":"extension","file":"/data/otelcol/persistent_queue_storage/exporter_otlphttp_general_logs","panic":"assertion failed: Page expected to be: 96, but self identifies as 4909050520039286100"} {"level":"info","ts":"2025-07-31T23:26:34.243-0400","msg":"Corrupted database file renamed","resource":{"service.instance.id":"d830aeff-7993-49f7-9817-a0c96af3498d","service.name":"otelcontribcol","service.version":"0.131.0-dev"},"otelcol.component.id":"file_storage/persistent_queue_storage","otelcol.component.kind":"extension","original":"/data/otelcol/persistent_queue_storage/exporter_otlphttp_general_logs","backup":"/data/otelcol/persistent_queue_storage/exporter_otlphttp_general_logs.2025-08-27T13:26:07.479.backup"} #### through this line #### {"level":"info","ts":"2025-07-31T23:26:34.244-0400","msg":"New queue metadata key not found, attempting to load legacy format.","resource":{"service.instance.id":"d830aeff-7993-49f7-9817-a0c96af3498d","service.name":"otelcontribcol","service.version":"0.131.0-dev"},"otelcol.component.id":"otlphttp/general","otelcol.component.kind":"exporter","otelcol.signal":"logs"} {"level":"info","ts":"2025-07-31T23:26:34.244-0400","msg":"Initializing new persistent 
queue","resource":{"service.instance.id":"d830aeff-7993-49f7-9817-a0c96af3498d","service.name":"otelcontribcol","service.version":"0.131.0-dev"},"otelcol.component.id":"otlphttp/general","otelcol.component.kind":"exporter","otelcol.signal":"logs"} {"level":"info","ts":"2025-07-31T23:26:34.244-0400","msg":"Successfully migrated to consolidated metadata format","resource":{"service.instance.id":"d830aeff-7993-49f7-9817-a0c96af3498d","service.name":"otelcontribcol","service.version":"0.131.0-dev"},"otelcol.component.id":"otlphttp/general","otelcol.component.kind":"exporter","otelcol.signal":"logs"} {"level":"info","ts":"2025-07-31T23:26:34.255-0400","msg":"Everything is ready. Begin running and processing data.","resource":{"service.instance.id":"d830aeff-7993-49f7-9817-a0c96af3498d","service.name":"otelcontribcol","service.version":"0.131.0-dev"}} ``` <!--Describe the documentation added.--> #### Documentation Updated the existing documentation with behavioral changes for `recreate` option. <!--Please delete paragraphs that you did not use before submitting.--> --------- Co-authored-by: Antoine Toulme <[email protected]>
1 parent 35f8fa4 commit eae8d71

File tree

4 files changed

+83
-11
lines changed

4 files changed

+83
-11
lines changed
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Use this changelog template to create an entry for release notes.
2+
3+
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
4+
change_type: "bug_fix"
5+
6+
# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
7+
component: "extension/storage"
8+
9+
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
10+
note: "Fix 'recreate' to rename the database file only when a panic occurs"
11+
12+
# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
13+
issues: [41802]
14+
15+
# (Optional) One or more lines of additional information to render under the primary note.
16+
# These lines will be padded with 2 spaces and then inserted directly into the document.
17+
# Use pipe (|) for multiline entries.
18+
subtext: |
19+
* This recovers from a panic when the bbolt db is corrupted and renames the file when a panic occurs.
20+
* This changes the `recreate` behavior to not rename the file upon every start of the collector.
21+
22+
# If your change doesn't affect end users or the exported elements of any package,
23+
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
24+
# Optional: The change log or logs in which this entry should be included.
25+
# e.g. '[user]' or '[user, api]'
26+
# Include 'user' if the change is relevant to end users.
27+
# Include 'api' if there is a change to a library API.
28+
# Default: '[user]'
29+
change_logs: [user]

extension/storage/filestorage/README.md

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,14 @@ The default timeout is `1s`.
3131
By default, the directories will be created with `0750 (rwxr-x---)` permissions, minus the process umask.
3232
Use `directory_permissions` to customize directory creation permissions, minus the process umask.
3333

34-
`recreate` when set, will rename the existing data storage to `{filename}.backup` and a new data file will be created from scratch. This option is useful if underlying database is corrupted and as a result, it can halt the entire collector process due to a panic. See (#36840)[https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/36840] for more details.
34+
When `recreate` is set, the filestorage extension will automatically rename the corrupted bbolt database and create a new one when certain bbolt panics occur.
35+
See [#35899](https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/35899) for more details.
36+
37+
If the database fails to open due to corruption (resulting in a panic), the corrupted file will be automatically renamed to `{filename}.{ISO 8601 timestamp}.backup` and a new data file will be created from scratch. This allows the collector to continue operating even when encountering certain bbolt panics. If no corruption is detected, the existing database continues to be used normally.
38+
There may still be scenarios where manually removing or renaming the file is required; this option is not a panacea for every bbolt panic you can encounter.
3539

3640
> [!Note]
37-
> Enabling `recreate` will regenerate the database files, which may lead to data duplication or data loss.
41+
> When database corruption is detected and automatic recovery is triggered, the corrupted data will be moved to a `.backup` file. While this prevents complete data loss, the collector will start with a fresh database, which may lead to data duplication or loss of component state.
3842
3943
## Compaction
4044
`compaction` defines how and when files should be compacted. There are two modes of compaction available (both of which can be set concurrently):
@@ -88,6 +92,7 @@ extensions:
8892
file_storage/all_settings:
8993
directory: /var/lib/otelcol/mydir
9094
timeout: 1s
95+
recreate: true
9196
compaction:
9297
on_start: true
9398
directory: /tmp/

extension/storage/filestorage/extension.go

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"os"
1111
"path/filepath"
1212
"strings"
13+
"time"
1314

1415
"go.opentelemetry.io/collector/component"
1516
"go.opentelemetry.io/collector/extension"
@@ -71,12 +72,9 @@ func (lfs *localFileStorage) GetClient(_ context.Context, kind component.Kind, e
7172

7273
rawName = sanitize(rawName)
7374
absoluteName := filepath.Join(lfs.cfg.Directory, rawName)
74-
if lfs.cfg.Recreate {
75-
if err := os.Rename(absoluteName, absoluteName+".backup"); err != nil {
76-
return nil, fmt.Errorf("error renaming the database. Please remove %s manually: %w", absoluteName, err)
77-
}
78-
}
79-
client, err := newClient(lfs.logger, absoluteName, lfs.cfg.Timeout, lfs.cfg.Compaction, !lfs.cfg.FSync)
75+
76+
// Try to create client, handling panics if recreate is enabled
77+
client, err := lfs.createClientWithPanicRecovery(absoluteName)
8078
if err != nil {
8179
return nil, err
8280
}
@@ -92,6 +90,45 @@ func (lfs *localFileStorage) GetClient(_ context.Context, kind component.Kind, e
9290
return client, nil
9391
}
9492

93+
// createClientWithPanicRecovery attempts to create a client, and if recreate is enabled
94+
// and a panic occurs (typically due to database corruption), it will rename the file
95+
// and try again with a fresh database
96+
func (lfs *localFileStorage) createClientWithPanicRecovery(absoluteName string) (client *fileStorageClient, err error) {
97+
// First attempt: try to create client normally
98+
if !lfs.cfg.Recreate {
99+
// If recreate is disabled, just try once
100+
return newClient(lfs.logger, absoluteName, lfs.cfg.Timeout, lfs.cfg.Compaction, !lfs.cfg.FSync)
101+
}
102+
103+
// If recreate is enabled, handle potential panics during database opening
104+
defer func() {
105+
if r := recover(); r != nil {
106+
lfs.logger.Warn("Database corruption detected, recreating database file",
107+
zap.String("file", absoluteName),
108+
zap.Any("panic", r))
109+
110+
// Rename the corrupted file with ISO 8601 timestamp
111+
timestamp := time.Now().Format("2006-01-02T15:04:05.000")
112+
backupName := absoluteName + "." + timestamp + ".backup"
113+
if renameErr := os.Rename(absoluteName, backupName); renameErr != nil {
114+
err = fmt.Errorf("error renaming corrupted database. Please remove %s manually: %w", absoluteName, renameErr)
115+
return
116+
}
117+
118+
lfs.logger.Info("Corrupted database file renamed",
119+
zap.String("original", absoluteName),
120+
zap.String("backup", backupName))
121+
122+
// Try to create client again with fresh database
123+
client, err = newClient(lfs.logger, absoluteName, lfs.cfg.Timeout, lfs.cfg.Compaction, !lfs.cfg.FSync)
124+
}
125+
}()
126+
127+
// Try to create the client normally first
128+
client, err = newClient(lfs.logger, absoluteName, lfs.cfg.Timeout, lfs.cfg.Compaction, !lfs.cfg.FSync)
129+
return client, err
130+
}
131+
95132
func kindString(k component.Kind) string {
96133
switch k {
97134
case component.KindReceiver:

extension/storage/filestorage/extension_test.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -666,7 +666,8 @@ func TestRecreate(t *testing.T) {
666666
require.NoError(t, ext.Shutdown(ctx))
667667
}
668668

669-
// step 3: re-create the extension, but with Recreate=true and make sure that the data is not preset
669+
// step 3: re-create the extension, but with Recreate=true and make sure that the data still exists
670+
// (since recreate now only happens on panic, not always when recreate=true)
670671
{
671672
config.Recreate = true
672673
ext, err := f.Create(ctx, extensiontest.NewNopSettings(f.Type()), config)
@@ -679,9 +680,9 @@ func TestRecreate(t *testing.T) {
679680
require.NoError(t, err)
680681
require.NotNil(t, client)
681682

682-
// The data shouldn't exist.
683+
// The data should still exist since no panic occurred
683684
val, err := client.Get(ctx, "key")
684-
require.Nil(t, val)
685+
require.Equal(t, val, []byte("val"))
685686
require.NoError(t, err)
686687

687688
// close the extension

0 commit comments

Comments
 (0)