Commit 49bbad9

[TT-1993] Reduces Splunk Upload Size for Flakeguard (#1625)
1 parent 6b6dd54 commit 49bbad9

12 files changed: +16456 additions, -1206 deletions

.github/workflows/framework-golden-tests.yml

Lines changed: 1 addition & 1 deletion
@@ -86,7 +86,7 @@ jobs:
         with:
           go-version: 1.23
       - name: Cache Go modules
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
          path: |
            ~/.cache/go-build

tools/flakeguard/cmd/aggregate_results.go

Lines changed: 44 additions & 1 deletion
@@ -35,6 +35,12 @@ var AggregateResultsCmd = &cobra.Command{
 		splunkToken, _ := cmd.Flags().GetString("splunk-token")
 		splunkEvent, _ := cmd.Flags().GetString("splunk-event")
 
+		initialDirSize, err := getDirSize(resultsPath)
+		if err != nil {
+			log.Error().Err(err).Str("path", resultsPath).Msg("Error getting initial directory size")
+			// intentionally don't exit here, as we can still proceed with the aggregation
+		}
+
 		// Ensure the output directory exists
 		if err := fs.MkdirAll(outputDir, 0755); err != nil {
 			log.Error().Err(err).Str("path", outputDir).Msg("Error creating output directory")
@@ -160,7 +166,14 @@ var AggregateResultsCmd = &cobra.Command{
 			log.Error().Stack().Err(err).Msg("Error saving aggregated test report")
 			os.Exit(ErrorExitCode)
 		}
-		log.Info().Str("report", aggregatedReportPath).Msg("Aggregation complete")
+
+		finalDirSize, err := getDirSize(resultsPath)
+		if err != nil {
+			log.Error().Err(err).Str("path", resultsPath).Msg("Error getting final directory size")
+			// intentionally don't exit here, as we can still proceed with the aggregation
+		}
+		diskSpaceUsed := byteCountSI(finalDirSize - initialDirSize)
+		log.Info().Str("disk space used", diskSpaceUsed).Str("report", aggregatedReportPath).Msg("Aggregation complete")
 	},
 }
 
@@ -185,3 +198,33 @@ func init() {
 		log.Fatal().Err(err).Msg("Error marking flag as required")
 	}
 }
+
+// getDirSize returns the size of a directory in bytes
+// helpful for tracking how much data is being produced on disk
+func getDirSize(path string) (int64, error) {
+	var size int64
+	err := filepath.Walk(path, func(_ string, info os.FileInfo, err error) error {
+		if err != nil {
+			return err
+		}
+		if !info.IsDir() {
+			size += info.Size()
+		}
+		return nil
+	})
+	return size, err
+}
+
+// byteCountSI returns a human-readable byte count (decimal SI units)
+func byteCountSI(b int64) string {
+	const unit = 1000
+	if b < unit {
+		return fmt.Sprintf("%d B", b)
+	}
+	div, exp := int64(unit), 0
+	for n := b / unit; n >= unit; n /= unit {
+		div *= unit
+		exp++
+	}
+	return fmt.Sprintf("%.1f %cB", float64(b)/float64(div), "kMGTPE"[exp])
+}
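
A quick usage note on the two helpers added above: together they give a cheap before/after measurement of disk usage around any step that writes files. A minimal sketch, assuming it lives in the same package as getDirSize and byteCountSI; measureDiskDelta and doWork are hypothetical names, not part of this commit:

// measureDiskDelta runs doWork and reports how much extra disk space it left
// behind under dir, formatted by byteCountSI. The commands in this commit inline
// the same pattern and log (rather than return) the formatted delta.
func measureDiskDelta(dir string, doWork func() error) (string, error) {
	before, err := getDirSize(dir)
	if err != nil {
		return "", err
	}
	if err := doWork(); err != nil {
		return "", err
	}
	after, err := getDirSize(dir)
	if err != nil {
		return "", err
	}
	return byteCountSI(after - before), nil
}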

tools/flakeguard/cmd/generate_report.go

Lines changed: 13 additions & 1 deletion
@@ -34,6 +34,12 @@ var GenerateReportCmd = &cobra.Command{
 		githubRunID, _ := cmd.Flags().GetInt64("github-run-id")
 		artifactName, _ := cmd.Flags().GetString("failed-tests-artifact-name")
 
+		initialDirSize, err := getDirSize(outputDir)
+		if err != nil {
+			log.Error().Err(err).Str("path", outputDir).Msg("Error getting initial directory size")
+			// intentionally don't exit here, as we can still proceed with the generation
+		}
+
 		// Get the GitHub token from environment variable
 		githubToken := os.Getenv("GITHUB_TOKEN")
 		if githubToken == "" {
@@ -160,7 +166,13 @@ var GenerateReportCmd = &cobra.Command{
 			log.Info().Msg("PR comment markdown generated successfully")
 		}
 
-		log.Info().Str("output", outputDir).Msg("Reports generated successfully")
+		finalDirSize, err := getDirSize(outputDir)
+		if err != nil {
+			log.Error().Err(err).Str("path", outputDir).Msg("Error getting initial directory size")
+			// intentionally don't exit here, as we can still proceed with the generation
+		}
+		diskSpaceUsed := byteCountSI(finalDirSize - initialDirSize)
+		log.Info().Str("disk space used", diskSpaceUsed).Str("output", outputDir).Msg("Reports generated successfully")
 	},
 }
 
tools/flakeguard/cmd/run.go

Lines changed: 17 additions & 2 deletions
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"os"
 	"os/exec"
+	"path/filepath"
 
 	"github.com/rs/zerolog/log"
 	"github.com/smartcontractkit/chainlink-testing-framework/tools/flakeguard/reports"
@@ -40,6 +41,13 @@ var RunTestsCmd = &cobra.Command{
 		shuffleSeed, _ := cmd.Flags().GetString("shuffle-seed")
 		omitOutputsOnSuccess, _ := cmd.Flags().GetBool("omit-test-outputs-on-success")
 
+		outputDir := filepath.Dir(outputPath)
+		initialDirSize, err := getDirSize(outputDir)
+		if err != nil {
+			log.Error().Err(err).Str("path", outputDir).Msg("Error getting initial directory size")
+			// intentionally don't exit here, as we can still proceed with the run
+		}
+
 		if maxPassRatio < 0 || maxPassRatio > 1 {
 			log.Error().Float64("max pass ratio", maxPassRatio).Msg("Error: max pass ratio must be between 0 and 1")
 			os.Exit(ErrorExitCode)
@@ -113,10 +121,17 @@
 			return !tr.Skipped && tr.PassRatio < maxPassRatio
 		})
 
+		finalDirSize, err := getDirSize(outputDir)
+		if err != nil {
+			log.Error().Err(err).Str("path", outputDir).Msg("Error getting initial directory size")
+			// intentionally don't exit here, as we can still proceed with the run
+		}
+		diskSpaceUsed := byteCountSI(finalDirSize - initialDirSize)
+
 		if len(flakyTests) > 0 {
-			log.Info().Int("count", len(flakyTests)).Str("pass ratio threshold", fmt.Sprintf("%.2f%%", maxPassRatio*100)).Msg("Found flaky tests")
+			log.Info().Str("disk space used", diskSpaceUsed).Int("count", len(flakyTests)).Str("pass ratio threshold", fmt.Sprintf("%.2f%%", maxPassRatio*100)).Msg("Found flaky tests")
 		} else {
-			log.Info().Msg("No flaky tests found")
+			log.Info().Str("disk space used", diskSpaceUsed).Msg("No flaky tests found")
 		}
 
 		fmt.Printf("\nFlakeguard Summary\n")

tools/flakeguard/reports/io.go

Lines changed: 54 additions & 27 deletions
@@ -3,13 +3,15 @@ package reports
 import (
 	"bufio"
 	"bytes"
+	"encoding/binary"
 	"encoding/json"
 	"errors"
 	"fmt"
 	"io"
 	"os"
 	"path/filepath"
 	"strings"
+	"sync"
 	"time"
 
 	"github.com/go-resty/resty/v2"
@@ -519,31 +521,28 @@ func sendDataToSplunk(opts *aggregateOptions, report TestReport) error {
 		SetHeader("Content-Type", "application/json").
 		SetLogger(ZerologRestyLogger{})
 
-	log.Debug().Str("report id", report.ID).Int("results", len(report.Results)).Msg("Sending aggregated data to Splunk")
+	log.Debug().Str("report id", report.ID).Int("results", len(results)).Msg("Sending aggregated data to Splunk")
+
+	const (
+		resultsBatchSize             = 10
+		splunkSizeLimitBytes         = 100_000_000 // 100MB. Actual limit is over 800MB, but that's excessive
+		exampleSplunkReportFileName  = "example_results/example_splunk_report.json"
+		exampleSplunkResultsFileName = "example_results/example_splunk_results_batch_%d.json"
+	)
 
 	var (
 		splunkErrs            = []error{}
-		resultsBatchSize      = 10
 		resultsBatch          = []SplunkTestResult{}
 		successfulResultsSent = 0
+		batchNum              = 1
 	)
 
-	var (
-		exampleSplunkResultsFileName = "example_results/example_splunk_results.json"
-		exampleSplunkResultsFile     *os.File
-		exampleSplunkReportFileName  = "example_results/example_splunk_report.json"
-		exampleSplunkReportFile      *os.File
-		err                          error
-	)
-
-	if isExampleRun {
-		exampleSplunkResultsFile, err = os.Create(exampleSplunkResultsFileName)
-		if err != nil {
-			return fmt.Errorf("error creating example Splunk results file: %w", err)
-		}
-		defer exampleSplunkResultsFile.Close()
-	}
 	for resultCount, result := range results {
+		// No need to send log outputs to Splunk
+		result.FailedOutputs = nil
+		result.PassedOutputs = nil
+		result.PackageOutputs = nil
+
 		resultsBatch = append(resultsBatch, SplunkTestResult{
 			Event: SplunkTestResultEvent{
@@ -554,13 +553,21 @@ func sendDataToSplunk(opts *aggregateOptions, report TestReport) error {
 			Index: SplunkIndex,
 		})
 
-		if len(resultsBatch) >= resultsBatchSize || resultCount == len(results)-1 {
+		if len(resultsBatch) >= resultsBatchSize ||
+			resultCount == len(results)-1 ||
+			binary.Size(resultsBatch) >= splunkSizeLimitBytes {
+
 			batchData, testNames, err := batchSplunkResults(resultsBatch)
 			if err != nil {
 				return fmt.Errorf("error batching results: %w", err)
 			}
 
 			if isExampleRun {
+				exampleSplunkResultsFileName := fmt.Sprintf(exampleSplunkResultsFileName, batchNum)
+				exampleSplunkResultsFile, err := os.Create(exampleSplunkResultsFileName)
+				if err != nil {
+					return fmt.Errorf("error creating example Splunk results file: %w", err)
+				}
 				for _, result := range resultsBatch {
 					jsonResult, err := json.Marshal(result)
 					if err != nil {
@@ -571,6 +578,10 @@ func sendDataToSplunk(opts *aggregateOptions, report TestReport) error {
 						return fmt.Errorf("error writing result for '%s' to file: %w", result.Event.Data.TestName, err)
 					}
 				}
+				err = exampleSplunkResultsFile.Close()
+				if err != nil {
+					return fmt.Errorf("error closing example Splunk results file: %w", err)
+				}
 			} else {
 				resp, err := client.R().SetBody(batchData.String()).Post("")
 				if err != nil {
@@ -588,11 +599,12 @@ func sendDataToSplunk(opts *aggregateOptions, report TestReport) error {
 				}
 			}
 			resultsBatch = []SplunkTestResult{}
+			batchNum++
 		}
 	}
 
 	if isExampleRun {
-		log.Info().Msgf("Example Run. See '%s' for the results that would be sent to splunk", exampleSplunkResultsFileName)
+		log.Info().Msg("Example Run. See 'example_results/splunk_results' for the results that would be sent to splunk")
 	}
 
 	reportData := SplunkTestReport{
@@ -607,7 +619,7 @@ func sendDataToSplunk(opts *aggregateOptions, report TestReport) error {
 	}
 
 	if isExampleRun {
-		exampleSplunkReportFile, err = os.Create(exampleSplunkReportFileName)
+		exampleSplunkReportFile, err := os.Create(exampleSplunkReportFileName)
 		if err != nil {
 			return fmt.Errorf("error creating example Splunk report file: %w", err)
 		}
@@ -642,6 +654,7 @@ func sendDataToSplunk(opts *aggregateOptions, report TestReport) error {
 	log.Debug().
 		Int("successfully sent", successfulResultsSent).
 		Int("total results", len(results)).
+		Int("result batches", batchNum).
 		Str("duration", time.Since(start).String()).
 		Str("report id", report.ID).
 		Msg("All results sent successfully to Splunk")
@@ -672,20 +685,34 @@ func batchSplunkResults(results []SplunkTestResult) (batchData bytes.Buffer, res
 
 // unBatchSplunkResults un-batches a batch of TestResult objects into a slice of TestResult objects
 func unBatchSplunkResults(batch []byte) ([]*SplunkTestResult, error) {
-	var results []*SplunkTestResult
-	scanner := bufio.NewScanner(bufio.NewReader(bytes.NewReader(batch)))
+	results := make([]*SplunkTestResult, 0, bytes.Count(batch, []byte{'\n'}))
+	scanner := bufio.NewScanner(bytes.NewReader(batch))
+
+	maxCapacity := 1024 * 1024 // 1 MB
+	buf := make([]byte, maxCapacity)
+	scanner.Buffer(buf, maxCapacity)
+
+	var pool sync.Pool
+	pool.New = func() interface{} { return new(SplunkTestResult) }
+
 	for scanner.Scan() {
-		line := scanner.Text()
-		if strings.TrimSpace(line) == "" {
+		line := scanner.Bytes()
+		if len(bytes.TrimSpace(line)) == 0 {
 			continue // Skip empty lines
 		}
-		var result *SplunkTestResult
-		if err := json.Unmarshal([]byte(line), &result); err != nil {
+
+		result := pool.Get().(*SplunkTestResult)
+		if err := json.Unmarshal(line, result); err != nil {
 			return results, fmt.Errorf("error unmarshaling result: %w", err)
 		}
		results = append(results, result)
 	}
-	return results, scanner.Err()
+
+	if err := scanner.Err(); err != nil {
+		return results, fmt.Errorf("error scanning: %w", err)
+	}
+
+	return results, nil
 }
 
 // aggregateReports aggregates multiple TestReport objects into a single TestReport
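
For readers skimming the io.go change: the upload-size reduction comes from clearing per-test log output before batching and flushing a batch once it is full, is the last one, or grows past a byte limit. Below is a minimal, hedged sketch of that flush-on-threshold loop; the record type, flushBatch, and the JSON-length size check (standing in for the commit's binary.Size call) are illustrative placeholders, not the actual flakeguard API:

package main

import (
	"encoding/json"
	"fmt"
)

// record is a simplified stand-in for SplunkTestResult.
type record struct {
	TestName  string  `json:"test_name"`
	PassRatio float64 `json:"pass_ratio"`
}

const (
	maxBatchLen   = 10        // flush after this many records
	maxBatchBytes = 1_000_000 // or once the encoded payload grows past this
)

// flushBatch stands in for posting a batch of newline-delimited JSON events to Splunk.
func flushBatch(batch []record) error {
	fmt.Printf("flushing %d records\n", len(batch))
	return nil
}

func sendAll(results []record) error {
	var (
		batch      []record
		batchBytes int
	)
	for i, r := range results {
		line, err := json.Marshal(r)
		if err != nil {
			return err
		}
		batch = append(batch, r)
		batchBytes += len(line) + 1 // +1 for the newline separating batched events

		if len(batch) >= maxBatchLen || batchBytes >= maxBatchBytes || i == len(results)-1 {
			if err := flushBatch(batch); err != nil {
				return err
			}
			batch, batchBytes = nil, 0
		}
	}
	return nil
}

func main() {
	results := make([]record, 25)
	for i := range results {
		results[i] = record{TestName: fmt.Sprintf("TestExample%d", i), PassRatio: 1}
	}
	_ = sendAll(results) // flushes three batches: 10, 10, and 5 records
}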

tools/flakeguard/reports/io_test.go

Lines changed: 4 additions & 8 deletions
@@ -17,9 +17,9 @@ const (
 	splunkToken   = "test-token"
 	splunkEvent   = "test"
 	reportID      = "123"
-	totalTestRuns = 255
+	totalTestRuns = 270
 	testRunCount  = 15
-	uniqueTests   = 18
+	uniqueTests   = 19
 )
 
 func TestAggregateResultFilesSplunk(t *testing.T) {
@@ -140,9 +140,7 @@ func BenchmarkAggregateResultFiles(b *testing.B) {
 	zerolog.SetGlobalLevel(zerolog.Disabled)
 	for i := 0; i < b.N; i++ {
 		_, err := LoadAndAggregate("./testdata", WithReportID(reportID))
-		if err != nil {
-			b.Fatalf("LoadAndAggregate failed: %v", err)
-		}
+		require.NoError(b, err, "LoadAndAggregate failed")
 	}
 }
 
@@ -156,8 +154,6 @@ func BenchmarkAggregateResultFilesSplunk(b *testing.B) {
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
 		_, err := LoadAndAggregate("./testdata", WithReportID(reportID), WithSplunk(srv.URL, splunkToken, "test"))
-		if err != nil {
-			b.Fatalf("LoadAndAggregate failed: %v", err)
-		}
+		require.NoError(b, err, "LoadAndAggregate failed")
 	}
 }
