diff --git a/pkg/handlers/default.go b/pkg/handlers/default.go
index e18511b011d8..381ee8dd58e1 100644
--- a/pkg/handlers/default.go
+++ b/pkg/handlers/default.go
@@ -1,6 +1,7 @@
 package handlers
 
 import (
+	"bytes"
 	"context"
 	"errors"
 	"fmt"
@@ -17,14 +18,33 @@ import (
 // once it has been extracted or decompressed by the specific handler.
 // This allows the specialized handlers to focus on their specific archive formats while leveraging
 // the common functionality provided by the defaultHandler for processing the extracted content.
-type defaultHandler struct{ metrics *metrics }
+type defaultHandler struct {
+	chunkReader sources.ChunkReader
+	metrics     *metrics
+}
+
+// defaultHandlerOption is a functional option for configuring a defaultHandler.
+type defaultHandlerOption func(*defaultHandler)
+
+// withChunkReader sets a custom ChunkReader for the handler.
+// This is primarily used for testing to inject mock chunk readers.
+func withChunkReader(cr sources.ChunkReader) defaultHandlerOption {
+	return func(h *defaultHandler) { h.chunkReader = cr }
+}
 
 // newDefaultHandler creates a defaultHandler with metrics configured based on the provided handlerType.
 // The handlerType parameter is used to initialize the metrics instance with the appropriate handler type,
 // ensuring that the metrics recorded within the defaultHandler methods are correctly attributed to the
 // specific handler that invoked them.
-func newDefaultHandler(handlerType handlerType) *defaultHandler {
-	return &defaultHandler{metrics: newHandlerMetrics(handlerType)}
+func newDefaultHandler(handlerType handlerType, opts ...defaultHandlerOption) *defaultHandler {
+	h := &defaultHandler{
+		chunkReader: sources.NewChunkReader(),
+		metrics:     newHandlerMetrics(handlerType),
+	}
+	for _, opt := range opts {
+		opt(h)
+	}
+	return h
 }
 
 // HandleFile processes non-archive files.
@@ -91,6 +111,7 @@ func (h *defaultHandler) measureLatencyAndHandleErrors(
 // on the type, particularly for binary files. It manages reading file chunks and writing them to the archive channel,
 // effectively collecting the final bytes for further processing. This function is a key component in ensuring that all
 // file content, regardless of being an archive or not, is handled appropriately.
+// It also tracks line numbers for each chunk to enable accurate line number reporting for secrets.
 func (h *defaultHandler) handleNonArchiveContent(
 	ctx logContext.Context,
 	reader mimeTypeReader,
@@ -106,10 +127,12 @@ func (h *defaultHandler) handleNonArchiveContent(
 		return nil
 	}
 
-	chunkReader := sources.NewChunkReader()
-	for data := range chunkReader(ctx, reader) {
+	// Track the current line number (1-indexed) across chunks.
+	// This allows accurate line number reporting for secrets found in later chunks.
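+	// For example, if the first chunk's content portion contains 10 newlines,
+	// the chunk that follows it is reported as starting at line 11.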
+	currentLine := int64(1)
+	for chunkResult := range h.chunkReader(ctx, reader) {
 		dataOrErr := DataOrErr{}
-		if err := data.Error(); err != nil {
+		if err := chunkResult.Error(); err != nil {
 			h.metrics.incErrors()
 			dataOrErr.Err = fmt.Errorf("%w: error reading chunk: %v", ErrProcessingWarning, err)
 			if writeErr := common.CancellableWrite(ctx, dataOrErrChan, dataOrErr); writeErr != nil {
@@ -118,11 +141,18 @@ func (h *defaultHandler) handleNonArchiveContent(
 			continue
 		}
 
-		dataOrErr.Data = data.Bytes()
+		chunkBytes := chunkResult.Bytes()
+		dataOrErr.Data = chunkBytes
+		dataOrErr.LineNumber = currentLine
 		if err := common.CancellableWrite(ctx, dataOrErrChan, dataOrErr); err != nil {
 			return err
 		}
-		h.metrics.incBytesProcessed(len(data.Bytes()))
+		h.metrics.incBytesProcessed(len(chunkBytes))
+
+		// Count newlines in the content portion only (excluding peek data) to avoid
+		// double-counting newlines that will appear at the start of the next chunk.
+		contentSize := chunkResult.ContentSize()
+		currentLine += int64(bytes.Count(chunkBytes[:contentSize], []byte{'\n'}))
 	}
 	return nil
 }
diff --git a/pkg/handlers/default_test.go b/pkg/handlers/default_test.go
index 936a5086d1cf..9220e182aa56 100644
--- a/pkg/handlers/default_test.go
+++ b/pkg/handlers/default_test.go
@@ -2,10 +2,13 @@ package handlers
 
 import (
 	"os"
+	"strings"
 	"testing"
 	"time"
 
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 
 	"github.com/trufflesecurity/trufflehog/v3/pkg/context"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
 )
@@ -34,3 +37,83 @@ func TestHandleNonArchiveFile(t *testing.T) {
 
 	assert.Equal(t, wantChunkCount, count)
 }
+
+// TestHandleFileLineNumbers verifies that line numbers are correctly tracked
+// across multiple chunks when processing files.
+// This is a regression test for https://github.com/trufflesecurity/trufflehog/issues/1876
+func TestHandleFileLineNumbers(t *testing.T) {
+	t.Run("single chunk starts at line 1", func(t *testing.T) {
+		// Create a mock chunk reader with one chunk containing 3 lines.
+		chunks := []sources.ChunkResult{
+			sources.NewChunkResult([]byte("line1\nline2\nline3\n"), 18),
+		}
+
+		handler := newDefaultHandler(defaultHandlerType, withChunkReader(mockChunkReader(chunks)))
+		reader, err := newFileReader(context.Background(), strings.NewReader("ignored"))
+		require.NoError(t, err)
+
+		var results []DataOrErr
+		for dataOrErr := range handler.HandleFile(context.Background(), reader) {
+			results = append(results, dataOrErr)
+		}
+
+		require.Len(t, results, 1)
+		assert.Equal(t, int64(1), results[0].LineNumber, "first chunk should start at line 1")
+	})
+
+	t.Run("multiple chunks track line numbers correctly", func(t *testing.T) {
+		// Create mock chunks with known newline counts.
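+		// Each chunk's expected LineNumber is 1 plus the newlines in all prior chunks' content: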
+		// Chunk 1: 10 lines (contentSize covers all data)
+		// Chunk 2: 5 lines
+		// Chunk 3: 3 lines
+		chunk1Data := []byte(strings.Repeat("line\n", 10)) // 10 newlines
+		chunk2Data := []byte(strings.Repeat("line\n", 5))  // 5 newlines
+		chunk3Data := []byte(strings.Repeat("line\n", 3))  // 3 newlines
+
+		chunks := []sources.ChunkResult{
+			sources.NewChunkResult(chunk1Data, len(chunk1Data)),
+			sources.NewChunkResult(chunk2Data, len(chunk2Data)),
+			sources.NewChunkResult(chunk3Data, len(chunk3Data)),
+		}
+
+		handler := newDefaultHandler(defaultHandlerType, withChunkReader(mockChunkReader(chunks)))
+		reader, err := newFileReader(context.Background(), strings.NewReader("ignored"))
+		require.NoError(t, err)
+
+		var results []DataOrErr
+		for dataOrErr := range handler.HandleFile(context.Background(), reader) {
+			results = append(results, dataOrErr)
+		}
+
+		require.Len(t, results, 3)
+		assert.Equal(t, int64(1), results[0].LineNumber, "chunk 1 should start at line 1")
+		assert.Equal(t, int64(11), results[1].LineNumber, "chunk 2 should start at line 11 (1 + 10)")
+		assert.Equal(t, int64(16), results[2].LineNumber, "chunk 3 should start at line 16 (11 + 5)")
+	})
+
+	t.Run("contentSize excludes peek data from line counting", func(t *testing.T) {
+		// Simulate peek overlap: chunk has 15 lines total but only 10 are content.
+		// The remaining 5 are "peek" data that shouldn't be counted.
+		fullData := []byte(strings.Repeat("line\n", 15))         // 15 newlines in data
+		contentSize := len([]byte(strings.Repeat("line\n", 10))) // Only 10 are content
+
+		chunks := []sources.ChunkResult{
+			sources.NewChunkResult(fullData, contentSize),
+			sources.NewChunkResult([]byte("final\n"), 6),
+		}
+
+		handler := newDefaultHandler(defaultHandlerType, withChunkReader(mockChunkReader(chunks)))
+		reader, err := newFileReader(context.Background(), strings.NewReader("ignored"))
+		require.NoError(t, err)
+
+		var results []DataOrErr
+		for dataOrErr := range handler.HandleFile(context.Background(), reader) {
+			results = append(results, dataOrErr)
+		}
+
+		require.Len(t, results, 2)
+		assert.Equal(t, int64(1), results[0].LineNumber)
+		// Second chunk should start at line 11 (only 10 lines counted from first chunk's content).
+		assert.Equal(t, int64(11), results[1].LineNumber, "peek data should not be counted")
+	})
+}
diff --git a/pkg/handlers/handlers.go b/pkg/handlers/handlers.go
index 64af0419f841..4f98eeef3342 100644
--- a/pkg/handlers/handlers.go
+++ b/pkg/handlers/handlers.go
@@ -11,6 +11,7 @@ import (
 	"github.com/gabriel-vasile/mimetype"
 	"github.com/mholt/archives"
+	"google.golang.org/protobuf/proto"
 
 	logContext "github.com/trufflesecurity/trufflehog/v3/pkg/context"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/feature"
@@ -184,6 +185,8 @@ func newFileReader(ctx context.Context, r io.Reader, options ...readerOption) (f
 type DataOrErr struct {
 	Data []byte
 	Err  error
+	// LineNumber is the 1-indexed starting line of this data within the file.
+	LineNumber int64
 }
 
 // FileHandler represents a handler for files.
@@ -403,6 +406,7 @@ func HandleFile(
 //
 // The function also listens for context cancellation to gracefully terminate processing if the context is done.
 // It returns nil upon successful processing of all data, or the first encountered fatal error.
+// Line numbers from DataOrErr are propagated to the chunk's source metadata for accurate reporting.
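+// A LineNumber of zero is treated as unknown and leaves the metadata untouched
+// (see populateChunkLineNumber below).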
 func handleChunksWithError(
 	ctx logContext.Context,
 	dataErrChan <-chan DataOrErr,
@@ -427,6 +431,8 @@ func handleChunksWithError(
 			if len(dataOrErr.Data) > 0 {
 				chunk := *chunkSkel
 				chunk.Data = dataOrErr.Data
+				populateChunkLineNumber(&chunk, dataOrErr.LineNumber)
+
 				if err := reporter.ChunkOk(ctx, chunk); err != nil {
 					return fmt.Errorf("error reporting chunk: %w", err)
 				}
@@ -437,6 +443,37 @@ func handleChunksWithError(
 	}
 }
 
+// populateChunkLineNumber sets the line number in the chunk's source metadata.
+// It clones the metadata to avoid sharing state between chunks, since chunk
+// skeletons reuse the same metadata pointer. The line number is expected to be
+// 1-indexed. If lineNumber is 0 or the metadata is nil, no changes are made.
+func populateChunkLineNumber(chunk *sources.Chunk, lineNumber int64) {
+	if lineNumber == 0 || chunk.SourceMetadata == nil {
+		return
+	}
+
+	cloned := proto.CloneOf(chunk.SourceMetadata)
+	switch m := cloned.Data.(type) {
+	case *source_metadatapb.MetaData_Filesystem:
+		m.Filesystem.Line = lineNumber
+	case *source_metadatapb.MetaData_AzureRepos:
+		m.AzureRepos.Line = lineNumber
+	case *source_metadatapb.MetaData_Bitbucket:
+		m.Bitbucket.Line = lineNumber
+	case *source_metadatapb.MetaData_Gerrit:
+		m.Gerrit.Line = lineNumber
+	case *source_metadatapb.MetaData_Github:
+		m.Github.Line = lineNumber
+	case *source_metadatapb.MetaData_Gitlab:
+		m.Gitlab.Line = lineNumber
+	case *source_metadatapb.MetaData_Git:
+		m.Git.Line = lineNumber
+	case *source_metadatapb.MetaData_Huggingface:
+		m.Huggingface.Line = lineNumber
+	}
+	chunk.SourceMetadata = cloned
+}
+
 // isFatal determines whether the given error is a fatal error that should
 // terminate processing the current file, or a non-critical error that can be logged and ignored.
 // "Fatal" errors include context cancellation, deadline exceeded, and the
diff --git a/pkg/handlers/handlers_test.go b/pkg/handlers/handlers_test.go
index 2513a4cb6b86..7d8218fc3dc4 100644
--- a/pkg/handlers/handlers_test.go
+++ b/pkg/handlers/handlers_test.go
@@ -21,6 +21,7 @@ import (
 	diskbufferreader "github.com/trufflesecurity/disk-buffer-reader"
 
 	"github.com/trufflesecurity/trufflehog/v3/pkg/context"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/source_metadatapb"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
 )
@@ -884,3 +885,108 @@ func TestHandleChunksWithError(t *testing.T) {
 		})
 	}
 }
+
+// mockChunkReader creates a ChunkReader that returns the predefined chunks.
+// Each chunk carries data and a contentSize; the contentSize determines how
+// many newlines are counted for line tracking.
+func mockChunkReader(chunks []sources.ChunkResult) sources.ChunkReader {
+	return func(ctx context.Context, reader io.Reader) <-chan sources.ChunkResult {
+		ch := make(chan sources.ChunkResult, len(chunks))
+		for _, c := range chunks {
+			ch <- c
+		}
+		close(ch)
+		return ch
+	}
+}
+
+// TestPopulateChunkLineNumber verifies that populateChunkLineNumber correctly clones
+// metadata and sets line numbers for different metadata types.
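+// It also covers the no-op cases: nil metadata and a zero line number, where
+// the original metadata pointer must be left untouched.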
+func TestPopulateChunkLineNumber(t *testing.T) {
+	tests := []struct {
+		name       string
+		metadata   *source_metadatapb.MetaData
+		lineNumber int64
+		getLine    func(*source_metadatapb.MetaData) int64
+	}{
+		{
+			name: "Filesystem metadata",
+			metadata: &source_metadatapb.MetaData{
+				Data: &source_metadatapb.MetaData_Filesystem{
+					Filesystem: &source_metadatapb.Filesystem{File: "test.txt"},
+				},
+			},
+			lineNumber: 42,
+			getLine: func(m *source_metadatapb.MetaData) int64 {
+				return m.Data.(*source_metadatapb.MetaData_Filesystem).Filesystem.Line
+			},
+		},
+		{
+			name: "Git metadata",
+			metadata: &source_metadatapb.MetaData{
+				Data: &source_metadatapb.MetaData_Git{
+					Git: &source_metadatapb.Git{File: "test.go"},
+				},
+			},
+			lineNumber: 100,
+			getLine: func(m *source_metadatapb.MetaData) int64 {
+				return m.Data.(*source_metadatapb.MetaData_Git).Git.Line
+			},
+		},
+		{
+			name: "Github metadata",
+			metadata: &source_metadatapb.MetaData{
+				Data: &source_metadatapb.MetaData_Github{
+					Github: &source_metadatapb.Github{File: "test.py"},
+				},
+			},
+			lineNumber: 200,
+			getLine: func(m *source_metadatapb.MetaData) int64 {
+				return m.Data.(*source_metadatapb.MetaData_Github).Github.Line
+			},
+		},
+		{
+			name:       "nil metadata",
+			metadata:   nil,
+			lineNumber: 10,
+			getLine:    func(m *source_metadatapb.MetaData) int64 { return 0 },
+		},
+		{
+			name: "zero line number",
+			metadata: &source_metadatapb.MetaData{
+				Data: &source_metadatapb.MetaData_Filesystem{
+					Filesystem: &source_metadatapb.Filesystem{File: "test.txt"},
+				},
+			},
+			lineNumber: 0,
+			getLine: func(m *source_metadatapb.MetaData) int64 {
+				return m.Data.(*source_metadatapb.MetaData_Filesystem).Filesystem.Line
+			},
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			chunk := &sources.Chunk{SourceMetadata: tc.metadata}
+			originalMetadata := tc.metadata
+
+			populateChunkLineNumber(chunk, tc.lineNumber)
+
+			if tc.metadata == nil || tc.lineNumber == 0 {
+				// Metadata should remain unchanged.
+				assert.Equal(t, originalMetadata, chunk.SourceMetadata)
+				return
+			}
+
+			// Verify the line number is set correctly.
+			actualLine := tc.getLine(chunk.SourceMetadata)
+			assert.Equal(t, tc.lineNumber, actualLine,
+				"line number should be set to %d, got %d", tc.lineNumber, actualLine)
+
+			// Verify the original is not modified (metadata was cloned).
+			originalLine := tc.getLine(tc.metadata)
+			assert.Equal(t, int64(0), originalLine,
+				"original metadata should not be modified, but line is %d", originalLine)
+		})
+	}
+}
diff --git a/pkg/sources/chunker.go b/pkg/sources/chunker.go
index 3bce67bb0c94..437d782a8114 100644
--- a/pkg/sources/chunker.go
+++ b/pkg/sources/chunker.go
@@ -48,7 +48,9 @@ func WithFileSize(size int) ConfigOption {
 // it contains the data and error of a chunk.
 type ChunkResult struct {
 	data []byte
-	err  error
+	// contentSize is the size of the actual content, excluding peek data.
+	contentSize int
+	err         error
 }
 
 // Bytes for a ChunkResult.
@@ -56,11 +58,28 @@ func (cr ChunkResult) Bytes() []byte {
 	return cr.data
 }
 
+// ContentSize returns the size of the actual content, excluding peek data.
+// Use this when you need to process only the unique content of each chunk
+// without the overlapping peek portion.
+func (cr ChunkResult) ContentSize() int {
+	return cr.contentSize
+}
+
 // Error for a ChunkResult.
 func (cr ChunkResult) Error() error {
 	return cr.err
 }
 
+// NewChunkResult creates a ChunkResult with the given data and content size.
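+// The contentSize argument should exclude any peek bytes appended to data.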
+func NewChunkResult(data []byte, contentSize int) ChunkResult {
+	return ChunkResult{data: data, contentSize: contentSize}
+}
+
+// NewChunkResultError creates a ChunkResult containing an error.
+func NewChunkResultError(err error) ChunkResult {
+	return ChunkResult{err: err}
+}
+
 const (
 	// Size thresholds.
 	xsmallFileSizeThreshold = 4 * 1024 // 4KB
@@ -148,31 +167,29 @@ func readInChunks(ctx context.Context, reader io.Reader, config *chunkReaderConf
 			} else {
 				panicErr = fmt.Errorf("panic occurred: %v", r)
 			}
-			chunkResultChan <- ChunkResult{
-				err: fmt.Errorf("panic error: %w", panicErr),
-			}
+			chunkResultChan <- NewChunkResultError(fmt.Errorf("panic error: %w", panicErr))
 		}
 	}()
 
 	for {
-		chunkRes := ChunkResult{}
 		chunkBytes := make([]byte, config.totalSize)
 		chunkBytes = chunkBytes[:config.chunkSize]
 		n, err := io.ReadFull(chunkReader, chunkBytes)
+
 		if n > 0 {
 			peekData, _ := chunkReader.Peek(config.totalSize - n)
 			chunkBytes = append(chunkBytes[:n], peekData...)
-			chunkRes.data = chunkBytes
 		}
 
 		// If there is an error other than EOF, or if we have read some bytes, send the chunk.
 		// io.ReadFull will only return io.EOF when n == 0.
+		var chunkRes ChunkResult
 		switch {
 		case isErrAndNotEOF(err):
			ctx.Logger().Error(err, "error reading chunk")
-			chunkRes.err = err
+			chunkRes = NewChunkResultError(err)
 		case n > 0:
-			chunkRes.err = nil
+			chunkRes = NewChunkResult(chunkBytes, n)
 		default:
 			return
 		}
diff --git a/pkg/sources/filesystem/filesystem_test.go b/pkg/sources/filesystem/filesystem_test.go
index b0cf51c10792..23487fab1dc7 100644
--- a/pkg/sources/filesystem/filesystem_test.go
+++ b/pkg/sources/filesystem/filesystem_test.go
@@ -8,9 +8,10 @@ import (
 	"time"
 
 	"github.com/go-logr/logr"
-	"github.com/kylelemons/godebug/pretty"
+	"github.com/google/go-cmp/cmp"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
+	"google.golang.org/protobuf/testing/protocmp"
 	"google.golang.org/protobuf/types/known/anypb"
 
 	"github.com/trufflesecurity/trufflehog/v3/pkg/context"
@@ -48,6 +49,7 @@
 				Data: &source_metadatapb.MetaData_Filesystem{
 					Filesystem: &source_metadatapb.Filesystem{
 						File: "filesystem.go",
+						Line: 1, // First chunk starts at line 1
 					},
 				},
 			},
@@ -84,8 +86,8 @@
 	for chunk := range chunksCh {
 		if chunk.SourceMetadata.GetFilesystem().GetFile() == "filesystem.go" {
 			counter++
-			if diff := pretty.Compare(chunk.SourceMetadata, tt.wantSourceMetadata); diff != "" {
-				t.Errorf("Source.Chunks() %s diff: (-got +want)\n%s", tt.name, diff)
+			if diff := cmp.Diff(tt.wantSourceMetadata, chunk.SourceMetadata, protocmp.Transform()); diff != "" {
+				t.Errorf("Source.Chunks() %s metadata mismatch (-want +got):\n%s", tt.name, diff)
 			}
 		}
 	}
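
For reference, here is a minimal, standalone sketch of the line-tracking arithmetic the patch introduces. It is not part of the diff: the chunk struct is a hypothetical stand-in for sources.ChunkResult, and the byte values are illustrative.

package main

import (
	"bytes"
	"fmt"
)

// chunk mirrors the two ChunkResult fields the handler relies on: the raw
// bytes (content plus trailing peek) and the size of the content alone.
type chunk struct {
	data        []byte
	contentSize int
}

func main() {
	// Chunk one carries three content lines ("a", "b", "c") plus a peeked
	// "d\n" that reappears at the start of chunk two.
	chunks := []chunk{
		{data: []byte("a\nb\nc\nd\n"), contentSize: 6},
		{data: []byte("d\ne\n"), contentSize: 4},
	}

	currentLine := int64(1)
	for i, c := range chunks {
		fmt.Printf("chunk %d starts at line %d\n", i+1, currentLine)
		// Count newlines only in the content portion, as handleNonArchiveContent
		// does, so the peeked bytes are not double-counted.
		currentLine += int64(bytes.Count(c.data[:c.contentSize], []byte{'\n'}))
	}
	// Output:
	// chunk 1 starts at line 1
	// chunk 2 starts at line 4
}

Counting against contentSize rather than len(data) is what keeps the peek overlap between consecutive chunks from inflating the running line count.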