Skip to content

Commit 144e7eb

Browse files
feat: add max file diff size and allow base-commit and depth flags (#336)
<!-- Thanks for contributing to 2ms by offering a pull request. --> **Proposed Changes** This PR introduces hard limits and enhanced error handling to the Git plugin to prevent memory exhaustion and improve performance when scanning large repositories with massive file diffs. **Key changes:** - File diff size limits - Added 50MB hard limit for individual file diffs; - New error type `ErrFileDiffSizeExceeded` for proper error handling. - Git scan options - Support for simultaneous use of `base-commit` and `depth` parameters. **Checklist** - [x] I covered my changes with tests. - [ ] I Updated the documentation that is affected by my changes: - [ ] Change in the CLI arguments - [ ] Change in the configuration file I submit this contribution under the Apache-2.0 license.
1 parent a7cccca commit 144e7eb

File tree

2 files changed

+157
-53
lines changed

2 files changed

+157
-53
lines changed

plugins/git.go

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package plugins
22

33
import (
4+
"errors"
45
"fmt"
56
"os"
67
"strings"
@@ -20,9 +21,12 @@ const (
2021
AddedContent DiffType = iota
2122
RemovedContent
2223

23-
maxChunkSize = 256 * 1024 // 256KB max chunk size for git diff content (optimized for regexp)
24+
maxChunkSize = 256 * 1024 // 256KB max chunk size for git diff content (optimized for regexp)
25+
maxFileDiffSize = 50 * 1024 * 1024 // 50MB max file diff size limit
2426
)
2527

28+
var ErrFileDiffSizeExceeded = errors.New("file diff size exceeded")
29+
2630
const (
2731
argDepth = "depth"
2832
argScanAllBranches = "all-branches"
@@ -299,10 +303,10 @@ func (p *GitPlugin) buildScanOptions() string {
299303
options = append(options, "--all")
300304
}
301305

302-
// If base commit is specified, use commit range instead of depth
303306
if p.baseCommit != "" {
304307
options = append(options, fmt.Sprintf("%s..HEAD", p.baseCommit))
305-
} else if p.depth > 0 {
308+
}
309+
if p.depth > 0 {
306310
options = append(options, fmt.Sprintf("-n %d", p.depth))
307311
}
308312

@@ -348,7 +352,18 @@ func (p *GitPlugin) processFileDiff(file *gitdiff.File, itemsChan chan ISourceIt
348352
return
349353
}
350354

351-
chunks := extractChanges(p.gitChangesPool, file.TextFragments)
355+
chunks, err := extractChanges(p.gitChangesPool, file.TextFragments)
356+
defer p.gitChangesPool.putSlice(chunks)
357+
358+
// Check if file diff size exceeded the limit during extraction
359+
if err != nil {
360+
log.Warn().
361+
Str("file", fileName).
362+
Str("commit", file.PatchHeader.SHA).
363+
Err(err).
364+
Msg("Skipping file diff")
365+
return
366+
}
352367

353368
for _, chunk := range chunks {
354369
id := fmt.Sprintf("%s-%s-%s-%s", p.GetName(), p.projectName, file.PatchHeader.SHA, fileName)
@@ -378,32 +393,38 @@ func (p *GitPlugin) processFileDiff(file *gitdiff.File, itemsChan chan ISourceIt
378393
}
379394
}
380395
}
381-
382-
p.gitChangesPool.putSlice(chunks)
383396
}
384397

385398
// extractChanges performs memory-bounded chunked processing of git diff fragments
386-
func extractChanges(changesPool *gitChangesPool, fragments []*gitdiff.TextFragment) []gitdiffChunk {
399+
func extractChanges(changesPool *gitChangesPool, fragments []*gitdiff.TextFragment) ([]gitdiffChunk, error) {
387400
chunks := changesPool.getSlice()
388401
currentAdded := addedPool.Get()
389402
defer addedPool.Put(currentAdded)
390403
currentRemoved := removedPool.Get()
391404
defer removedPool.Put(currentRemoved)
392405

393406
currentSize := 0
407+
totalSize := 0
394408

395409
for _, tf := range fragments {
396410
if tf == nil {
397411
continue
398412
}
399-
400413
for i := range tf.Lines {
401414
line := tf.Lines[i].Line
402415
lineSize := len(line)
403416

417+
totalSize += lineSize
418+
if totalSize > maxFileDiffSize {
419+
tf.Lines[i].Line = ""
420+
changesPool.putSlice(chunks)
421+
return nil, fmt.Errorf("%w: file diff size %d bytes exceeds max limit %d bytes",
422+
ErrFileDiffSizeExceeded, totalSize, maxFileDiffSize)
423+
}
424+
404425
// Skip excessively large lines (potential issue)
405426
if lineSize > 1024*1024 { // 1MB line limit
406-
tf.Lines[i].Line = "" // Clear line to free memory
427+
tf.Lines[i].Line = ""
407428
continue
408429
}
409430

@@ -443,7 +464,7 @@ func extractChanges(changesPool *gitChangesPool, fragments []*gitdiff.TextFragme
443464
})
444465
}
445466

446-
return chunks
467+
return chunks, nil
447468
}
448469

449470
func (p *GitPlugin) readGitLog(path, scanOptions string, errChan chan error) (<-chan *gitdiff.File, func()) {

plugins/git_test.go

Lines changed: 126 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -71,18 +71,18 @@ func TestBuildScanOptions(t *testing.T) {
7171
expectedOptions: "--full-history --all def456..HEAD",
7272
},
7373
{
74-
name: "Base commit takes precedence over depth",
74+
name: "Base commit with depth: both flags are used",
7575
scanAllBranches: false,
7676
depth: 10,
7777
baseCommit: "ghi789",
78-
expectedOptions: "--full-history ghi789..HEAD",
78+
expectedOptions: "--full-history ghi789..HEAD -n 10",
7979
},
8080
{
81-
name: "Base commit with all branches takes precedence over depth",
81+
name: "Base commit with all branches and depth: all flags are used",
8282
scanAllBranches: true,
8383
depth: 15,
8484
baseCommit: "jkl012",
85-
expectedOptions: "--full-history --all jkl012..HEAD",
85+
expectedOptions: "--full-history --all jkl012..HEAD -n 15",
8686
},
8787
}
8888

@@ -491,6 +491,7 @@ func TestExtractChanges(t *testing.T) {
491491
name string
492492
fragments []*gitdiff.TextFragment
493493
expectedChunks int
494+
expectedError error
494495
description string
495496
}{
496497
{
@@ -523,69 +524,151 @@ func TestExtractChanges(t *testing.T) {
523524
expectedChunks: 3,
524525
description: "Should handle mixed add/delete operations in single chunk",
525526
},
527+
{
528+
name: "File diff size exceeded",
529+
fragments: createLargeTestFragments(51 * 1024 * 1024), // 51MB
530+
expectedChunks: 0,
531+
description: "Should return error when file diff size exceeds limit",
532+
expectedError: ErrFileDiffSizeExceeded,
533+
},
526534
}
527535

528536
for _, tt := range tests {
529537
t.Run(tt.name, func(t *testing.T) {
530538
changesPool := newGitChangesPool(0)
531-
chunks := extractChanges(changesPool, tt.fragments)
539+
chunks, err := extractChanges(changesPool, tt.fragments)
532540

541+
assert.ErrorIs(t, err, tt.expectedError)
533542
assert.Equal(t, tt.expectedChunks, len(chunks), tt.description)
534543

535-
// Verify memory is freed for non-nil fragments
536-
for _, fragment := range tt.fragments {
537-
if fragment != nil {
538-
for _, line := range fragment.Lines {
539-
assert.Empty(t, line.Line, "Line content should be cleared to free memory")
544+
if tt.expectedError == nil {
545+
// Verify memory is freed for non-nil fragments
546+
for _, fragment := range tt.fragments {
547+
if fragment != nil {
548+
for _, line := range fragment.Lines {
549+
assert.Empty(t, line.Line, "Line content should be cleared to free memory")
550+
}
540551
}
541552
}
542-
}
543553

544-
// Verify chunks contain expected content types
545-
for _, chunk := range chunks {
546-
if tt.expectedChunks > 0 {
547-
// At least one chunk should have some content
548-
hasContent := chunk.Added != "" || chunk.Removed != ""
549-
assert.True(t, hasContent, "Chunk should contain added or removed content")
554+
// Verify chunks contain expected content types
555+
for _, chunk := range chunks {
556+
if tt.expectedChunks > 0 {
557+
// At least one chunk should have some content
558+
hasContent := chunk.Added != "" || chunk.Removed != ""
559+
assert.True(t, hasContent, "Chunk should contain added or removed content")
560+
}
550561
}
551562
}
552563
})
553564
}
554565
}
555566

556567
func TestProcessFileDiff(t *testing.T) {
557-
plugin := &GitPlugin{
558-
Plugin: Plugin{},
559-
projectName: "test-project",
560-
gitChangesPool: newGitChangesPool(0),
568+
tests := []struct {
569+
name string
570+
fragmentSize int
571+
isBinary bool
572+
isDelete bool
573+
expectedItems int
574+
}{
575+
{
576+
name: "normal file with small diff",
577+
fragmentSize: 2000, // 2KB
578+
expectedItems: 2,
579+
},
580+
{
581+
name: "normal file with large diff under limit",
582+
fragmentSize: 2 * 1024 * 1024, // 2MiB
583+
expectedItems: 16, // (2000 KiB /256 KiB = 8 chunks) x 2 (added and removed)
584+
},
585+
{
586+
name: "file exceeding size limit",
587+
fragmentSize: 60 * 1024 * 1024, // 60MiB - exceeds 50MiB limit
588+
expectedItems: 0,
589+
},
590+
{
591+
name: "binary file",
592+
fragmentSize: 1024,
593+
isBinary: true,
594+
expectedItems: 0,
595+
},
596+
{
597+
name: "deleted file",
598+
fragmentSize: 1024,
599+
isDelete: true,
600+
expectedItems: 1,
601+
},
602+
{
603+
name: "empty fragments",
604+
fragmentSize: 0,
605+
expectedItems: 0,
606+
},
561607
}
562608

563-
// Create test file with large diff
609+
for _, tt := range tests {
610+
t.Run(tt.name, func(t *testing.T) {
611+
plugin := &GitPlugin{
612+
Plugin: Plugin{},
613+
projectName: "test-project",
614+
gitChangesPool: newGitChangesPool(0),
615+
}
616+
617+
// Create test file
618+
file := createTestFile("test-file", "abc123", tt.name, tt.fragmentSize, tt.isBinary, tt.isDelete)
619+
620+
items := make(chan ISourceItem, 100)
621+
622+
plugin.processFileDiff(file, items)
623+
close(items)
624+
625+
// Collect all items
626+
var itemCount int
627+
var collectedItems []ISourceItem
628+
for item := range items {
629+
itemCount++
630+
collectedItems = append(collectedItems, item)
631+
assert.NotNil(t, item.GetContent(), "Item content should not be nil")
632+
assert.NotEmpty(t, item.GetID(), "Item ID should not be empty")
633+
}
634+
635+
assert.Equal(t, tt.expectedItems, itemCount)
636+
if itemCount > 0 {
637+
for _, item := range collectedItems {
638+
assert.Contains(t, item.GetID(), "abc123", "Item ID should contain commit SHA")
639+
assert.Contains(t, item.GetID(), "test-file", "Item ID should contain file name")
640+
641+
// Validate GitInfo
642+
if gitInfo := item.GetGitInfo(); gitInfo != nil {
643+
assert.NotNil(t, gitInfo.Hunks, "GitInfo should have hunks")
644+
assert.True(t, gitInfo.ContentType == AddedContent || gitInfo.ContentType == RemovedContent,
645+
"GitInfo should have valid content type")
646+
}
647+
}
648+
}
649+
})
650+
}
651+
}
652+
653+
// createTestFile creates a gitdiff.File for testing with specified parameters
654+
func createTestFile(fileName, commitSHA, commitTitle string, fragmentSize int, isBinary, isDelete bool) *gitdiff.File {
564655
file := &gitdiff.File{
565-
NewName: "test-file.go",
566-
OldName: "test-file.go",
656+
NewName: fileName,
657+
OldName: fileName,
567658
PatchHeader: &gitdiff.PatchHeader{
568-
SHA: "abc123",
569-
Title: "Test commit",
659+
SHA: commitSHA,
660+
Title: commitTitle,
570661
},
571-
TextFragments: createLargeTestFragments(2 * 1024 * 1024), // 2MB
662+
IsBinary: isBinary,
663+
IsDelete: isDelete,
572664
}
573665

574-
items := make(chan ISourceItem, 100)
575-
576-
plugin.processFileDiff(file, items)
577-
close(items)
578-
579-
// Collect all items
580-
var itemCount int
581-
for item := range items {
582-
itemCount++
583-
assert.NotNil(t, item.GetContent(), "Item content should not be nil")
584-
assert.NotEmpty(t, item.GetID(), "Item ID should not be empty")
666+
// Only create fragments for non-binary files with content
667+
if !isBinary && fragmentSize > 0 {
668+
file.TextFragments = createLargeTestFragments(fragmentSize)
585669
}
586670

587-
// Should have created multiple chunks and therefore multiple items
588-
assert.Greater(t, itemCount, 1, "Should create multiple items for large file")
671+
return file
589672
}
590673

591674
func TestStringBuilderPool(t *testing.T) {
@@ -637,11 +720,11 @@ func TestStringBuilderPool(t *testing.T) {
637720
}
638721

639722
func createLargeTestFragments(totalSize int) []*gitdiff.TextFragment {
640-
lineSize := 1024 // 1KB per line
641-
numLines := totalSize / lineSize
723+
lineSize := 1024 // 1KiB per line
724+
numLines := (totalSize + lineSize - 1) / lineSize // Ceiling division
642725
lines := make([]gitdiff.Line, numLines)
643726

644-
for i := 0; i < numLines; i++ {
727+
for i := range numLines {
645728
op := gitdiff.OpAdd
646729
if i%2 == 0 {
647730
op = gitdiff.OpDelete

0 commit comments

Comments
 (0)