-
Notifications
You must be signed in to change notification settings - Fork 28
perf: improve memory consumption in 2ms file walk #287
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
cx-rui-oliveira
merged 23 commits into
master
from
AST-79069-improve-memory-consumption-in-2-ms-file-walk
May 22, 2025
Merged
Changes from all commits
Commits
Show all changes
23 commits
Select commit
Hold shift + click to select a range
dd37da0
feat: add chunking
cx-rui-oliveira 32460c6
feat: add concurrency limit
cx-rui-oliveira 3891f22
refactor: use chunking with threshold and weighted semaphore
cx-rui-oliveira 1b9fb40
refactor: remove unnecessary flag
cx-rui-oliveira 66b5f18
chore: merge and resolve conflicts
cx-rui-oliveira 5eaec37
chore: run go mod tidy
cx-rui-oliveira c1d499d
refactor: clean up code
cx-rui-oliveira 0be4b5f
test: add UTs for chunking and peeking logic
cx-rui-oliveira ced6eef
test: update getItems UT for filesystem
cx-rui-oliveira 93f6752
chore: merge and resolve conflicts
cx-rui-oliveira 6dcde46
chore: fix lint issues
cx-rui-oliveira 7249d81
chore: ignore test secrets
cx-rui-oliveira 10d45b4
refactor: created dedicated packages for semaphore and chunk
cx-rui-oliveira e3a6811
test: add mocks for new interfaces (engine, chunk and semaphore)
cx-rui-oliveira 968e4e3
test: create/update UTs for engine, chunk and semaphore
cx-rui-oliveira 60ebfba
refactor: add functional options pattern
cx-rui-oliveira 1dd1fae
refactor: simplify reading chunk logic
cx-rui-oliveira 6dced71
test: update UTs
cx-rui-oliveira b3d896e
fix: use correct error
cx-rui-oliveira d069b61
update checkmarx scan workflow
cx-leonardo-fontes 45c80de
refactor: address PR suggestions
cx-rui-oliveira b18a224
chore: merge and resolve conflicts
cx-rui-oliveira 4404615
Merge branch 'Update-Checkmarx-Scan-Workflow' into AST-79069-improve-…
cx-rui-oliveira File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,206 @@ | ||
| package chunk | ||
|
|
||
| //go:generate mockgen -source=$GOFILE -destination=${GOPACKAGE}_mock.go -package=${GOPACKAGE} | ||
|
|
||
| import ( | ||
| "bufio" | ||
| "bytes" | ||
| "errors" | ||
| "fmt" | ||
| "sync" | ||
| "unicode" | ||
|
|
||
| "github.com/h2non/filetype" | ||
| ) | ||
|
|
||
const (
	defaultSize          = 100 * 1024      // 100 KiB: base chunk size
	defaultMaxPeekSize   = 25 * 1024       // 25 KiB: max look-ahead bytes past the chunk boundary
	defaultFileThreshold = 1 * 1024 * 1024 // 1 MiB: files smaller than this skip chunking
)

// ErrUnsupportedFileType signals that a file's sniffed content type is not
// scannable (see ShouldSkipFile) and the file should be skipped.
var ErrUnsupportedFileType = errors.New("unsupported file type")
|
|
||
| type Option func(*Chunk) | ||
|
|
||
| // WithSize sets the chunk size | ||
| func WithSize(size int) Option { | ||
| return func(args *Chunk) { | ||
| args.size = size | ||
| } | ||
| } | ||
|
|
||
| // WithMaxPeekSize sets the max size of look-ahead bytes | ||
| func WithMaxPeekSize(maxPeekSize int) Option { | ||
| return func(args *Chunk) { | ||
| args.maxPeekSize = maxPeekSize | ||
| } | ||
| } | ||
|
|
||
| // WithSmallFileThreshold sets the threshold for small files | ||
| func WithSmallFileThreshold(smallFileThreshold int64) Option { | ||
| return func(args *Chunk) { | ||
| args.smallFileThreshold = smallFileThreshold | ||
| } | ||
| } | ||
|
|
||
// Chunk holds two pools and sizing parameters needed for reading chunks of data with look-ahead.
type Chunk struct {
	bufPool            *sync.Pool // holds *bytes.Buffer values with capacity size+maxPeekSize
	peekedBufPool      *sync.Pool // holds *[]byte scratch slices of length size+maxPeekSize
	size               int        // base chunk size in bytes
	maxPeekSize        int        // max number of look-ahead bytes appended past size
	smallFileThreshold int64      // files smaller than this skip chunking
}
|
|
||
// IChunk is the abstraction over Chunk consumed by callers; mocks are
// generated from it via the go:generate mockgen directive above.
type IChunk interface {
	GetSize() int
	GetMaxPeekSize() int
	GetFileThreshold() int64
	ReadChunk(reader *bufio.Reader, totalLines int) (string, error)
}
|
|
||
| func New(opts ...Option) *Chunk { | ||
| // set default options | ||
| c := &Chunk{ | ||
| size: defaultSize, | ||
| maxPeekSize: defaultMaxPeekSize, | ||
| smallFileThreshold: defaultFileThreshold, | ||
| } | ||
| // apply overrides | ||
| for _, opt := range opts { | ||
| opt(c) | ||
| } | ||
| c.bufPool = &sync.Pool{ | ||
| New: func() interface{} { | ||
| // pre-allocate dynamic-size buffer for reading chunks (up to chunk size + peek size) | ||
| return bytes.NewBuffer(make([]byte, 0, c.size+c.maxPeekSize)) | ||
| }, | ||
| } | ||
| c.peekedBufPool = &sync.Pool{ | ||
| New: func() interface{} { | ||
| // pre-allocate fixed-size block for loading chunks | ||
| b := make([]byte, c.size+c.maxPeekSize) | ||
| return &b | ||
| }, | ||
| } | ||
| return c | ||
| } | ||
|
|
||
| // GetBuf returns a bytes.Buffer from the pool, seeded with the data | ||
| func (c *Chunk) GetBuf(data []byte) (*bytes.Buffer, bool) { | ||
| window, ok := c.bufPool.Get().(*bytes.Buffer) | ||
| if !ok { | ||
| return nil, false | ||
| } | ||
| window.Write(data) // seed the buffer with the data | ||
| return window, ok | ||
| } | ||
|
|
||
| // PutBuf returns the bytes.Buffer to the pool | ||
| func (c *Chunk) PutBuf(window *bytes.Buffer) { | ||
| window.Reset() | ||
| c.bufPool.Put(window) | ||
| } | ||
|
|
||
| // GetPeekedBuf returns a fixed-size []byte from the pool | ||
| func (c *Chunk) GetPeekedBuf() (*[]byte, bool) { | ||
| b, ok := c.peekedBufPool.Get().(*[]byte) | ||
| return b, ok | ||
| } | ||
|
|
||
| // PutPeekedBuf returns the fixed-size []byte to the pool | ||
| func (c *Chunk) PutPeekedBuf(b *[]byte) { | ||
| *b = (*b)[:0] // reset the slice to zero length | ||
| c.peekedBufPool.Put(b) | ||
| } | ||
|
|
||
// GetSize returns the base chunk size in bytes.
func (c *Chunk) GetSize() int {
	return c.size
}
|
|
||
// GetMaxPeekSize returns the maximum number of look-ahead bytes.
func (c *Chunk) GetMaxPeekSize() int {
	return c.maxPeekSize
}
|
|
||
// GetFileThreshold returns the size below which files skip chunking.
func (c *Chunk) GetFileThreshold() int64 {
	return c.smallFileThreshold
}
|
|
||
| // ReadChunk reads the next chunk of data from file | ||
| func (c *Chunk) ReadChunk(reader *bufio.Reader, totalLines int) (string, error) { | ||
| // borrow a []bytes from the pool and seed it with raw data from file (up to chunk size + peek size) | ||
| rawData, ok := c.GetPeekedBuf() | ||
| if !ok { | ||
| return "", fmt.Errorf("expected *bytes.Buffer, got %T", rawData) | ||
| } | ||
| defer c.PutPeekedBuf(rawData) | ||
| n, err := reader.Read(*rawData) | ||
|
|
||
| var chunkStr string | ||
| // "Callers should always process the n > 0 bytes returned before considering the error err." | ||
| // https://pkg.go.dev/io#Reader | ||
| if n > 0 { | ||
| // only check the filetype at the start of file | ||
| if totalLines == 0 && ShouldSkipFile((*rawData)[:n]) { | ||
| return "", fmt.Errorf("skipping file: %w", ErrUnsupportedFileType) | ||
| } | ||
|
|
||
| chunkStr, err = c.generateChunk((*rawData)[:n]) | ||
| } | ||
| if err != nil { | ||
| return "", err | ||
| } | ||
| return chunkStr, nil | ||
| } | ||
|
|
||
| // generateChunk processes block of raw data and generates chunk to be scanned | ||
| func (c *Chunk) generateChunk(rawData []byte) (string, error) { | ||
| // Borrow a buffer from the pool and seed it with raw data (up to chunk size) | ||
| initialChunkLen := min(len(rawData), c.size) | ||
| chunkData, ok := c.GetBuf(rawData[:initialChunkLen]) | ||
| if !ok { | ||
| return "", fmt.Errorf("expected *bytes.Buffer, got %T", chunkData) | ||
| } | ||
| defer c.PutBuf(chunkData) | ||
|
|
||
| // keep seeding chunk until detecting the “\n...\n” (i.e. safe boundary) | ||
| // or reaching the max limit of chunk size (i.e. chunk size + peek size) | ||
| for i := chunkData.Len(); i < len(rawData); i++ { | ||
| if endsWithTwoNewlines(rawData[:i]) { | ||
| break | ||
| } | ||
| chunkData.WriteByte(rawData[i]) | ||
| } | ||
|
|
||
| return chunkData.String(), nil | ||
| } | ||
|
|
||
// endsWithTwoNewlines reports whether b ends in at least two '\n' bytes,
// ignoring any run of other whitespace (' ', '\r', '\t') between or after
// them. Any trailing non-whitespace byte means the answer is false.
func endsWithTwoNewlines(b []byte) bool {
	newlines := 0
	for i := len(b) - 1; i >= 0; i-- {
		switch ch := b[i]; {
		case ch == '\n':
			newlines++
			if newlines == 2 {
				return true
			}
		case unicode.IsSpace(rune(ch)):
			// other whitespace between the newlines does not reset the count
		default:
			return false
		}
	}
	return false
}
|
|
||
| // ShouldSkipFile checks if the file should be skipped based on its content type | ||
| func ShouldSkipFile(data []byte) bool { | ||
| // TODO: could other optimizations be introduced here? | ||
| mimetype, err := filetype.Match(data) | ||
| if err != nil { | ||
| return true // could not determine file type | ||
| } | ||
| return mimetype.MIME.Type == "application" // skip binary files | ||
| } | ||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.