Skip to content

Commit 40b26c3

Browse files
committed
*: don't read blob content if it's not used
Blob contents should not be read unless the blobs.content column is specifically used in the query (either in direct use or in a projection) to save memory. Operations such as aggregations may use a huge amount of memory without this optimisation. Thanks to the column pushdown we are able to know if the column is in use, so if it is, just flag the iterators to let them know they should read the content. Signed-off-by: Miguel Molina <[email protected]>
1 parent 066354b commit 40b26c3

File tree

6 files changed

+158
-49
lines changed

6 files changed

+158
-49
lines changed

blobs.go

Lines changed: 37 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"io/ioutil"
77

88
"gopkg.in/src-d/go-mysql-server.v0/sql"
9+
"gopkg.in/src-d/go-mysql-server.v0/sql/expression"
910

1011
"gopkg.in/src-d/go-git.v4/plumbing"
1112
"gopkg.in/src-d/go-git.v4/plumbing/object"
@@ -70,7 +71,7 @@ func (r *blobsTable) TransformExpressionsUp(f sql.TransformExprFunc) (sql.Node,
7071

7172
func (r blobsTable) RowIter(ctx *sql.Context) (sql.RowIter, error) {
7273
span, ctx := ctx.Span("gitbase.BlobsTable")
73-
iter := new(blobIter)
74+
iter := &blobIter{readContent: true}
7475

7576
repoIter, err := NewRowRepoIter(ctx, iter)
7677
if err != nil {
@@ -91,23 +92,26 @@ func (blobsTable) HandledFilters(filters []sql.Expression) []sql.Expression {
9192

9293
func (r *blobsTable) WithProjectAndFilters(
9394
ctx *sql.Context,
94-
_, filters []sql.Expression,
95+
columns, filters []sql.Expression,
9596
) (sql.RowIter, error) {
9697
span, ctx := ctx.Span("gitbase.BlobsTable")
9798
iter, err := rowIterWithSelectors(
9899
ctx, BlobsSchema, BlobsTableName, filters,
99100
[]string{"hash"},
100101
func(selectors selectors) (RowRepoIter, error) {
101102
if len(selectors["hash"]) == 0 {
102-
return new(blobIter), nil
103+
return &blobIter{readContent: shouldReadContent(columns)}, nil
103104
}
104105

105106
hashes, err := selectors.textValues("hash")
106107
if err != nil {
107108
return nil, err
108109
}
109110

110-
return &blobsByHashIter{hashes: hashes}, nil
111+
return &blobsByHashIter{
112+
hashes: hashes,
113+
readContent: shouldReadContent(columns),
114+
}, nil
111115
},
112116
)
113117

@@ -120,7 +124,8 @@ func (r *blobsTable) WithProjectAndFilters(
120124
}
121125

122126
type blobIter struct {
123-
iter *object.BlobIter
127+
iter *object.BlobIter
128+
readContent bool
124129
}
125130

126131
func (i *blobIter) NewIterator(repo *Repository) (RowRepoIter, error) {
@@ -129,7 +134,7 @@ func (i *blobIter) NewIterator(repo *Repository) (RowRepoIter, error) {
129134
return nil, err
130135
}
131136

132-
return &blobIter{iter: iter}, nil
137+
return &blobIter{iter: iter, readContent: i.readContent}, nil
133138
}
134139

135140
func (i *blobIter) Next() (sql.Row, error) {
@@ -138,7 +143,7 @@ func (i *blobIter) Next() (sql.Row, error) {
138143
return nil, err
139144
}
140145

141-
return blobToRow(o)
146+
return blobToRow(o, i.readContent)
142147
}
143148

144149
func (i *blobIter) Close() error {
@@ -150,13 +155,14 @@ func (i *blobIter) Close() error {
150155
}
151156

152157
type blobsByHashIter struct {
153-
repo *Repository
154-
pos int
155-
hashes []string
158+
repo *Repository
159+
pos int
160+
hashes []string
161+
readContent bool
156162
}
157163

158164
func (i *blobsByHashIter) NewIterator(repo *Repository) (RowRepoIter, error) {
159-
return &blobsByHashIter{repo, 0, i.hashes}, nil
165+
return &blobsByHashIter{repo, 0, i.hashes, i.readContent}, nil
160166
}
161167

162168
func (i *blobsByHashIter) Next() (sql.Row, error) {
@@ -176,26 +182,26 @@ func (i *blobsByHashIter) Next() (sql.Row, error) {
176182
return nil, err
177183
}
178184

179-
return blobToRow(blob)
185+
return blobToRow(blob, i.readContent)
180186
}
181187
}
182188

183189
func (i *blobsByHashIter) Close() error {
184190
return nil
185191
}
186192

187-
func blobToRow(c *object.Blob) (sql.Row, error) {
193+
func blobToRow(c *object.Blob, readContent bool) (sql.Row, error) {
188194
var content []byte
189195
var isAllowed = blobsAllowBinary
190-
if !isAllowed {
196+
if !isAllowed && readContent {
191197
ok, err := isBinary(c)
192198
if err != nil {
193199
return nil, err
194200
}
195201
isAllowed = !ok
196202
}
197203

198-
if c.Size <= int64(blobsMaxSize) && isAllowed {
204+
if c.Size <= int64(blobsMaxSize) && isAllowed && readContent {
199205
r, err := c.Reader()
200206
if err != nil {
201207
return nil, err
@@ -248,3 +254,19 @@ func isBinary(blob *object.Blob) (bool, error) {
248254
}
249255
}
250256
}
257+
258+
func shouldReadContent(columns []sql.Expression) bool {
259+
for _, e := range columns {
260+
var found bool
261+
expression.Inspect(e, func(e sql.Expression) bool {
262+
gf, ok := e.(*expression.GetField)
263+
found = ok && gf.Table() == BlobsTableName && gf.Name() == "content"
264+
return !found
265+
})
266+
267+
if found {
268+
return true
269+
}
270+
}
271+
return false
272+
}

integration_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -229,10 +229,10 @@ func TestSquashCorrectness(t *testing.T) {
229229
`SELECT * FROM tree_entries te INNER JOIN blobs b ON te.entry_hash = b.hash`,
230230

231231
`SELECT * FROM repositories r
232-
INNER JOIN refs re
232+
INNER JOIN refs re
233233
ON r.id = re.repository_id
234-
INNER JOIN commits c
235-
ON re.hash = c.hash
234+
INNER JOIN commits c
235+
ON re.hash = c.hash
236236
WHERE re.name = 'HEAD'`,
237237

238238
`SELECT * FROM commits c

internal/rule/squashjoins.go

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,14 @@ func buildSquashedTable(
312312
return nil, errInvalidIteratorChain.New("tree_entries", iter)
313313
}
314314
case gitbase.BlobsTableName:
315+
var readContent bool
316+
for _, e := range columns {
317+
if containsField(e, gitbase.BlobsTableName, "content") {
318+
readContent = true
319+
break
320+
}
321+
}
322+
315323
switch it := iter.(type) {
316324
case gitbase.RefsIter:
317325
var f sql.Expression
@@ -328,6 +336,7 @@ func buildSquashedTable(
328336
iter = gitbase.NewCommitBlobsIter(
329337
gitbase.NewRefHEADCommitsIter(it, nil, true),
330338
f,
339+
readContent,
331340
)
332341
case gitbase.CommitsIter:
333342
var f sql.Expression
@@ -341,13 +350,10 @@ func buildSquashedTable(
341350
return nil, err
342351
}
343352

344-
iter = gitbase.NewTreeEntryBlobsIter(
345-
gitbase.NewCommitMainTreeEntriesIter(
346-
it,
347-
nil,
348-
true,
349-
),
353+
iter = gitbase.NewCommitBlobsIter(
354+
it,
350355
f,
356+
readContent,
351357
)
352358
case gitbase.TreeEntriesIter:
353359
var f sql.Expression
@@ -361,7 +367,7 @@ func buildSquashedTable(
361367
return nil, err
362368
}
363369

364-
iter = gitbase.NewTreeEntryBlobsIter(it, f)
370+
iter = gitbase.NewTreeEntryBlobsIter(it, f, readContent)
365371
default:
366372
return nil, errInvalidIteratorChain.New("blobs", iter)
367373
}
@@ -936,6 +942,19 @@ func isNum(n int64) validator {
936942
}
937943
}
938944

945+
func containsField(e sql.Expression, table, name string) bool {
946+
var found bool
947+
expression.Inspect(e, func(e sql.Expression) bool {
948+
gf, ok := e.(*expression.GetField)
949+
if ok && gf.Table() == table && gf.Name() == name {
950+
found = true
951+
return false
952+
}
953+
return true
954+
})
955+
return found
956+
}
957+
939958
func fixFieldIndexes(e sql.Expression, schema sql.Schema) (sql.Expression, error) {
940959
if e == nil {
941960
return nil, nil

0 commit comments

Comments
 (0)