Skip to content

Commit c03dad8

Browse files
authored
perf(arrow): Reduce the amount of allocated objects (#645)
### Rationale for this change

`apache/arrow-go` is used in the new query engine in Grafana Loki, and we are actively working on improving its performance. Here are a few low-hanging fruits that reduce allocations by many gigabytes when the library is used on the hot path.

### What changes are included in this PR?

This PR reduces the number of allocated objects.

* `arrow/datatype_binary.go`: straightforward; the `Layout` objects are currently allocated on every call.
* `arrow/schema.go`: `Schema.Fields()` no longer returns a clone, and `NewSchemaWithEndian()` no longer clones its inputs. All call sites remain compatible at the function-signature level, but callers should no longer modify those arguments or results. In our opinion this is a reasonable trade-off for higher performance. I am happy to discuss other ways to implement this.

### Are these changes tested?

* Tested with the current `apache/arrow-go` unit test suite.
* Tested with the `grafana/loki` test suite for the query engine.

### Are there any user-facing changes?

Yes, as explained above.
1 parent 77835f9 commit c03dad8

File tree

2 files changed

+103
-77
lines changed

2 files changed

+103
-77
lines changed

arrow/datatype_binary.go

Lines changed: 66 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -27,95 +27,97 @@ type OffsetTraits interface {
2727
BytesRequired(int) int
2828
}
2929

30+
var (
31+
binaryTypeLayout = DataTypeLayout{
32+
Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(Int32SizeBytes), SpecVariableWidth()},
33+
}
34+
stringTypeLayout = DataTypeLayout{
35+
Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(Int32SizeBytes), SpecVariableWidth()},
36+
}
37+
largeBinaryTypeLayout = DataTypeLayout{
38+
Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(Int64SizeBytes), SpecVariableWidth()},
39+
}
40+
largeStringTypeLayout = DataTypeLayout{
41+
Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(Int64SizeBytes), SpecVariableWidth()},
42+
}
43+
44+
variadic = SpecVariableWidth()
45+
46+
binaryViewTypeLayout = DataTypeLayout{
47+
Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(ViewHeaderSizeBytes)},
48+
VariadicSpec: &variadic,
49+
}
50+
stringViewTypeLayout = DataTypeLayout{
51+
Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(ViewHeaderSizeBytes)},
52+
VariadicSpec: &variadic,
53+
}
54+
)
55+
3056
type BinaryType struct{}
3157

32-
func (t *BinaryType) ID() Type { return BINARY }
33-
func (t *BinaryType) Name() string { return "binary" }
34-
func (t *BinaryType) String() string { return "binary" }
35-
func (t *BinaryType) binary() {}
36-
func (t *BinaryType) Fingerprint() string { return typeFingerprint(t) }
37-
func (t *BinaryType) Layout() DataTypeLayout {
38-
return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(),
39-
SpecFixedWidth(Int32SizeBytes), SpecVariableWidth()}}
40-
}
58+
func (t *BinaryType) ID() Type { return BINARY }
59+
func (t *BinaryType) Name() string { return "binary" }
60+
func (t *BinaryType) String() string { return "binary" }
61+
func (t *BinaryType) binary() {}
62+
func (t *BinaryType) Fingerprint() string { return typeFingerprint(t) }
63+
func (t *BinaryType) Layout() DataTypeLayout { return binaryTypeLayout }
4164
func (t *BinaryType) OffsetTypeTraits() OffsetTraits { return Int32Traits }
4265
func (BinaryType) IsUtf8() bool { return false }
4366

4467
type StringType struct{}
4568

46-
func (t *StringType) ID() Type { return STRING }
47-
func (t *StringType) Name() string { return "utf8" }
48-
func (t *StringType) String() string { return "utf8" }
49-
func (t *StringType) binary() {}
50-
func (t *StringType) Fingerprint() string { return typeFingerprint(t) }
51-
func (t *StringType) Layout() DataTypeLayout {
52-
return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(),
53-
SpecFixedWidth(Int32SizeBytes), SpecVariableWidth()}}
54-
}
69+
func (t *StringType) ID() Type { return STRING }
70+
func (t *StringType) Name() string { return "utf8" }
71+
func (t *StringType) String() string { return "utf8" }
72+
func (t *StringType) binary() {}
73+
func (t *StringType) Fingerprint() string { return typeFingerprint(t) }
74+
func (t *StringType) Layout() DataTypeLayout { return stringTypeLayout }
5575
func (t *StringType) OffsetTypeTraits() OffsetTraits { return Int32Traits }
5676
func (StringType) IsUtf8() bool { return true }
5777

5878
type LargeBinaryType struct{}
5979

60-
func (t *LargeBinaryType) ID() Type { return LARGE_BINARY }
61-
func (t *LargeBinaryType) Name() string { return "large_binary" }
62-
func (t *LargeBinaryType) String() string { return "large_binary" }
63-
func (t *LargeBinaryType) binary() {}
64-
func (t *LargeBinaryType) Fingerprint() string { return typeFingerprint(t) }
65-
func (t *LargeBinaryType) Layout() DataTypeLayout {
66-
return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(),
67-
SpecFixedWidth(Int64SizeBytes), SpecVariableWidth()}}
68-
}
80+
func (t *LargeBinaryType) ID() Type { return LARGE_BINARY }
81+
func (t *LargeBinaryType) Name() string { return "large_binary" }
82+
func (t *LargeBinaryType) String() string { return "large_binary" }
83+
func (t *LargeBinaryType) binary() {}
84+
func (t *LargeBinaryType) Fingerprint() string { return typeFingerprint(t) }
85+
func (t *LargeBinaryType) Layout() DataTypeLayout { return largeBinaryTypeLayout }
6986
func (t *LargeBinaryType) OffsetTypeTraits() OffsetTraits { return Int64Traits }
7087
func (LargeBinaryType) IsUtf8() bool { return false }
7188

7289
type LargeStringType struct{}
7390

74-
func (t *LargeStringType) ID() Type { return LARGE_STRING }
75-
func (t *LargeStringType) Name() string { return "large_utf8" }
76-
func (t *LargeStringType) String() string { return "large_utf8" }
77-
func (t *LargeStringType) binary() {}
78-
func (t *LargeStringType) Fingerprint() string { return typeFingerprint(t) }
79-
func (t *LargeStringType) Layout() DataTypeLayout {
80-
return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(),
81-
SpecFixedWidth(Int64SizeBytes), SpecVariableWidth()}}
82-
}
91+
func (t *LargeStringType) ID() Type { return LARGE_STRING }
92+
func (t *LargeStringType) Name() string { return "large_utf8" }
93+
func (t *LargeStringType) String() string { return "large_utf8" }
94+
func (t *LargeStringType) binary() {}
95+
func (t *LargeStringType) Fingerprint() string { return typeFingerprint(t) }
96+
func (t *LargeStringType) Layout() DataTypeLayout { return largeStringTypeLayout }
8397
func (t *LargeStringType) OffsetTypeTraits() OffsetTraits { return Int64Traits }
8498
func (LargeStringType) IsUtf8() bool { return true }
8599

86100
type BinaryViewType struct{}
87101

88-
func (*BinaryViewType) ID() Type { return BINARY_VIEW }
89-
func (*BinaryViewType) Name() string { return "binary_view" }
90-
func (*BinaryViewType) String() string { return "binary_view" }
91-
func (*BinaryViewType) IsUtf8() bool { return false }
92-
func (*BinaryViewType) binary() {}
93-
func (*BinaryViewType) view() {}
94-
func (t *BinaryViewType) Fingerprint() string { return typeFingerprint(t) }
95-
func (*BinaryViewType) Layout() DataTypeLayout {
96-
variadic := SpecVariableWidth()
97-
return DataTypeLayout{
98-
Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(ViewHeaderSizeBytes)},
99-
VariadicSpec: &variadic,
100-
}
101-
}
102+
func (*BinaryViewType) ID() Type { return BINARY_VIEW }
103+
func (*BinaryViewType) Name() string { return "binary_view" }
104+
func (*BinaryViewType) String() string { return "binary_view" }
105+
func (*BinaryViewType) IsUtf8() bool { return false }
106+
func (*BinaryViewType) binary() {}
107+
func (*BinaryViewType) view() {}
108+
func (t *BinaryViewType) Fingerprint() string { return typeFingerprint(t) }
109+
func (*BinaryViewType) Layout() DataTypeLayout { return binaryViewTypeLayout }
102110

103111
type StringViewType struct{}
104112

105-
func (*StringViewType) ID() Type { return STRING_VIEW }
106-
func (*StringViewType) Name() string { return "string_view" }
107-
func (*StringViewType) String() string { return "string_view" }
108-
func (*StringViewType) IsUtf8() bool { return true }
109-
func (*StringViewType) binary() {}
110-
func (*StringViewType) view() {}
111-
func (t *StringViewType) Fingerprint() string { return typeFingerprint(t) }
112-
func (*StringViewType) Layout() DataTypeLayout {
113-
variadic := SpecVariableWidth()
114-
return DataTypeLayout{
115-
Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(ViewHeaderSizeBytes)},
116-
VariadicSpec: &variadic,
117-
}
118-
}
113+
func (*StringViewType) ID() Type { return STRING_VIEW }
114+
func (*StringViewType) Name() string { return "string_view" }
115+
func (*StringViewType) String() string { return "string_view" }
116+
func (*StringViewType) IsUtf8() bool { return true }
117+
func (*StringViewType) binary() {}
118+
func (*StringViewType) view() {}
119+
func (t *StringViewType) Fingerprint() string { return typeFingerprint(t) }
120+
func (*StringViewType) Layout() DataTypeLayout { return stringViewTypeLayout }
119121

120122
var (
121123
BinaryTypes = struct {

arrow/schema.go

Lines changed: 37 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -169,26 +169,38 @@ func NewSchema(fields []Field, metadata *Metadata) *Schema {
169169
}
170170

171171
func NewSchemaWithEndian(fields []Field, metadata *Metadata, e endian.Endianness) *Schema {
172+
var mdClone *Metadata
173+
if metadata != nil {
174+
md := metadata.clone()
175+
mdClone = &md
176+
}
177+
178+
fClone := make([]Field, len(fields))
179+
copy(fClone, fields)
180+
181+
return newSchema(fClone, mdClone, e)
182+
}
183+
184+
func newSchema(fields []Field, metadata *Metadata, e endian.Endianness) *Schema {
172185
sc := &Schema{
173-
fields: make([]Field, 0, len(fields)),
186+
fields: fields,
174187
index: make(map[string][]int, len(fields)),
175188
endianness: e,
176189
}
177190
if metadata != nil {
178-
sc.meta = metadata.clone()
191+
sc.meta = *metadata
179192
}
180193
for i, field := range fields {
181194
if field.Type == nil {
182195
panic("arrow: field with nil DataType")
183196
}
184-
sc.fields = append(sc.fields, field)
185197
sc.index[field.Name] = append(sc.index[field.Name], i)
186198
}
187199
return sc
188200
}
189201

190202
func (sc *Schema) WithEndianness(e endian.Endianness) *Schema {
191-
return NewSchemaWithEndian(sc.fields, &sc.meta, e)
203+
return newSchema(sc.fields, &sc.meta, e)
192204
}
193205

194206
func (sc *Schema) Endianness() endian.Endianness { return sc.endianness }
@@ -207,11 +219,17 @@ func (sc *Schema) FieldsByName(n string) ([]Field, bool) {
207219
if !ok {
208220
return nil, ok
209221
}
210-
fields := make([]Field, 0, len(indices))
211-
for _, v := range indices {
212-
fields = append(fields, sc.fields[v])
222+
if len(indices) == 1 {
223+
return sc.fields[indices[0] : indices[0]+1], ok
224+
} else if len(indices) > 1 {
225+
fields := make([]Field, 0, len(indices))
226+
for _, v := range indices {
227+
fields = append(fields, sc.fields[v])
228+
}
229+
return fields, ok
213230
}
214-
return fields, ok
231+
232+
return nil, false
215233
}
216234

217235
// FieldIndices returns the indices of the named field or nil.
@@ -250,11 +268,17 @@ func (s *Schema) AddField(i int, field Field) (*Schema, error) {
250268
return nil, fmt.Errorf("arrow: invalid field index %d", i)
251269
}
252270

253-
fields := make([]Field, len(s.fields)+1)
254-
copy(fields[:i], s.fields[:i])
255-
fields[i] = field
256-
copy(fields[i+1:], s.fields[i:])
257-
return NewSchema(fields, &s.meta), nil
271+
var fields []Field
272+
if i == len(s.fields) {
273+
fields = append(s.fields, field)
274+
} else {
275+
fields = make([]Field, len(s.fields)+1)
276+
copy(fields[:i], s.fields[:i])
277+
fields[i] = field
278+
copy(fields[i+1:], s.fields[i:])
279+
}
280+
281+
return newSchema(fields, &s.meta, s.endianness), nil
258282
}
259283

260284
func (s *Schema) String() string {

0 commit comments

Comments
 (0)