Skip to content

Commit 4586c71

Browse files
authored
runtime/vam: return error on cast of invalid UTF-8 bytes to string (#6273)
The sequential runtime returns a structured error when a bytes value containing invalid UTF-8 is cast to a string. Do the same in the vector runtime. The change in runtime/vam/expr/cast.castToString implements the new behavior. The remaining changes just add a msgSuffix parameter to errCastFailed so castToString can produce the same error message as the sequential runtime. There are some possible optimizations here, but they can wait until castToString appears in CPU profiles. The test coverage added in runtime/ztests/expr/cast/string.yaml duplicates runtime/sam/expr/ztests/cast-bytes-string-err.yaml, so remove the latter file.
1 parent 2d64111 commit 4586c71

File tree

9 files changed

+67
-50
lines changed

9 files changed

+67
-50
lines changed

runtime/sam/expr/ztests/cast-bytes-string-err.yaml

Lines changed: 0 additions & 7 deletions
This file was deleted.

runtime/vam/expr/cast/bool.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import (
66
"github.com/brimdata/super/vector/bitvec"
77
)
88

9-
func castToBool(vec vector.Any, index []uint32) (vector.Any, []uint32, bool) {
9+
func castToBool(vec vector.Any, index []uint32) (vector.Any, []uint32, string, bool) {
1010
var out *vector.Bool
1111
switch vec := vec.(type) {
1212
case *vector.Int:
@@ -17,17 +17,17 @@ func castToBool(vec vector.Any, index []uint32) (vector.Any, []uint32, bool) {
1717
out = numberToBool(vec.Values, index)
1818
case *vector.String:
1919
vvec, errs := stringToBool(vec, index)
20-
return vvec, errs, true
20+
return vvec, errs, "", true
2121
default:
22-
return nil, nil, false
22+
return nil, nil, "", false
2323
}
2424
nulls := vector.NullsOf(vec)
2525
if index == nil {
2626
out.Nulls = nulls
2727
} else {
2828
out.Nulls = nulls.Pick(index)
2929
}
30-
return out, nil, true
30+
return out, nil, "", true
3131
}
3232

3333
func numberToBool[E numeric](s []E, index []uint32) *vector.Bool {

runtime/vam/expr/cast/bytes.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,14 @@ import (
44
"github.com/brimdata/super/vector"
55
)
66

7-
func castToBytes(vec vector.Any, index []uint32) (vector.Any, []uint32, bool) {
7+
func castToBytes(vec vector.Any, index []uint32) (vector.Any, []uint32, string, bool) {
88
strVec, ok := vec.(*vector.String)
99
if !ok {
10-
return nil, nil, false
10+
return nil, nil, "", false
1111
}
1212
out := vector.Any(vector.NewBytes(strVec.Table(), vector.NullsOf(strVec)))
1313
if index != nil {
1414
out = vector.Pick(out, index)
1515
}
16-
return out, nil, true
16+
return out, nil, "", true
1717
}

runtime/vam/expr/cast/cast.go

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ func To(sctx *super.Context, vec vector.Any, typ super.Type) vector.Any {
1313
var c caster
1414
id := typ.ID()
1515
if super.IsNumber(id) {
16-
c = func(vec vector.Any, index []uint32) (vector.Any, []uint32, bool) {
16+
c = func(vec vector.Any, index []uint32) (vector.Any, []uint32, string, bool) {
1717
return castToNumber(vec, typ, index)
1818
}
1919
} else {
@@ -29,29 +29,30 @@ func To(sctx *super.Context, vec vector.Any, typ super.Type) vector.Any {
2929
case super.IDNet:
3030
c = castToNet
3131
case super.IDType:
32-
c = func(vec vector.Any, index []uint32) (vector.Any, []uint32, bool) {
32+
c = func(vec vector.Any, index []uint32) (vector.Any, []uint32, string, bool) {
3333
return castToType(sctx, vec, index)
3434
}
3535
default:
36-
return errCastFailed(sctx, vec, typ)
36+
return errCastFailed(sctx, vec, typ, "")
3737
}
3838
}
3939
return assemble(sctx, vec, typ, c)
4040
}
4141

42-
type caster func(vector.Any, []uint32) (vector.Any, []uint32, bool)
42+
type caster func(vector.Any, []uint32) (vector.Any, []uint32, string, bool)
4343

4444
func assemble(sctx *super.Context, vec vector.Any, typ super.Type, fn caster) vector.Any {
4545
var out vector.Any
4646
var errs []uint32
47+
var errMsg string
4748
var ok bool
4849
switch vec := vec.(type) {
4950
case *vector.Const:
5051
return castConst(sctx, vec, typ)
5152
case *vector.View:
52-
out, errs, ok = fn(vec.Any, vec.Index)
53+
out, errs, errMsg, ok = fn(vec.Any, vec.Index)
5354
case *vector.Dict:
54-
out, errs, ok = fn(vec.Any, nil)
55+
out, errs, errMsg, ok = fn(vec.Any, nil)
5556
if ok {
5657
if len(errs) > 0 {
5758
index, counts, nulls, nerrs := vec.RebuildDropTags(errs...)
@@ -62,13 +63,13 @@ func assemble(sctx *super.Context, vec vector.Any, typ super.Type, fn caster) ve
6263
}
6364
}
6465
default:
65-
out, errs, ok = fn(vec, nil)
66+
out, errs, errMsg, ok = fn(vec, nil)
6667
}
6768
if !ok {
68-
return errCastFailed(sctx, vec, typ)
69+
return errCastFailed(sctx, vec, typ, errMsg)
6970
}
7071
if len(errs) > 0 {
71-
return vector.Combine(out, errs, errCastFailed(sctx, vector.Pick(vec, errs), typ))
72+
return vector.Combine(out, errs, errCastFailed(sctx, vector.Pick(vec, errs), typ, errMsg))
7273
}
7374
return out
7475
}
@@ -88,17 +89,21 @@ func castConst(sctx *super.Context, vec *vector.Const, typ super.Type) vector.An
8889
trueCount++
8990
}
9091
}
91-
err := errCastFailed(sctx, vector.NewConst(vec.Value(), vec.Len()-trueCount, bitvec.Zero), typ)
92+
err := errCastFailed(sctx, vector.NewConst(vec.Value(), vec.Len()-trueCount, bitvec.Zero), typ, "")
9293
nulls := vector.NewConst(super.NewValue(typ, nil), trueCount, bitvec.Zero)
9394
return vector.NewDynamic(index, []vector.Any{err, nulls})
9495
}
95-
return errCastFailed(sctx, vec, typ)
96+
return errCastFailed(sctx, vec, typ, "")
9697
}
9798
return vector.NewConst(val, vec.Len(), vec.Nulls)
9899
}
99100

100-
func errCastFailed(sctx *super.Context, vec vector.Any, typ super.Type) vector.Any {
101-
return vector.NewWrappedError(sctx, "cannot cast to "+sup.FormatType(typ), vec)
101+
func errCastFailed(sctx *super.Context, vec vector.Any, typ super.Type, msgSuffix string) vector.Any {
102+
msg := "cannot cast to " + sup.FormatType(typ)
103+
if msgSuffix != "" {
104+
msg = msg + ": " + msgSuffix
105+
}
106+
return vector.NewWrappedError(sctx, msg, vec)
102107
}
103108

104109
func lengthOf(vec vector.Any, index []uint32) uint32 {

runtime/vam/expr/cast/ip.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,10 @@ import (
88
"github.com/brimdata/super/vector/bitvec"
99
)
1010

11-
func castToIP(vec vector.Any, index []uint32) (vector.Any, []uint32, bool) {
11+
func castToIP(vec vector.Any, index []uint32) (vector.Any, []uint32, string, bool) {
1212
switch vec := vec.(type) {
1313
case *vector.IP:
14-
return vec, nil, true
14+
return vec, nil, "", true
1515
case *vector.String:
1616
n := lengthOf(vec, index)
1717
var nulls bitvec.Bits
@@ -37,16 +37,16 @@ func castToIP(vec vector.Any, index []uint32) (vector.Any, []uint32, bool) {
3737
}
3838
ips = append(ips, ip)
3939
}
40-
return vector.NewIP(ips, nulls), errs, true
40+
return vector.NewIP(ips, nulls), errs, "", true
4141
default:
42-
return nil, nil, false
42+
return nil, nil, "", false
4343
}
4444
}
4545

46-
func castToNet(vec vector.Any, index []uint32) (vector.Any, []uint32, bool) {
46+
func castToNet(vec vector.Any, index []uint32) (vector.Any, []uint32, string, bool) {
4747
switch vec := vec.(type) {
4848
case *vector.Net:
49-
return vec, nil, true
49+
return vec, nil, "", true
5050
case *vector.String:
5151
n := lengthOf(vec, index)
5252
var nulls bitvec.Bits
@@ -72,8 +72,8 @@ func castToNet(vec vector.Any, index []uint32) (vector.Any, []uint32, bool) {
7272
}
7373
nets = append(nets, net)
7474
}
75-
return vector.NewNet(nets, nulls), errs, true
75+
return vector.NewNet(nets, nulls), errs, "", true
7676
default:
77-
return nil, nil, false
77+
return nil, nil, "", false
7878
}
7979
}

runtime/vam/expr/cast/number.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,10 @@ type numeric interface {
1717
constraints.Float | constraints.Integer
1818
}
1919

20-
func castToNumber(vec vector.Any, typ super.Type, index []uint32) (vector.Any, []uint32, bool) {
20+
func castToNumber(vec vector.Any, typ super.Type, index []uint32) (vector.Any, []uint32, string, bool) {
2121
if vec.Type().ID() == super.IDString {
2222
out, errs := castStringToNumber(vec, typ, index)
23-
return out, errs, true
23+
return out, errs, "", true
2424
}
2525
nulls := vector.NullsOf(vec)
2626
if index != nil {
@@ -32,21 +32,21 @@ func castToNumber(vec vector.Any, typ super.Type, index []uint32) (vector.Any, [
3232
if len(errs) > 0 {
3333
nulls = nulls.Pick(inverseIndex(errs, nulls.Len()))
3434
}
35-
return vector.NewInt(typ, vals, nulls), errs, true
35+
return vector.NewInt(typ, vals, nulls), errs, "", true
3636
case super.IsUnsigned(id):
3737
vals, errs := toNumeric[uint64](vec, typ, index)
3838
if len(errs) > 0 {
3939
nulls = nulls.Pick(inverseIndex(errs, nulls.Len()))
4040
}
41-
return vector.NewUint(typ, vals, nulls), errs, true
41+
return vector.NewUint(typ, vals, nulls), errs, "", true
4242
case super.IsFloat(id):
4343
vals, errs := toNumeric[float64](vec, typ, index)
4444
if errs != nil {
4545
nulls = nulls.Pick(inverseIndex(errs, nulls.Len()))
4646
}
47-
return vector.NewFloat(typ, vals, nulls), errs, true
47+
return vector.NewFloat(typ, vals, nulls), errs, "", true
4848
default:
49-
return nil, nil, false
49+
return nil, nil, "", false
5050
}
5151
}
5252

runtime/vam/expr/cast/string.go

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package cast
33
import (
44
"strconv"
55
"time"
6+
"unicode/utf8"
67

78
"github.com/brimdata/super"
89
"github.com/brimdata/super/pkg/nano"
@@ -11,7 +12,7 @@ import (
1112
"github.com/brimdata/super/vector"
1213
)
1314

14-
func castToString(vec vector.Any, index []uint32) (vector.Any, []uint32, bool) {
15+
func castToString(vec vector.Any, index []uint32) (vector.Any, []uint32, string, bool) {
1516
nulls := vector.NullsOf(vec)
1617
if index != nil {
1718
nulls = nulls.Pick(index)
@@ -56,21 +57,35 @@ func castToString(vec vector.Any, index []uint32) (vector.Any, []uint32, bool) {
5657
}
5758
case *vector.String:
5859
if index == nil {
59-
return vec, nil, true
60+
return vec, nil, "", true
6061
}
6162
for _, idx := range index {
6263
bytes = append(bytes, vec.Value(idx)...)
6364
offs = append(offs, uint32(len(bytes)))
6465
}
6566
case *vector.Bytes:
67+
var errs []uint32
6668
for i := range n {
6769
idx := i
6870
if index != nil {
6971
idx = index[i]
7072
}
71-
bytes = append(bytes, vec.Value(idx)...)
72-
offs = append(offs, uint32(len(bytes)))
73+
if !utf8.Valid(vec.Value(idx)) {
74+
errs = append(errs, i)
75+
}
76+
}
77+
const errMsg = "invalid UTF-8"
78+
if len(errs) == int(n) {
79+
return nil, nil, errMsg, false
80+
}
81+
out := vector.Any(vector.NewString(vec.Table(), vec.Nulls))
82+
if index != nil {
83+
out = vector.Pick(out, index)
84+
}
85+
if len(errs) > 0 {
86+
out = vector.ReversePick(out, errs)
7387
}
88+
return out, errs, errMsg, true
7489
case *vector.IP:
7590
for i := range n {
7691
idx := i
@@ -115,7 +130,7 @@ func castToString(vec vector.Any, index []uint32) (vector.Any, []uint32, bool) {
115130
offs = append(offs, uint32(len(bytes)))
116131
}
117132
}
118-
return vector.NewString(vector.NewBytesTable(offs, bytes), nulls), nil, true
133+
return vector.NewString(vector.NewBytesTable(offs, bytes), nulls), nil, "", true
119134
}
120135

121136
func timeToString(vec *vector.Int, index []uint32, n uint32) ([]uint32, []byte) {

runtime/vam/expr/cast/type.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@ import (
77
"github.com/brimdata/super/vector/bitvec"
88
)
99

10-
func castToType(sctx *super.Context, vec vector.Any, index []uint32) (vector.Any, []uint32, bool) {
10+
func castToType(sctx *super.Context, vec vector.Any, index []uint32) (vector.Any, []uint32, string, bool) {
1111
switch vec := vec.(type) {
1212
case *vector.TypeValue:
13-
return vec, nil, true
13+
return vec, nil, "", true
1414
case *vector.String:
1515
n := lengthOf(vec, index)
1616
out := vector.NewTypeValueEmpty(0, bitvec.Zero)
@@ -36,8 +36,8 @@ func castToType(sctx *super.Context, vec vector.Any, index []uint32) (vector.Any
3636
}
3737
out.Append(val.Bytes())
3838
}
39-
return out, errs, true
39+
return out, errs, "", true
4040
default:
41-
return nil, nil, false
41+
return nil, nil, "", false
4242
}
4343
}

runtime/ztests/expr/cast/string.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ input: |
2121
1.2.3.4/32
2222
0x68692c20776f726c64
2323
0x666f6f20626172
24+
0x00
25+
0xc328
2426
"hi, world"
2527
"foo bar"
2628
{foo:"bar"}
@@ -46,6 +48,8 @@ output: |
4648
"1.2.3.4/32"
4749
"hi, world"
4850
"foo bar"
51+
"\u0000"
52+
error({message:"cannot cast to string: invalid UTF-8",on:0xc328})
4953
"hi, world"
5054
"foo bar"
5155
"{foo:\"bar\"}"

0 commit comments

Comments
 (0)