Skip to content

Commit e6a894d

Browse files
committed
Update parquet-go and drop string workaround
As of v0.26.0, parquet-go properly handles optional []byte data, so we no longer need to store it as a string.

```
go get github.com/parquet-go/parquet-go
go mod tidy
```

Using a []byte also means the test code can check for nil, which is what we actually want, rather than for a zero-length string.
1 parent 5d4bae0 commit e6a894d

File tree

4 files changed

+21
-20
lines changed

4 files changed

+21
-20
lines changed

go.mod

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ require (
1212
github.com/hashicorp/golang-lru/v2 v2.0.7
1313
github.com/lestrrat-go/jwx/v2 v2.1.6
1414
github.com/miekg/dns v1.1.68
15-
github.com/parquet-go/parquet-go v0.25.1
15+
github.com/parquet-go/parquet-go v0.26.0
1616
github.com/prometheus/client_golang v1.23.2
1717
github.com/segmentio/go-hll v1.0.1
1818
github.com/smhanov/dawg v0.0.0-20220118194912-66057bdbf2e3
@@ -65,6 +65,8 @@ require (
6565
github.com/lestrrat-go/option v1.0.1 // indirect
6666
github.com/lestrrat-go/option/v2 v2.0.0 // indirect
6767
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
68+
github.com/parquet-go/bitpack v0.2.0 // indirect
69+
github.com/parquet-go/jsonlite v0.8.1 // indirect
6870
github.com/pelletier/go-toml/v2 v2.2.4 // indirect
6971
github.com/pierrec/lz4/v4 v4.1.22 // indirect
7072
github.com/pkg/errors v0.9.1 // indirect

go.sum

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,12 @@ github.com/miekg/dns v1.1.68 h1:jsSRkNozw7G/mnmXULynzMNIsgY2dHC8LO6U6Ij2JEA=
127127
github.com/miekg/dns v1.1.68/go.mod h1:fujopn7TB3Pu3JM69XaawiU0wqjpL9/8xGop5UrTPps=
128128
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
129129
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
130-
github.com/parquet-go/parquet-go v0.25.1 h1:l7jJwNM0xrk0cnIIptWMtnSnuxRkwq53S+Po3KG8Xgo=
131-
github.com/parquet-go/parquet-go v0.25.1/go.mod h1:AXBuotO1XiBtcqJb/FKFyjBG4aqa3aQAAWF3ZPzCanY=
130+
github.com/parquet-go/bitpack v0.2.0 h1:1qA39QcA+HeExChZOATm78XMs5W2NY/Y2l17M5kDUuE=
131+
github.com/parquet-go/bitpack v0.2.0/go.mod h1:XnVk9TH+O40eOOmvpAVZ7K2ocQFrQwysLMnc6M/8lgs=
132+
github.com/parquet-go/jsonlite v0.8.1 h1:TdvfyPaVLTlz/Zsl+amWO4h0tpEwXwRkd7xa4iPhL5E=
133+
github.com/parquet-go/jsonlite v0.8.1/go.mod h1:nDjpkpL4EOtqs6NQugUsi0Rleq9sW/OtC1NnZEnxzF0=
134+
github.com/parquet-go/parquet-go v0.26.0 h1:5rWuYYCKouRlo1kLihNAcw2+mb/OLJhIZjjpFu1lX9k=
135+
github.com/parquet-go/parquet-go v0.26.0/go.mod h1:7K8PVhWjeOLCtcV0cT3DFMfegbcM9uwvVNc2F+Cmsw4=
132136
github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4=
133137
github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY=
134138
github.com/pierrec/lz4/v4 v4.1.22 h1:cKFw6uJDK+/gfw5BcDL0JL5aBsAFdsIT18eRtLj7VIU=

pkg/runner/runner.go

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -167,15 +167,10 @@ type histogramData struct {
167167
V4ClientCount uint64 `parquet:"v4client_count"`
168168
V6ClientCount uint64 `parquet:"v6client_count"`
169169

170-
// Would probably be cleaner to use a []byte instead of string with
171-
// struct tag "bytes" here, but it seems the parquet-go library does
172-
// not handle "optional" []byte fields correctly right now, see:
173-
// https://github.com/parquet-go/parquet-go/issues/303
174-
//
175170
// These fields are NULL when HLL uses explicit storage, otherwise
176171
// contain the probabilistic HLL bytes
177-
V4ClientCountHLLBytes string `parquet:"v4client_count_hll,bytes,optional"`
178-
V6ClientCountHLLBytes string `parquet:"v6client_count_hll,bytes,optional"`
172+
V4ClientCountHLLBytes []byte `parquet:"v4client_count_hll,optional"`
173+
V6ClientCountHLLBytes []byte `parquet:"v6client_count_hll,optional"`
179174
}
180175

181176
// We need to create the session data schema by hand instead of basing it off
@@ -2720,10 +2715,10 @@ func (edm *dnstapMinimiser) writeHistogramParquet(output io.Writer, startTime ti
27202715

27212716
// Include bytes from our hll data structures if they are stored with a probabilistic storage type
27222717
if v4HLLType == hllSparse || v4HLLType == hllDense {
2723-
hGramData.V4ClientCountHLLBytes = string(v4HLLBytes)
2718+
hGramData.V4ClientCountHLLBytes = v4HLLBytes
27242719
}
27252720
if v6HLLType == hllSparse || v6HLLType == hllDense {
2726-
hGramData.V6ClientCountHLLBytes = string(v6HLLBytes)
2721+
hGramData.V6ClientCountHLLBytes = v6HLLBytes
27272722
}
27282723

27292724
_, err = parquetWriter.Write([]histogramData{*hGramData})

pkg/runner/runner_test.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1879,8 +1879,8 @@ func TestHistogramWriter(t *testing.T) {
18791879
FailCount: 19,
18801880
OtherRcodeCount: 20,
18811881
EDMStatusBits: 21,
1882-
V4ClientCountHLLBytes: string(v4hll.ToBytes()),
1883-
V6ClientCountHLLBytes: string(v6hll.ToBytes()),
1882+
V4ClientCountHLLBytes: v4hll.ToBytes(),
1883+
V6ClientCountHLLBytes: v6hll.ToBytes(),
18841884
}
18851885

18861886
_, err = parquetWriter.Write([]histogramData{hd})
@@ -1957,8 +1957,8 @@ func BenchmarkHistogramWriter(b *testing.B) {
19571957
FailCount: 19,
19581958
OtherRcodeCount: 20,
19591959
EDMStatusBits: 21,
1960-
V4ClientCountHLLBytes: string(v4hll.ToBytes()),
1961-
V6ClientCountHLLBytes: string(v6hll.ToBytes()),
1960+
V4ClientCountHLLBytes: v4hll.ToBytes(),
1961+
V6ClientCountHLLBytes: v6hll.ToBytes(),
19621962
}
19631963

19641964
for b.Loop() {
@@ -2130,14 +2130,14 @@ func TestWriteHistogramParquetExplicitThreshold(t *testing.T) {
21302130
}
21312131

21322132
for _, row := range rows {
2133-
if test.ipv4HllIsNull && len(row.V4ClientCountHLLBytes) != 0 {
2134-
t.Fatalf("IPv4 HLL data should be length 0 but is %d", len(row.V4ClientCountHLLBytes))
2133+
if test.ipv4HllIsNull && row.V4ClientCountHLLBytes != nil {
2134+
t.Fatalf("IPv4 HLL data should be nil but is %#v", row.V4ClientCountHLLBytes)
21352135
}
21362136
if !test.ipv4HllIsNull && len(row.V4ClientCountHLLBytes) == 0 {
21372137
t.Fatal("IPv4 HLL data is 0 when it should have content")
21382138
}
2139-
if test.ipv6HllIsNull && len(row.V6ClientCountHLLBytes) != 0 {
2140-
t.Fatalf("IPv6 HLL data should be length 0 but is %d", len(row.V6ClientCountHLLBytes))
2139+
if test.ipv6HllIsNull && row.V6ClientCountHLLBytes != nil {
2140+
t.Fatalf("IPv6 HLL data should be nil but is %#v", row.V6ClientCountHLLBytes)
21412141
}
21422142
if !test.ipv6HllIsNull && len(row.V6ClientCountHLLBytes) == 0 {
21432143
t.Fatal("IPv6 HLL data is 0 when it should have content")

0 commit comments

Comments
 (0)