Skip to content

Commit 4e458f2

Browse files
Fixed CompressedBin algo, updated heuristic based on benchmarks
1 parent 20a7a77 commit 4e458f2

File tree

3 files changed

+88
-23
lines changed

3 files changed

+88
-23
lines changed

algo/uidlist.go

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ func IntersectCompressedWith(pack *pb.UidPack, afterUID uint64, v, o *pb.List) {
6060

6161
// Select appropriate function based on heuristics.
6262
ratio := float64(m) / float64(n)
63-
if ratio < 500 {
63+
if ratio < 100 {
6464
IntersectCompressedWithLinJump(&dec, v.Uids, &dst)
6565
} else {
6666
IntersectCompressedWithBin(&dec, v.Uids, &dst)
@@ -94,7 +94,7 @@ func IntersectCompressedWithLinJump(dec *codec.Decoder, v []uint64, o *[]uint64)
9494
// https://link.springer.com/chapter/10.1007/978-3-642-12476-1_3
9595
// Call seek on dec before calling this function
9696
func IntersectCompressedWithBin(dec *codec.Decoder, q []uint64, o *[]uint64) {
97-
ld := dec.ApproxLen()
97+
ld := codec.ExactLen(dec.Pack)
9898
lq := len(q)
9999

100100
if lq == 0 {
@@ -105,13 +105,19 @@ func IntersectCompressedWithBin(dec *codec.Decoder, q []uint64, o *[]uint64) {
105105
}
106106

107107
// Pick the shorter list and do binary search
108-
if ld < lq {
108+
if ld <= lq {
109109
for {
110110
blockUids := dec.Uids()
111111
if len(blockUids) == 0 {
112112
break
113113
}
114-
IntersectWithBin(blockUids, q, o)
114+
if ld*10 < lq {
115+
IntersectWithBin(blockUids, q, o)
116+
} else {
117+
// When the two arrays differ in length by at most a factor of 10, we should
118+
// just do a linear intersect instead of binary searching.
119+
IntersectWithLin(blockUids, q, o)
120+
}
115121
lastUid := blockUids[len(blockUids)-1]
116122
qidx := sort.Search(len(q), func(idx int) bool {
117123
return q[idx] >= lastUid
@@ -125,26 +131,29 @@ func IntersectCompressedWithBin(dec *codec.Decoder, q []uint64, o *[]uint64) {
125131
return
126132
}
127133

128-
var uids []uint64
129-
for _, u := range q {
134+
uids := dec.Uids()
135+
qidx := -1
136+
for {
137+
qidx += 1
138+
if qidx >= len(q) {
139+
return
140+
}
141+
u := q[qidx]
130142
if len(uids) == 0 || u > uids[len(uids)-1] {
131-
uids = dec.Seek(u, codec.SeekStart)
143+
if lq*10 < ld {
144+
uids = dec.LinearSeek(u)
145+
} else {
146+
uids = dec.SeekToBlock(u, codec.SeekStart)
147+
}
132148
if len(uids) == 0 {
133149
return
134150
}
135151
}
136-
uidIdx := sort.Search(len(uids), func(idx int) bool {
137-
return uids[idx] >= u
138-
})
139-
if uidIdx >= len(uids) {
140-
// We know that u < max(uids). If we didn't find it here, it's not here.
141-
continue
142-
}
143-
if uids[uidIdx] == u {
144-
*o = append(*o, u)
145-
uidIdx++
152+
_, off := IntersectWithJump(uids, q[qidx:], o)
153+
if off == 0 {
154+
off = 1 // if v[k] isn't in u, move forward
146155
}
147-
uids = uids[uidIdx:]
156+
qidx += off
148157
}
149158
}
150159

algo/uidlist_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -373,7 +373,7 @@ func BenchmarkListIntersectCompressBin(b *testing.B) {
373373
for _, r := range rs {
374374
sz1 := sz
375375
sz2 := int(float64(sz) * r)
376-
if sz2 > 1000000 || sz2 == 0 {
376+
if sz2 > 10000000 || sz2 == 0 {
377377
break
378378
}
379379

codec/codec.go

Lines changed: 60 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,59 @@ func (d *Decoder) ApproxLen() int {
223223

224224
type searchFunc func(int) bool
225225

226+
// SeekToBlock will find the nearest block, and unpack it. Unlike Seek, it doesn't
227+
// apply search in the resulting uid list and then move the pointer forward. When we are going
228+
// to intersect the list later, this function is useful.
229+
func (d *Decoder) SeekToBlock(uid uint64, whence seekPos) []uint64 {
230+
if d.Pack == nil {
231+
return []uint64{}
232+
}
233+
prevBlockIdx := d.blockIdx
234+
d.blockIdx = 0
235+
if uid == 0 {
236+
return d.UnpackBlock()
237+
}
238+
239+
pack := d.Pack
240+
blocksFunc := func() searchFunc {
241+
var f searchFunc
242+
switch whence {
243+
case SeekStart:
244+
f = func(i int) bool { return pack.Blocks[i+prevBlockIdx].Base >= uid }
245+
case SeekCurrent:
246+
f = func(i int) bool { return pack.Blocks[i+prevBlockIdx].Base > uid }
247+
}
248+
return f
249+
}
250+
251+
idx := sort.Search(len(pack.Blocks[prevBlockIdx:]), blocksFunc()) + prevBlockIdx
252+
// The first block.Base >= uid.
253+
if idx == 0 {
254+
return d.UnpackBlock()
255+
}
256+
// The uid is the first entry in the block.
257+
if idx < len(pack.Blocks) && pack.Blocks[idx].Base == uid {
258+
d.blockIdx = idx
259+
return d.UnpackBlock()
260+
}
261+
262+
// Either the idx = len(pack.Blocks) that means it wasn't found in any of the block's base. Or,
263+
// we found the first block index whose base is greater than uid. In these cases, go to the
264+
// previous block and search there.
265+
d.blockIdx = idx - 1 // Move to the previous block. If blockIdx<0, unpack will deal with it.
266+
if d.blockIdx != prevBlockIdx {
267+
d.UnpackBlock() // And get all their uids.
268+
}
269+
270+
if uid < d.uids[len(d.uids)-1] {
271+
return d.uids
272+
}
273+
274+
// Could not find any uid in the block, which is >= uid. The next block might still have valid
275+
// entries > uid.
276+
return d.Next()
277+
}
278+
226279
// Seek will search for uid in a packed block using the specified whence position.
227280
// The value of whence must be one of the predefined values SeekStart or SeekCurrent.
228281
// SeekStart searches uid and includes it as part of the results.
@@ -233,6 +286,7 @@ func (d *Decoder) Seek(uid uint64, whence seekPos) []uint64 {
233286
if d.Pack == nil {
234287
return []uint64{}
235288
}
289+
prevBlockIdx := d.blockIdx
236290
d.blockIdx = 0
237291
if uid == 0 {
238292
return d.UnpackBlock()
@@ -243,14 +297,14 @@ func (d *Decoder) Seek(uid uint64, whence seekPos) []uint64 {
243297
var f searchFunc
244298
switch whence {
245299
case SeekStart:
246-
f = func(i int) bool { return pack.Blocks[i].Base >= uid }
300+
f = func(i int) bool { return pack.Blocks[i+prevBlockIdx].Base >= uid }
247301
case SeekCurrent:
248-
f = func(i int) bool { return pack.Blocks[i].Base > uid }
302+
f = func(i int) bool { return pack.Blocks[i+prevBlockIdx].Base > uid }
249303
}
250304
return f
251305
}
252306

253-
idx := sort.Search(len(pack.Blocks), blocksFunc())
307+
idx := sort.Search(len(pack.Blocks[prevBlockIdx:]), blocksFunc()) + prevBlockIdx
254308
// The first block.Base >= uid.
255309
if idx == 0 {
256310
return d.UnpackBlock()
@@ -265,7 +319,9 @@ func (d *Decoder) Seek(uid uint64, whence seekPos) []uint64 {
265319
// we found the first block index whose base is greater than uid. In these cases, go to the
266320
// previous block and search there.
267321
d.blockIdx = idx - 1 // Move to the previous block. If blockIdx<0, unpack will deal with it.
268-
d.UnpackBlock() // And get all their uids.
322+
if d.blockIdx != prevBlockIdx {
323+
d.UnpackBlock() // And get all their uids.
324+
}
269325

270326
uidsFunc := func() searchFunc {
271327
var f searchFunc

0 commit comments

Comments
 (0)