Skip to content

Commit 30ed3df

Browse files
author
kira
committed
zilliz streaming
1 parent e1bbf5c commit 30ed3df

File tree

4 files changed

+189
-0
lines changed

4 files changed

+189
-0
lines changed

.github/workflows/neurips23.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,9 @@ jobs:
105105
- algorithm: pinecone
106106
dataset: random-filter-s
107107
track: filter
108+
- algorithm: zilliz
109+
dataset: random-xs
110+
track: streaming
108111
fail-fast: false
109112

110113
steps:
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
FROM neurips23

# A newer git from the git-core PPA is required before cloning below.
# apt-get (not apt) is used: apt's CLI is not stable for scripted use.
RUN apt-get update \
    && apt-get install -y software-properties-common \
    && add-apt-repository -y ppa:git-core/ppa

# Build/runtime dependencies for the Zilliz streaming submission; the apt
# cache is removed in the same layer to keep the image small.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        git make cmake g++ libaio-dev libgoogle-perftools-dev libunwind-dev \
        clang-format libboost-dev libboost-program-options-dev \
        libmkl-full-dev libcpprest-dev python3.10 \
    && rm -rf /var/lib/apt/lists/*

# The 'streaming' branch ships prebuilt wheels; install them directly.
RUN git clone https://github.com/hhy3/zilliz-bigann.git --branch streaming \
    && pip install ./zilliz-bigann/*.whl
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Run configuration for the Zilliz entry on the streaming track.
# Each dataset maps to the same module/constructor; only the build args
# (R = graph degree, L = build search-list size) and query args
# (Ls = query search-list size, T = search threads) differ per dataset.
random-xs-clustered:
  zilliz:
    docker-tag: neurips23-streaming-zilliz
    module: neurips23.streaming.zilliz.zilliz
    constructor: Zilliz
    base-args: ["@metric"]
    run-groups:
      base:
        args: |
          [{"R":32, "L":100, "insert_threads":16, "consolidate_threads":16}]
        query-args: |
          [{"Ls":200, "T":8}]

random-xs:
  zilliz:
    docker-tag: neurips23-streaming-zilliz
    module: neurips23.streaming.zilliz.zilliz
    constructor: Zilliz
    base-args: ["@metric"]
    run-groups:
      base:
        args: |
          [{"R":32, "L":50, "insert_threads":16, "consolidate_threads":16}]
        query-args: |
          [{"Ls":50, "T":8}]

msturing-10M-clustered:
  zilliz:
    docker-tag: neurips23-streaming-zilliz
    module: neurips23.streaming.zilliz.zilliz
    constructor: Zilliz
    base-args: ["@metric"]
    run-groups:
      base:
        args: |
          [{"R":16, "L":10, "insert_threads":8, "consolidate_threads":8}]
        query-args: |
          [
            {"Ls":100, "T":8}
          ]

msturing-30M-clustered:
  zilliz:
    docker-tag: neurips23-streaming-zilliz
    module: neurips23.streaming.zilliz.zilliz
    constructor: Zilliz
    base-args: ["@metric"]
    run-groups:
      base:
        args: |
          [
            {"R":32, "L":100, "insert_threads":8, "consolidate_threads":8}
          ]
        query-args: |
          [
            {"Ls":450, "T":8},
            {"Ls":500, "T":8},
            {"Ls":530, "T":8},
            {"Ls":550, "T":8}
          ]
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
from __future__ import absolute_import
2+
import psutil
3+
import os
4+
import time
5+
import numpy as np
6+
7+
import diskannpy
8+
import fast_refine
9+
10+
from neurips23.streaming.base import BaseStreamingANN
11+
12+
class Zilliz(BaseStreamingANN):
    """Zilliz entry for the NeurIPS'23 streaming track.

    Maintains a ``diskannpy.DynamicMemoryIndex`` over uint8-quantized copies
    of the vectors, plus a ``fast_refine.Refiner`` that keeps the original
    float vectors so the candidates returned by the quantized index can be
    re-ranked exactly at query time.
    """

    def __init__(self, metric, index_params):
        # NOTE(review): the harness-visible name is "pyanns", not "zilliz";
        # kept as-is because result labelling may depend on it.
        self.name = "pyanns"
        if index_params.get("R") is None:
            print("Error: missing parameter R")
            return
        if index_params.get("L") is None:
            print("Error: missing parameter L")
            return
        self._index_params = index_params
        self._metric = metric

        self.R = index_params.get("R")  # graph degree
        self.L = index_params.get("L")  # build-time search-list size
        self.insert_threads = index_params.get("insert_threads")
        self.consolidate_threads = index_params.get("consolidate_threads")
        # Quantization range: fixed from the first insert batch and reused
        # for every later insert and query (see quant()).
        self.mx = None
        self.mi = None

    def index_name(self):
        """Parameter-dependent component of the on-disk index path."""
        return f"R{self.R}_L{self.L}"

    def create_index_dir(self, dataset):
        """Create and return data/indices/streaming/pyanns/<dataset>/<params>.

        Each level is created explicitly (exist_ok) so every directory is
        requested with mode 0o777, subject to the process umask.
        """
        index_dir = os.path.join(os.getcwd(), "data", "indices", "streaming")
        for component in ("pyanns", dataset.short_name(), self.index_name()):
            os.makedirs(index_dir, mode=0o777, exist_ok=True)
            index_dir = os.path.join(index_dir, component)
        os.makedirs(index_dir, mode=0o777, exist_ok=True)
        return index_dir

    def translate_dist_fn(self, metric):
        """Map the harness metric name onto diskannpy's metric identifier."""
        if metric == 'euclidean':
            return 'l2'
        elif metric == 'ip':
            return 'mips'
        else:
            raise Exception('Invalid metric')

    def translate_dtype(self, dtype: str):
        # The index always stores the uint8 codes produced by quant(),
        # regardless of the dataset's native dtype.
        return np.uint8

    def setup(self, dtype, max_pts, ndim):
        """Build the empty dynamic index and the exact-vector refiner."""
        self.index = diskannpy.DynamicMemoryIndex(
            distance_metric=self.translate_dist_fn(self._metric),
            vector_dtype=self.translate_dtype(dtype),
            max_vectors=max_pts,
            dimensions=ndim,
            graph_degree=self.R,
            complexity=self.L,
            # Also sizes the scratch space available to search threads.
            num_threads=self.insert_threads,
            initial_search_complexity=100,
        )
        self.refiner = fast_refine.Refiner(ndim, max_pts)
        self.max_pts = max_pts
        print('Index class constructed and ready for update/search')
        # External 0-based ids currently live in the index, stored shifted
        # by +1 to match the tags handed to diskannpy (presumably because
        # tag 0 is reserved -- TODO confirm against diskannpy docs).
        self.active_indices = set()
        self.num_unprocessed_deletes = 0

    def quant(self, X, mi, mx):
        """Linearly map X from [mi, mx] into uint8 codes in [0, 127]."""
        return np.round(np.clip((X - mi) / (mx - mi) * 127.0, 0.0, 127.0)).astype('uint8')

    def insert(self, X, ids):
        """Insert rows of float matrix X under the 0-based numpy ids."""
        if self.mi is None:
            # The first batch fixes the quantization range for the run.
            self.mi = X.min()
            self.mx = X.max()

        self.refiner.batch_insert(X, ids)
        X = self.quant(X, self.mi, self.mx)
        self.active_indices.update(ids + 1)
        print('#active pts', len(self.active_indices), '#unprocessed deletes', self.num_unprocessed_deletes)
        if len(self.active_indices) + self.num_unprocessed_deletes >= self.max_pts:
            # About to run out of slots: physically reclaim the space of
            # entries previously marked deleted.
            self.index.consolidate_delete()
            self.num_unprocessed_deletes = 0

        self.index.batch_insert(X, ids + 1)

    def delete(self, ids):
        """Lazily delete ids; physical removal is deferred to insert()."""
        self.refiner.batch_delete(ids)
        for id in ids:
            self.index.mark_deleted(id + 1)
        self.active_indices.difference_update(ids + 1)
        self.num_unprocessed_deletes += len(ids)

    def query(self, X, k):
        """Carry out a batch query for k-NN of query set X.

        Over-retrieves k_mul*k candidates from the quantized index, then
        re-ranks them exactly against the stored float vectors. The result
        (0-based ids, shape (nq, k)) is stored in self.res.
        """
        nq, d = X.shape
        Xq = self.quant(X, self.mi, self.mx)
        k_mul = 5
        k_reorder = k * k_mul
        I, _ = self.index.batch_search(
            Xq, k_reorder, self.Ls, self.search_threads)
        I = I - 1  # shift diskannpy tags back to external 0-based ids
        self.res = self.refiner.batch_refine(X, I, k).reshape(nq, k)

    def set_query_arguments(self, query_args):
        """Set per-run search parameters: Ls (search list) and T (threads)."""
        self._query_args = query_args
        self.Ls = 0 if query_args.get("Ls") is None else query_args.get("Ls")
        self.search_threads = self._query_args.get("T")

    def __str__(self):
        # Fix: the original put the tuple (self.index_name(), self._query_args)
        # inside a single f-string placeholder, yielding "pyanns(('R32_L50', ...))".
        return f'pyanns({self.index_name()}, {self._query_args})'

0 commit comments

Comments
 (0)