Skip to content

Commit e1bbf5c

Browse files
authored
pinecone's filter solution (#258)
* pinecone filter solution * support random-filter-s * add test * fix algo name in CI * make algo name same as folder * algo name again
1 parent bee146d commit e1bbf5c

File tree

4 files changed

+171
-0
lines changed

4 files changed

+171
-0
lines changed

.github/workflows/neurips23.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,9 @@ jobs:
102102
- algorithm: puck
103103
dataset: random-xs
104104
track: ood
105+
- algorithm: pinecone
106+
dataset: random-filter-s
107+
track: filter
105108
fail-fast: false
106109

107110
steps:
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
FROM neurips23
2+
3+
# install MKL support
4+
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libmkl-full-dev
5+
6+
# clone Pinecone's bigann repo (filter branch) and install the wheel it ships (expected to provide the pys2 python package)
7+
RUN git clone --branch filter https://github.com/pinecone-io/bigann.git
8+
RUN pip install ./bigann/*.whl
9+
# verify that the build worked
10+
RUN python3 -c 'import pys2;'
11+
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
random-filter-s:
2+
pinecone:
3+
docker-tag: neurips23-filter-pinecone
4+
module: neurips23.filter.pinecone.pinecone_index
5+
constructor: PineconeIndex
6+
base-args: ["@metric"]
7+
run-groups:
8+
base:
9+
args: |
10+
[{"indexkey": "FilterIVFFlatU8", "num_clusters": "128", "precompute_intersection_threshold": "5000"}]
11+
query-args: |
12+
[
13+
{"fraction_coefficient": "0.3", "fraction_exponent": "0.1", "skip_clustering_threshold": 2000},
14+
{"fraction_coefficient": "0.7", "fraction_exponent": "0.1", "skip_clustering_threshold": 2000},
15+
{"fraction_coefficient": "1.0", "fraction_exponent": "0.1", "skip_clustering_threshold": 2000},
16+
{"fraction_coefficient": "2.0", "fraction_exponent": "0.1", "skip_clustering_threshold": 2000}
17+
]
18+
yfcc-10M:
19+
pinecone:
20+
docker-tag: neurips23-filter-pinecone
21+
module: neurips23.filter.pinecone.pinecone_index
22+
constructor: PineconeIndex
23+
base-args: ["@metric"]
24+
run-groups:
25+
base:
26+
args: |
27+
[{"indexkey": "FilterIVFFlatU8", "num_clusters": "1024", "precompute_intersection_threshold": "2500"}]
28+
query-args: |
29+
[
30+
{"fraction_coefficient": "19.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000},
31+
{"fraction_coefficient": "18.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000},
32+
{"fraction_coefficient": "17.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000},
33+
{"fraction_coefficient": "16.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000},
34+
{"fraction_coefficient": "15.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000},
35+
{"fraction_coefficient": "14.7", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000},
36+
{"fraction_coefficient": "14.3", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000},
37+
{"fraction_coefficient": "14.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000},
38+
{"fraction_coefficient": "13.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000},
39+
{"fraction_coefficient": "11.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000}
40+
]
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
import os
2+
import numpy as np
3+
4+
from neurips23.filter.base import BaseFilterANN
5+
from benchmark.datasets import DATASETS
6+
7+
import pys2
8+
9+
class PineconeIndex(BaseFilterANN):
    """Filtered-search benchmark adapter around the ``pys2`` filter index.

    The instance is configured from ``index_params`` at construction time,
    builds (``fit``) or loads (``load_index``) a ``pys2`` index, and answers
    filtered k-NN batches through ``filtered_query`` / ``get_results``.
    Unfiltered ``query`` and ``index_files_to_store`` are not implemented.
    """

    # (name, fallback) pairs for the search parameters understood by
    # set_query_arguments, in the order they are applied.
    _QUERY_PARAM_DEFAULTS = (
        ("skip_clustering_threshold", 0),
        ("fraction_coefficient", 18.0),
        ("fraction_exponent", 0.65),
    )

    def __init__(self, metric, index_params):
        """Store the build configuration.

        Args:
            metric: Metric identifier handed over by the benchmark; it is
                only stored, never interpreted here.
            index_params: Mapping of build parameters. ``indexkey``
                (default ``"FilterIVFFlatU8"``) and ``threads`` (default 1)
                are read here; the whole mapping is later forwarded to
                ``pys2.FilterIndexWrapper`` by ``fit``.
        """
        self._index_params = index_params
        self._metric = metric
        print(index_params)
        self.indexkey = index_params.get("indexkey", "FilterIVFFlatU8")
        self.nt = index_params.get("threads", 1)
        self.qas = {}

    def fit(self, dataset):
        """Build the index for the dataset registered under ``dataset``.

        Raises:
            NotImplementedError: If the dataset's search type is not
                ``"knn_filtered"``.
        """
        ds = DATASETS[dataset]()

        if ds.search_type() != "knn_filtered":
            raise NotImplementedError()

        print("Building index")
        # The wrapper receives the dimensionality, the index key, the full
        # build-parameter mapping, the dataset file name and the path of the
        # metadata file.
        self.index = pys2.FilterIndexWrapper(
            ds.d,
            self.indexkey,
            self._index_params,
            ds.get_dataset_fn(),
            os.path.join(ds.basedir, ds.ds_metadata_fn),
        )

    def load_index(self, dataset):
        """Load a previously stored index for ``dataset``.

        The index is looked up as ``<dataset>.index`` relative to the current
        working directory; nothing is downloaded.

        Returns:
            False if that file does not exist, True once the index has been
            loaded into ``self.index``.
        """
        filename = dataset + '.index'

        if not os.path.exists(filename):
            return False

        print("Loading index from " + filename)
        self.index = pys2.load_filter_ivf_index(filename)
        return True

    def index_files_to_store(self, dataset):
        """
        Specify a triplet with the local directory path of index files,
        the common prefix name of index component(s) and a list of
        index components that need to be uploaded to (after build)
        or downloaded from (for search) cloud storage.

        For local directory path under docker environment, please use
        a directory under
        data/indices/track(T1 or T2)/algo.__str__()/DATASETS[dataset]().short_name()

        Not implemented for this algorithm: always raises
        NotImplementedError.
        """
        raise NotImplementedError()

    def query(self, X, k):
        """Unfiltered search is not supported; always raises NotImplementedError."""
        raise NotImplementedError()

    def filtered_query(self, X, filter, k):
        """Run a filtered k-NN search for the query batch ``X``.

        The result array is stored in ``self.I`` (see ``get_results``);
        nothing is returned.

        Args:
            X: 2-D numpy array of queries, one row per query.
            filter: Object exposing ``indptr`` and ``indices``; both are
                forwarded unchanged to the index (presumably a scipy CSR
                matrix of per-query filter tags -- confirm against caller).
            k: Number of neighbours requested per query.
        """
        if X.dtype.kind == 'f':
            print('data type of X is ' + str(X.dtype))
            # Float queries are affinely mapped and cast to uint8, then
            # zero-padded on the right to 192 columns.
            # NOTE(review): presumably this matches the uint8 / 192-dim layout
            # the index was built with; values outside the uint8 range after
            # scaling are not clipped by astype -- confirm inputs stay in range.
            X = X*10 + 128
            X = X.astype(np.uint8)
            padding_size = 192 - X.shape[1]
            X = np.pad(X, ((0, 0), (0, padding_size)), mode='constant')

        # search_parallel returns a tuple:
        # (results_array, query_time, post_processing_time)
        results_tuple = self.index.search_parallel(X, filter.indptr, filter.indices, k)
        self.I = results_tuple[0]
        print("query and postprocessing times: ", results_tuple[1:])

    def get_results(self):
        """Return the result array stored by the last ``filtered_query`` call."""
        return self.I

    def set_query_arguments(self, query_args):
        """Apply one set of search-time parameters.

        Every recognised key present in ``query_args`` is stored on the
        instance under the same name and forwarded to the index as a string.
        A missing key only resets the instance attribute to its fallback
        (``skip_clustering_threshold=0``, ``fraction_coefficient=18.0``,
        ``fraction_exponent=0.65``).

        NOTE(review): fallbacks are not pushed to the index, so the index
        keeps whatever value it held before -- confirm this is intended.
        """
        self.qas = query_args
        print("setting query args:" + str(self.qas))

        for name, fallback in self._QUERY_PARAM_DEFAULTS:
            if name in query_args:
                value = query_args[name]
                setattr(self, name, value)
                self.index.set_search_param(name, str(value))
            else:
                # Each parameter falls back to its *own* attribute; the
                # original code reset fraction_coefficient when
                # fraction_exponent was missing.
                setattr(self, name, fallback)

    def __str__(self):
        """Return the label built from the index key, build and query parameters."""
        return f'pinecone_filter({self.indexkey, self._index_params, self.qas})'
116+
117+

0 commit comments

Comments
 (0)