Skip to content

Commit f8fb6a7

Browse files
lerman25 authored and github-actions[bot] committed
Update int8 single hnsw + structural changes (#601)
* Change dataset to 1 mil * Change dataset to 1 mil batch * change file * change dataset url to 1mil all * change dataset url to 1mil int8 * restructure python scripts in data, add int8 to serializer, add batch-iter-int8 to yml (cherry picked from commit 5a86c90)
1 parent 7767b95 commit f8fb6a7

File tree

7 files changed

+77
-12
lines changed

7 files changed

+77
-12
lines changed

.github/workflows/benchmark.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ on:
1111
options:
1212
- benchmarks-all
1313
- benchmarks-default
14-
- bm-basics-int8-single
1514
- bm-basics-fp32-single
1615
- bm-basics-fp32-multi
1716
- bm-basics-fp64-single
@@ -20,6 +19,7 @@ on:
2019
- bm-basics-bf16-multi
2120
- bm-basics-fp16-single
2221
- bm-basics-fp16-multi
22+
- bm-basics-int8-single
2323
- bm-batch-iter-fp32-single
2424
- bm-batch-iter-fp32-multi
2525
- bm-batch-iter-fp64-single
@@ -28,6 +28,7 @@ on:
2828
- bm-batch-iter-bf16-multi
2929
- bm-batch-iter-fp16-single
3030
- bm-batch-iter-fp16-multi
31+
- bm-batch-iter-int8-single
3132
- bm-updated-fp32-single
3233
- bm-spaces
3334
description: 'Benchmarks set to run'

tests/benchmark/data/hnsw_indices/hnsw_indices_all.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,5 +25,5 @@ https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/dbpedia-cosine-dim768-fp
2525
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/fashion_images_multi_value-cosine-dim512-M64-efc512-fp16.hnsw_v3
2626
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/fashion_images_multi_value-cosine-dim512-fp16-test_vectors.raw
2727

28-
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/wipedia-cosine-dim1024-M64-efc512-int8.hnsw_v3
29-
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/wipedia-cosine-dim1024-int8-test_vectors.raw
28+
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/wipedia_single-cosine-dim1024-M64-efc512-int8.hnsw_v3
29+
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/wipedia_single-cosine-dim1024-int8-test_vectors.raw
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/wipedia-cosine-dim1024-M64-efc512-int8.hnsw_v3
2-
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/wipedia-cosine-dim1024-int8-test_vectors.raw
1+
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/wipedia_single-cosine-dim1024-M64-efc512-int8.hnsw_v3
2+
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/wipedia_single-cosine-dim1024-int8-test_vectors.raw
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# This file is a template file for downloading datasets
2+
# In this version, it downloads the "wipedia_single" dataset used for int8
3+
# Refrain from pushing changes unless necessary
4+
from datasets import load_dataset
5+
import numpy as np
6+
import os
7+
import h5py
8+
from math import ceil
9+
from tqdm import tqdm
10+
INT8_KEY = 'emb_int8'
11+
DATASET = 'wikipedia-1024_eng_v3_single'
12+
hdf5_output_file_name = "%s.hdf5" %DATASET
13+
14+
lang = "en" #Use the English Wikipedia subset
15+
16+
num_vectors_train = 1_000_000
17+
num_vectors_test = 10_000
18+
num_vectors = num_vectors_train + num_vectors_test
19+
20+
dim = 1024
21+
docs = load_dataset("Cohere/wikipedia-2023-11-embed-multilingual-v3-int8-binary", lang, split="train", streaming=True)
22+
label_size = 1
23+
data = np.empty((num_vectors//label_size,label_size, dim), dtype=np.int8)
24+
25+
26+
ids = []
27+
counter = 0
28+
label_index = 0
29+
30+
with tqdm(total=num_vectors) as progress_bar:
31+
for doc in docs:
32+
if counter == num_vectors:
33+
break
34+
ids.append(doc['_id'])
35+
emb = doc['emb_int8']
36+
data[label_index, counter % label_size] = emb
37+
counter += 1
38+
if counter % label_size == 0:
39+
label_index += 1
40+
progress_bar.update(1)
41+
42+
train_data = data[:num_vectors_train // label_size].reshape(-1, dim)
43+
test_data = data[num_vectors_train // label_size:].reshape(-1, dim)
44+
45+
print(f"Train data shape: {train_data.shape}")
46+
print(f"Test data shape: {test_data.shape}")
47+
48+
with h5py.File(hdf5_output_file_name, 'w') as hdf5_file:
49+
hdf5_file.create_dataset('train', data=train_data)
50+
hdf5_file.create_dataset('test', data=test_data)

tests/benchmark/data/serializer.py renamed to tests/benchmark/data/scripts/serializer.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,13 +136,21 @@
136136
'metric': VecSimMetric_L2,
137137
'skipRaw': True,
138138
},
139+
{
140+
'filename': 'wikipedia_single-1024_eng_v3',
141+
'nickname': 'wipedia_single',
142+
'dim': 1024,
143+
'type': VecSimType_INT8,
144+
'metric': VecSimMetric_Cosine,
145+
},
139146
]
140147

141148
TYPES_ATTR = {
142149
VecSimType_FLOAT32: {"size_in_bytes": 4, "vector_type": np.float32},
143150
VecSimType_FLOAT64: {"size_in_bytes": 8, "vector_type": np.float64},
144151
VecSimType_BFLOAT16: {"size_in_bytes": 2, "vector_type": bfloat16},
145152
VecSimType_FLOAT16: {"size_in_bytes": 2, "vector_type": np.float16},
153+
VecSimType_INT8: {"size_in_bytes": 1, "vector_type": np.int8}
146154
}
147155

148156

@@ -190,7 +198,9 @@ def serialize(files=DEFAULT_FILES):
190198
elif hnswparams.type == VecSimType_FLOAT16:
191199
serialized_file_name = serialized_file_name + '-fp16'
192200
serialized_raw_name = serialized_raw_name + '-fp16'
193-
201+
elif hnswparams.type == VecSimType_INT8:
202+
serialized_file_name = serialized_file_name + '-int8'
203+
serialized_raw_name = serialized_raw_name + '-int8'
194204
print('first, exporting test set to binary')
195205
if not file.get('skipRaw', False):
196206
test = f['test']
@@ -201,6 +211,8 @@ def serialize(files=DEFAULT_FILES):
201211
test = np.array(test_set, dtype=bfloat16)
202212
elif hnswparams.type == VecSimType_FLOAT16:
203213
test = test.astype(np.float16)
214+
elif hnswparams.type == VecSimType_INT8:
215+
test = test.astype(np.int8)
204216
print(f"creating test set of {len(test)} vectors")
205217
with open(os.path.join(location, serialized_raw_name + '-test_vectors.raw'), 'wb') as testfile:
206218
for vec in test:
@@ -222,6 +234,8 @@ def serialize(files=DEFAULT_FILES):
222234
data = np.array(data_set, dtype=bfloat16)
223235
elif hnswparams.type == VecSimType_FLOAT16:
224236
data = data.astype(np.float16)
237+
elif hnswparams.type == VecSimType_INT8:
238+
data = data.astype(np.int8)
225239
print(f"creating index with {hnswparams.initialCapacity} vectors")
226240
for label, cur in enumerate(data):
227241
for vec in cur if hnswparams.multi else [cur]:

tests/benchmark/run_files/bm_basics_single_int8.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,16 @@
99
bool BM_VecSimGeneral::is_multi = false;
1010

1111
size_t BM_VecSimGeneral::n_queries = 10000;
12-
size_t BM_VecSimGeneral::n_vectors = 999424;
12+
size_t BM_VecSimGeneral::n_vectors = 1000000;
1313
size_t BM_VecSimGeneral::dim = 1024;
1414
size_t BM_VecSimGeneral::M = 64;
1515
size_t BM_VecSimGeneral::EF_C = 512;
1616
tieredIndexMock BM_VecSimGeneral::mock_thread_pool{};
1717

1818
const char *BM_VecSimGeneral::hnsw_index_file =
19-
"tests/benchmark/data/wipedia-cosine-dim1024-M64-efc512-int8.hnsw_v3";
19+
"tests/benchmark/data/wipedia_single-cosine-dim1024-M64-efc512-int8.hnsw_v3";
2020
const char *BM_VecSimGeneral::test_queries_file =
21-
"tests/benchmark/data/wipedia-cosine-dim1024-int8-test_vectors.raw";
21+
"tests/benchmark/data/wipedia_single-cosine-dim1024-int8-test_vectors.raw";
2222

2323
#define BM_FUNC_NAME(bm_func, algo) bm_func##_##algo##_Single
2424
#define BM_ADD_LABEL AddLabel_Single

tests/benchmark/run_files/bm_batch_iterator_single_int8.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,17 @@
33
bool BM_VecSimGeneral::is_multi = false;
44

55
size_t BM_VecSimGeneral::n_queries = 10000;
6-
size_t BM_VecSimGeneral::n_vectors = 999424;
6+
size_t BM_VecSimGeneral::n_vectors = 1000000;
77
size_t BM_VecSimGeneral::dim = 1024;
88
size_t BM_VecSimGeneral::M = 64;
99
size_t BM_VecSimGeneral::EF_C = 512;
1010
size_t BM_VecSimGeneral::block_size = 1024;
1111
tieredIndexMock BM_VecSimGeneral::mock_thread_pool{};
1212

1313
const char *BM_VecSimGeneral::hnsw_index_file =
14-
"tests/benchmark/data/wipedia-cosine-dim1024-M64-efc512-int8.hnsw_v3";
14+
"tests/benchmark/data/wipedia_single-cosine-dim1024-M64-efc512-int8.hnsw_v3";
1515
const char *BM_VecSimGeneral::test_queries_file =
16-
"tests/benchmark/data/wipedia-cosine-dim1024-int8-test_vectors.raw";
16+
"tests/benchmark/data/wipedia_single-cosine-dim1024-int8-test_vectors.raw";
1717

1818
#define BM_FUNC_NAME(bm_func, algo) algo##_##bm_func##_Single
1919

0 commit comments

Comments
 (0)