Skip to content

Commit 21eda51

Browse files
authored
Update generate_data.py to return whether we should upload the generated indexes (#324)
1 parent 9edaba3 commit 21eda51

File tree

3 files changed

+42
-15
lines changed

3 files changed

+42
-15
lines changed

.github/workflows/build-wheels.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@ jobs:
4747
pip install .
4848
4949
# Generate data.
50-
python backwards-compatibility-data/generate_data.py $release_tag
50+
result=$(python backwards-compatibility-data/generate_data.py $release_tag)
51+
echo "Should we upload these generated indexes? $result"
5152
5253
# Fetch.
5354
echo "git fetch"

backwards-compatibility-data/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
This folder contains test indices built using different versions of TileDB-Vector-Search. It is used to test the ability of the latest version of TileDB-Vector-Search to load and query arrays built by previous versions.
44

5+
In CI we run `generate_data.py` on each release. For major and minor version bump releases, CI opens a PR that adds the generated data to `main`. We do not check in the generated data for patch releases.
6+
57
### Usage
68

79
To generate new data, run:

backwards-compatibility-data/generate_data.py

Lines changed: 38 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,17 @@
55
from tiledb.vector_search.utils import load_fvecs
66
from tiledb.vector_search.utils import write_fvecs
77

8+
base_dir = os.path.dirname(os.path.abspath(__file__))
9+
810

911
def create_sift_micro():
1012
"""
1113
Create a smaller version of the base SIFT 10K dataset (http://corpus-texmex.irisa.fr). You
1214
don't need to run this again, but it's saved here just in case. To query an index built with
1315
this data just select vectors from this file as the query vectors.
1416
"""
15-
script_dir = os.path.dirname(os.path.abspath(__file__))
1617
base_uri = os.path.join(
17-
script_dir,
18+
base_dir,
1819
"..",
1920
"apis",
2021
"python",
@@ -24,20 +25,18 @@ def create_sift_micro():
2425
"siftsmall_base.fvecs",
2526
)
2627
write_fvecs(
27-
os.path.join(script_dir, "siftmicro_base.fvecs"), load_fvecs(base_uri)[:100]
28+
os.path.join(base_dir, "siftmicro_base.fvecs"), load_fvecs(base_uri)[:100]
2829
)
2930

3031

31-
def generate_release_data(version):
32-
script_dir = os.path.dirname(os.path.abspath(__file__))
33-
32+
def generate_indexes(version):
3433
# Create the new release directory.
35-
release_dir = os.path.join(script_dir, "data", version)
36-
shutil.rmtree(release_dir, ignore_errors=True)
37-
os.makedirs(release_dir, exist_ok=True)
34+
index_dir = os.path.join(base_dir, "data", version)
35+
shutil.rmtree(index_dir, ignore_errors=True)
36+
os.makedirs(index_dir, exist_ok=True)
3837

3938
# Get the data we'll use to generate the index.
40-
base_uri = os.path.join(script_dir, "siftmicro_base.fvecs")
39+
base_uri = os.path.join(base_dir, "siftmicro_base.fvecs")
4140
base = load_fvecs(base_uri)
4241
indices = [
4342
0,
@@ -72,8 +71,7 @@ def generate_release_data(version):
7271
data_types = ["float32", "uint8"]
7372
for index_type in index_types:
7473
for data_type in data_types:
75-
index_uri = f"{release_dir}/{index_type.lower()}_{data_type}"
76-
print(f"Creating index at {index_uri}")
74+
index_uri = f"{index_dir}/{index_type.lower()}_{data_type}"
7775
index = ingest(
7876
index_type=index_type,
7977
index_uri=index_uri,
@@ -85,6 +83,28 @@ def generate_release_data(version):
8583
assert result_d.flatten().tolist() == [0 for _ in range(len(indices))]
8684

8785

86+
def check_should_upload_indexes(version) -> bool:
    """
    Decide whether the indexes generated for `version` should be uploaded.

    Returns True when no folder already present under `<base_dir>/data` has the
    same major and minor version as `version`. When we run on CI we only want to
    upload data when the major or minor version changes, not for patch releases.
    Examples:
    - We have 0.1.0, 0.1.1. We get version=0.1.2. In this case we return False.
    - We have 0.1.0, 0.1.1. We get version=0.2.0. In this case we return True.
    - We have 0.1.0, 0.1.1. We get version=0.2.9. In this case we return True.
    - We have 0.1.0, 0.1.1. We get version=1.1.0. In this case we return True.

    Returns False when `version` has no minor component (e.g. "1"), because there
    is nothing to compare against.

    Call this before generating the indexes for `version`: generation creates
    `data/<version>`, which would then match itself.
    """
    # Use the function argument rather than the script-level `args`, so the
    # function also works when it is imported and called directly.
    release = version.split(".")[:2]
    if len(release) < 2:
        return False

    data_dir = os.path.join(base_dir, "data")
    if not os.path.isdir(data_dir):
        # No data has been generated yet, so no earlier release can match.
        return True

    # Compare major and minor together: comparing the minor alone would treat
    # 1.1.0 as a patch release of 0.1.x. Entries without a minor component
    # (stray files, malformed names) never match a two-component release.
    return all(
        folder.split(".")[:2] != release for folder in os.listdir(data_dir)
    )
106+
107+
88108
if __name__ == "__main__":
89109
import argparse
90110

@@ -94,5 +114,9 @@ def generate_release_data(version):
94114
help="The name of the of the TileDB-Vector-Search version which we are creating indices for.",
95115
)
96116
args = p.parse_args()
97-
print(f"Building indexes for version {args.version}")
98-
generate_release_data(args.version)
117+
118+
should_upload_indexes = check_should_upload_indexes(args.version)
119+
120+
generate_indexes(args.version)
121+
122+
print("true" if should_upload_indexes else "false")

0 commit comments

Comments
 (0)