Skip to content

Commit ea81b5d

Browse files
authored
KnnPerfTest Median Results Display (#425)
* initial commit * Added runs parameter to command line, refactored code * Removed median chart from single-test runs, added full summary output * Reverted parameters * Reformatted and reverted parameters
1 parent bf91cb8 commit ea81b5d

File tree

2 files changed

+67
-10
lines changed

2 files changed

+67
-10
lines changed

gradle/knn.gradle

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,10 @@ task runKnnPerfTest (type: Exec) {
4747

4848
workingDir rootProject.getRootDir()
4949

50-
commandLine 'python3', 'src/python/knnPerfTest.py'
50+
doFirst {
51+
def runs = project.hasProperty('runs') ? project.property('runs') : '1'
52+
commandLine 'python3', 'src/python/knnPerfTest.py', '--runs', runs
53+
}
5154
}
5255

5356
task extractVectorTasks (type: Copy) {

src/python/knnPerfTest.py

Lines changed: 63 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@
77
# - why only one thread
88
# - report net concurrency utilized in the table
99

10+
import argparse
1011
import multiprocessing
1112
import re
13+
import statistics
1214
import subprocess
1315
import sys
1416

@@ -20,17 +22,20 @@
2022

2123
# SETUP:
2224
### Download and extract data files: Wikipedia line docs + GloVe
23-
# python src/python/setup.py -download
25+
# python src/python/initial_setup.py -download OR curl -O https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip -k
2426
# cd ../data
2527
# unzip glove.6B.zip
26-
# unlzma enwiki-20120502-lines-1k.txt.lzma
28+
# unlzma enwiki-20120502-lines-1k.txt.lzma OR xz -d enwiki-20120502-lines-1k.txt.lzma
2729
### Create document and task vectors
2830
# ./gradlew vectors-100
2931
#
3032
# change the parameters below and then run (you can still manually run this file, but using gradle command
3133
# below will auto recompile if you made any changes to java files in luceneutils)
3234
# ./gradlew runKnnPerfTest
3335
#
36+
# for the median result of n runs with the same parameters:
37+
# ./gradlew runKnnPerfTest -Pruns=n
38+
#
3439
# you may want to modify the following settings:
3540

3641
DO_PROFILING = False
@@ -135,9 +140,9 @@ def run_knn_benchmark(checkout, values):
135140
indexes = [0] * len(values.keys())
136141
indexes[-1] = -1
137142
args = []
138-
# dim = 100
139-
# doc_vectors = constants.GLOVE_VECTOR_DOCS_FILE
140-
# query_vectors = '%s/luceneutil/tasks/vector-task-100d.vec' % constants.BASE_DIR
143+
dim = 100
144+
doc_vectors = "%s/lucene_util/tasks/enwiki-20120502-lines-1k-100d.vec" % constants.BASE_DIR
145+
query_vectors = "%s/lucene_util/tasks/vector-task-100d.vec" % constants.BASE_DIR
141146
# dim = 768
142147
# doc_vectors = '/lucenedata/enwiki/enwiki-20120502-lines-1k-mpnet.vec'
143148
# query_vectors = '/lucenedata/enwiki/enwiki-20120502.mpnet.vec'
@@ -153,9 +158,9 @@ def run_knn_benchmark(checkout, values):
153158
# query_vectors = '/d/electronics_query_vectors.bin'
154159

155160
# Cohere dataset
156-
dim = 768
157-
doc_vectors = f"{constants.BASE_DIR}/data/cohere-wikipedia-docs-{dim}d.vec"
158-
query_vectors = f"{constants.BASE_DIR}/data/cohere-wikipedia-queries-{dim}d.vec"
161+
# dim = 768
162+
# doc_vectors = f"{constants.BASE_DIR}/data/cohere-wikipedia-docs-{dim}d.vec"
163+
# query_vectors = f"{constants.BASE_DIR}/data/cohere-wikipedia-queries-{dim}d.vec"
159164
# doc_vectors = f"/lucenedata/enwiki/{'cohere-wikipedia'}-docs-{dim}d.vec"
160165
# query_vectors = f"/lucenedata/enwiki/{'cohere-wikipedia'}-queries-{dim}d.vec"
161166
parentJoin_meta_file = f"{constants.BASE_DIR}/data/{'cohere-wikipedia'}-metadata.csv"
@@ -285,6 +290,7 @@ def run_knn_benchmark(checkout, values):
285290

286291
print_fixed_width(all_results, skip_headers)
287292
print_chart(all_results)
293+
return all_results, skip_headers
288294

289295

290296
def print_fixed_width(all_results, columns_to_skip):
@@ -425,7 +431,55 @@ def chart_args_label(args):
425431
return str(args)
426432

427433

434+
# Number of leading tab-separated columns in a result row that are numeric
# and get replaced by their median across runs.
_NUM_MEDIAN_COLUMNS = 4


def _median_row(rows):
    """Collapse the same result row from several runs into one median row.

    ``rows`` is a sequence of ``(summary, args)`` tuples, one per run, where
    ``summary`` is a tab-separated string whose first four columns are
    numeric.  Returns a ``(summary, args)`` tuple in which those four columns
    are the per-column medians (rounded to 3 decimals) and every remaining
    column, as well as ``args``, is copied from the last run.
    """
    split_rows = [summary.split("\t") for summary, _ in rows]
    medians = [
        round(statistics.median([float(cols[i]) for cols in split_rows]), 3)
        for i in range(_NUM_MEDIAN_COLUMNS)
    ]
    # Keep the exact layout of a result row: each median is followed by a
    # tab, then the untouched trailing columns of the last run.
    med_string = "".join(f"{value}\t" for value in medians)
    med_string += "\t".join(split_rows[-1][_NUM_MEDIAN_COLUMNS:])
    return med_string, rows[-1][1]


def run_n_knn_benchmarks(LUCENE_CHECKOUT, PARAMS, n):
    """Run the KNN benchmark ``n`` times and print per-run and median tables.

    Each run calls ``run_knn_benchmark(LUCENE_CHECKOUT, PARAMS)``, which
    returns ``(all_results, skip_headers)``.  After all runs finish, the
    result table of every run is printed again back to back, followed by a
    chart and a table of median results.

    A median row is produced for every row position of the result tables
    (rows are matched across runs by position; if runs return a different
    number of rows, the extra rows are ignored).  When each run yields a
    single row this is exactly the median of that row across runs.

    Args:
        LUCENE_CHECKOUT: passed through to ``run_knn_benchmark``.
        PARAMS: passed through to ``run_knn_benchmark``.
        n: number of benchmark runs; must be at least 1.

    Raises:
        ValueError: if ``n`` is smaller than 1 (a median of zero runs is
            undefined).
    """
    if n < 1:
        raise ValueError(f"n must be at least 1 (got {n})")

    tests = []
    skip_headers = None
    for _ in range(n):
        results, skip_headers = run_knn_benchmark(LUCENE_CHECKOUT, PARAMS)
        tests.append(results)

    # zip(*tests) groups the i-th row of every run together.
    med_results = [_median_row(rows_across_runs) for rows_across_runs in zip(*tests)]

    # re-print all tables in a row
    print("\nFinal Results:")
    for run_number, run_results in enumerate(tests, start=1):
        print(f"\nTest {run_number}:")
        print_fixed_width(run_results, skip_headers)

    # print median results in table
    print("\nMedian Results:")
    print_chart(med_results)
    print_fixed_width(med_results, skip_headers)
473+
474+
428475
if __name__ == "__main__":
    # Command line: --runs N repeats the benchmark N times and reports the
    # median results; the default of 1 keeps the original single-run output.
    parser = argparse.ArgumentParser(description="Run KNN benchmarks")
    parser.add_argument("--runs", type=int, default=1, help="Number of times to run the benchmark (default: 1)")
    cli_args = parser.parse_args()
    if cli_args.runs < 1:
        # Fail early with a usage message instead of a traceback from the
        # median computation on zero runs.
        parser.error(f"--runs must be at least 1 (got {cli_args.runs})")

    # Where the version of Lucene is that will be tested. Now this will be sourced from gradle.properties
    LUCENE_CHECKOUT = getLuceneDirFromGradleProperties()
    if cli_args.runs == 1:
        # Single run: no median table or chart is printed.
        run_knn_benchmark(LUCENE_CHECKOUT, PARAMS)
    else:
        run_n_knn_benchmarks(LUCENE_CHECKOUT, PARAMS, cli_args.runs)

0 commit comments

Comments
 (0)