 # - why only one thread
 # - report net concurrency utilized in the table
 
+import argparse
 import multiprocessing
 import re
+import statistics
 import subprocess
 import sys
 
 
 # SETUP:
 ### Download and extract data files: Wikipedia line docs + GloVe
-# python src/python/setup.py -download
+# python src/python/initial_setup.py -download OR curl -O https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip -k
 # cd ../data
 # unzip glove.6B.zip
-# unlzma enwiki-20120502-lines-1k.txt.lzma
+# unlzma enwiki-20120502-lines-1k.txt.lzma OR xz -d enwiki-20120502-lines-1k.txt.lzma
 ### Create document and task vectors
 # ./gradlew vectors-100
 #
 # change the parameters below and then run (you can still run this file manually, but the gradle command
 # below will automatically recompile any java files you changed in luceneutil)
 # ./gradlew runKnnPerfTest
 #
+# to report the median result of n runs with the same parameters:
+# ./gradlew runKnnPerfTest -Pruns=n
+#
 # you may want to modify the following settings:
 
 DO_PROFILING = False
@@ -135,9 +140,9 @@ def run_knn_benchmark(checkout, values):
     indexes = [0] * len(values.keys())
     indexes[-1] = -1
     args = []
-    # dim = 100
-    # doc_vectors = constants.GLOVE_VECTOR_DOCS_FILE
-    # query_vectors = '%s/luceneutil/tasks/vector-task-100d.vec' % constants.BASE_DIR
+    dim = 100
+    doc_vectors = "%s/luceneutil/tasks/enwiki-20120502-lines-1k-100d.vec" % constants.BASE_DIR
+    query_vectors = "%s/luceneutil/tasks/vector-task-100d.vec" % constants.BASE_DIR
     # dim = 768
     # doc_vectors = '/lucenedata/enwiki/enwiki-20120502-lines-1k-mpnet.vec'
     # query_vectors = '/lucenedata/enwiki/enwiki-20120502.mpnet.vec'
@@ -153,9 +158,9 @@ def run_knn_benchmark(checkout, values):
     # query_vectors = '/d/electronics_query_vectors.bin'
 
     # Cohere dataset
-    dim = 768
-    doc_vectors = f"{constants.BASE_DIR}/data/cohere-wikipedia-docs-{dim}d.vec"
-    query_vectors = f"{constants.BASE_DIR}/data/cohere-wikipedia-queries-{dim}d.vec"
+    # dim = 768
+    # doc_vectors = f"{constants.BASE_DIR}/data/cohere-wikipedia-docs-{dim}d.vec"
+    # query_vectors = f"{constants.BASE_DIR}/data/cohere-wikipedia-queries-{dim}d.vec"
     # doc_vectors = f"/lucenedata/enwiki/{'cohere-wikipedia'}-docs-{dim}d.vec"
     # query_vectors = f"/lucenedata/enwiki/{'cohere-wikipedia'}-queries-{dim}d.vec"
     parentJoin_meta_file = f"{constants.BASE_DIR}/data/{'cohere-wikipedia'}-metadata.csv"
@@ -285,6 +290,7 @@ def run_knn_benchmark(checkout, values):
 
     print_fixed_width(all_results, skip_headers)
     print_chart(all_results)
+    return all_results, skip_headers
 
 
 def print_fixed_width(all_results, columns_to_skip):
@@ -425,7 +431,55 @@ def chart_args_label(args):
     return str(args)
 
 
+def run_n_knn_benchmarks(LUCENE_CHECKOUT, PARAMS, n):
+    """Run the benchmark n times with the same parameters and report each run plus the median."""
+    rec, lat, net, avg = [], [], [], []
+    tests = []
+    for i in range(n):
+        results, skip_headers = run_knn_benchmark(LUCENE_CHECKOUT, PARAMS)
+        tests.append(results)
+        # parse the first four tab-separated columns of the first result row
+        first_4_numbers = results[0][0].split("\t")[:4]
+        first_4_numbers = [float(num) for num in first_4_numbers]
+
+        # store relevant data points
+        rec.append(first_4_numbers[0])
+        lat.append(first_4_numbers[1])
+        net.append(first_4_numbers[2])
+        avg.append(first_4_numbers[3])
+
+    # reconstruct string with median results
+    med_results = []
+    med_string = ""
+    med_string += f"{round(statistics.median(rec), 3)}\t"
+    med_string += f"{round(statistics.median(lat), 3)}\t"
+    med_string += f"{round(statistics.median(net), 3)}\t"
+    med_string += f"{round(statistics.median(avg), 3)}\t"
+
+    # keep the remaining (non-numeric) columns from the last run's row
+    split_results = results[0][0].split("\t")
+    split_string = "\t".join(split_results[4:])
+    med_string += split_string
+    med_tuple = (med_string, results[0][1])
+    med_results.append(med_tuple)
+
+    # re-print all tables in a row
+    print("\nFinal Results:")
+    for i in range(n):
+        print(f"\nTest {i + 1}:")
+        print_fixed_width(tests[i], skip_headers)
+
+    # print median results in table
+    print("\nMedian Results:")
+    print_chart(med_results)
+    print_fixed_width(med_results, skip_headers)
+
+
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run KNN benchmarks")
+    parser.add_argument("--runs", type=int, default=1, help="Number of times to run the benchmark (default: 1)")
+    n = parser.parse_args()
+
     # Path to the Lucene checkout that will be benchmarked; this is now sourced from gradle.properties
     LUCENE_CHECKOUT = getLuceneDirFromGradleProperties()
-    run_knn_benchmark(LUCENE_CHECKOUT, PARAMS)
+    if n.runs == 1:
+        run_knn_benchmark(LUCENE_CHECKOUT, PARAMS)
+    else:
+        run_n_knn_benchmarks(LUCENE_CHECKOUT, PARAMS, n.runs)
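
Usage sketch: a single run behaves exactly as before, while a run count greater than one repeats the whole benchmark and prints a median summary table. The --runs flag and the ./gradlew runKnnPerfTest -Pruns=n invocation appear in the diff above; the direct script path (src/python/knnPerfTest.py) and the assumption that the runKnnPerfTest gradle task forwards -Pruns to --runs are assumptions, not confirmed by this change.

  # single run, identical to the old behavior
  ./gradlew runKnnPerfTest
  # median of 5 runs with the same parameters (assuming -Pruns is forwarded as --runs)
  ./gradlew runKnnPerfTest -Pruns=5
  # hypothetical direct invocation of the script
  python src/python/knnPerfTest.py --runs 5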