 CHUNK_SIZE = 512
 # END CONFIG #

-def bench() -> dict[str, float]:
+def bench() -> dict[str, float]:
     # Initialise the chunkers.
     semchunk_chunker = semchunk.chunkerify(tiktoken.encoding_for_model('gpt-4'), CHUNK_SIZE)
     sts_chunker = TextSplitter.from_tiktoken_model('gpt-4', CHUNK_SIZE)
@@ -21,7 +21,7 @@ def bench_semchunk(texts: list[str]) -> None:
         semchunk_chunker(texts)

     def bench_sts(texts: list[str]) -> None:
-        [sts_chunker.chunks(text) for text in texts]
+        sts_chunker.chunk_all(texts)

     libraries = {
         'semchunk': bench_semchunk,
@@ -31,22 +31,23 @@ def bench_sts(texts: list[str]) -> None:
     # Download the Gutenberg corpus.
     try:
         gutenberg = nltk.corpus.gutenberg
-
+
     except Exception:
         nltk.download('gutenberg')
         gutenberg = nltk.corpus.gutenberg
-
+
     # Benchmark the libraries.
-    benchmarks = dict.fromkeys(libraries.keys(), 0)
+    benchmarks = dict.fromkeys(libraries.keys(), 0.0)
     texts = [gutenberg.raw(fileid) for fileid in gutenberg.fileids()]
-
+
     for library, function in libraries.items():
         start = time.time()
         function(texts)
         benchmarks[library] = time.time() - start
-
+
     return benchmarks

 if __name__ == '__main__':
+    nltk.download('gutenberg')
     for library, time_taken in bench().items():
-        print(f'{library}: {time_taken:.2f}s')
+        print(f'{library}: {time_taken:.2f}s')
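
For context, the hunks above depend on the script's import header, which sits above the lines shown in this commit. A minimal sketch of what that header presumably contains, assuming TextSplitter comes from the semantic_text_splitter package and that semchunk, tiktoken, and nltk are imported directly:

import time

import nltk
import semchunk
import tiktoken
from semantic_text_splitter import TextSplitter  # assumed source of TextSplitter.from_tiktoken_model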