|
11 | 11 | CHUNK_SIZE = 512 |
12 | 12 | # END CONFIG # |
13 | 13 |
|
def bench() -> dict[str, float]:
    """Time each chunking library over the NLTK Gutenberg corpus.

    Every library is handed the same list of raw Gutenberg texts and is
    timed once over the whole list.

    Returns:
        A mapping from library name (``'semchunk'`` and
        ``'semantic_text_splitter'``) to the elapsed seconds that library
        needed to chunk all of the texts.
    """
    # Initialise the chunkers.
    semchunk_chunker = semchunk.chunkerify(tiktoken.encoding_for_model('gpt-4'), CHUNK_SIZE)
    sts_chunker = TextSplitter.from_tiktoken_model('gpt-4', CHUNK_SIZE)

    def bench_semchunk(texts: list[str]) -> None:
        # The whole list is passed to the chunker in a single call.
        semchunk_chunker(texts)

    def bench_sts(texts: list[str]) -> None:
        # `chunks` is invoked once per text; the results are discarded
        # because only the elapsed time matters here.
        for text in texts:
            sts_chunker.chunks(text)

    libraries = {
        'semchunk': bench_semchunk,
        'semantic_text_splitter': bench_sts,
    }

    # Load the Gutenberg corpus, downloading it only if it is missing.
    # `nltk.corpus.gutenberg` is resolved lazily, so merely referencing it
    # never fails; the load has to be forced for a missing corpus to surface
    # here rather than later at `fileids()`.
    gutenberg = nltk.corpus.gutenberg

    try:
        gutenberg.ensure_loaded()

    except LookupError:
        nltk.download('gutenberg')

    # Read every text up front so that corpus I/O is excluded from the timings.
    texts = [gutenberg.raw(fileid) for fileid in gutenberg.fileids()]

    # Benchmark the libraries.
    benchmarks: dict[str, float] = {}

    for library, function in libraries.items():
        # `perf_counter` is monotonic and high resolution, unlike `time.time`,
        # which can jump if the system clock is adjusted.
        start = time.perf_counter()
        function(texts)
        benchmarks[library] = time.perf_counter() - start

    return benchmarks
45 | 48 |
|
|
0 commit comments