|
3 | 3 | import multiprocessing as mp |
4 | 4 | import time |
5 | 5 | from concurrent.futures import ThreadPoolExecutor |
| 6 | +from copy import deepcopy |
6 | 7 |
|
7 | 8 | from vectordb_bench import config |
8 | 9 | from vectordb_bench.backend.clients import api |
| 10 | +from vectordb_bench.backend.clients.pgvector.pgvector import PgVector |
9 | 11 | from vectordb_bench.backend.dataset import DataSetIterator |
10 | 12 | from vectordb_bench.backend.utils import time_it |
11 | 13 |
|
@@ -33,17 +35,27 @@ def __init__( |
33 | 35 | self.executing_futures = [] |
34 | 36 | self.sig_idx = 0 |
35 | 37 |
|
36 | | - def send_insert_task(self, db: api.VectorDB, emb: list[list[float]], metadata: list[str], retry_idx: int = 0): |
37 | | - _, error = db.insert_embeddings(emb, metadata) |
38 | | - if error is not None: |
39 | | - log.warning(f"Insert Failed, try_idx={retry_idx}, Exception: {error}") |
40 | | - retry_idx += 1 |
41 | | - if retry_idx <= config.MAX_INSERT_RETRY: |
42 | | - time.sleep(retry_idx) |
43 | | - self.send_insert_task(db, emb=emb, metadata=metadata, retry_idx=retry_idx) |
44 | | - else: |
45 | | - msg = f"Insert failed and retried more than {config.MAX_INSERT_RETRY} times" |
46 | | - raise RuntimeError(msg) from None |
| 38 | + def send_insert_task(self, db: api.VectorDB, emb: list[list[float]], metadata: list[str]): |
| 39 | + def _insert_embeddings(db: api.VectorDB, emb: list[list[float]], metadata: list[str], retry_idx: int = 0): |
| 40 | + _, error = db.insert_embeddings(emb, metadata) |
| 41 | + if error is not None: |
| 42 | + log.warning(f"Insert Failed, try_idx={retry_idx}, Exception: {error}") |
| 43 | + retry_idx += 1 |
| 44 | + if retry_idx <= config.MAX_INSERT_RETRY: |
| 45 | + time.sleep(retry_idx) |
| 46 | + _insert_embeddings(db, emb=emb, metadata=metadata, retry_idx=retry_idx) |
| 47 | + else: |
| 48 | + msg = f"Insert failed and retried more than {config.MAX_INSERT_RETRY} times" |
| 49 | + raise RuntimeError(msg) from None |
| 50 | + |
| 51 | + if isinstance(db, PgVector): |
| 52 | + # pgvector is not thread-safe for concurrent insert, |
| 53 | + # so we need to copy the db object, make sure each thread has its own connection |
| 54 | + db_copy = deepcopy(db) |
| 55 | + with db_copy.init(): |
| 56 | + _insert_embeddings(db_copy, emb, metadata, retry_idx=0) |
| 57 | + else: |
| 58 | + _insert_embeddings(db, emb, metadata, retry_idx=0) |
47 | 59 |
|
48 | 60 | @time_it |
49 | 61 | def run_with_rate(self, q: mp.Queue): |
|
0 commit comments