|
30 | 30 | # sentence-transformers is optional - imported lazily when needed |
31 | 31 | from stringsight.prompts.clustering.prompts import clustering_systems_prompt, coarse_clustering_systems_prompt, deduplication_clustering_systems_prompt |
32 | 32 | from stringsight.logging_config import get_logger |
| 33 | +from stringsight.constants import DEFAULT_MAX_WORKERS |
33 | 34 | from ..utils.validation import validate_openai_api_key |
34 | 35 |
|
35 | 36 | logger = get_logger(__name__) |
@@ -160,7 +161,7 @@ def _get_openai_embeddings_batch(batch: List[str], model: str, retries: int = 3, |
160 | 161 | time.sleep(actual_sleep) |
161 | 162 |
|
162 | 163 |
|
163 | | -def _get_openai_embeddings(texts: List[str], *, model: str = "openai/text-embedding-3-large", batch_size: int = 100, max_workers: int = 64) -> List[List[float]]: |
| 164 | +def _get_openai_embeddings(texts: List[str], *, model: str = "openai/text-embedding-3-large", batch_size: int = 100, max_workers: int = DEFAULT_MAX_WORKERS) -> List[List[float]]: |
164 | 165 | """Get embeddings for *texts* from the OpenAI API whilst preserving order.""" |
165 | 166 |
|
166 | 167 | if not texts: |
@@ -405,7 +406,7 @@ async def assign_fine_to_coarse( |
405 | 406 | model: str = "gpt-4.1-mini", |
406 | 407 | strategy: str = "llm", |
407 | 408 | verbose: bool = True, |
408 | | - max_workers: int = 64, |
| 409 | + max_workers: int = DEFAULT_MAX_WORKERS, |
409 | 410 | ) -> Dict[str, str]: |
410 | 411 | """Assign each fine cluster name to one of the coarse cluster names. |
411 | 412 |
|
@@ -489,7 +490,7 @@ def match_label_names(label_name, label_options): |
489 | 490 | return option |
490 | 491 | return None |
491 | 492 |
|
492 | | -async def llm_match(cluster_names, coarse_cluster_names, max_workers=16, model="gpt-4.1-mini"): |
| 493 | +async def llm_match(cluster_names, coarse_cluster_names, max_workers=DEFAULT_MAX_WORKERS, model="gpt-4.1-mini"): |
493 | 494 | """Match fine-grained cluster names to coarse-grained cluster names using an LLM with parallel processing.""" |
494 | 495 | coarse_names_text = "\n".join(coarse_cluster_names) |
495 | 496 |
|
@@ -611,7 +612,7 @@ def _get_openai_embeddings_batch_litellm(batch, retries=3, sleep_time=2.0): |
611 | 612 |
|
612 | 613 |
|
613 | 614 | # NOTE: renamed to avoid overriding the DiskCache-cached version defined earlier |
614 | | -def _get_openai_embeddings_litellm(texts, batch_size=100, max_workers=16): |
| 615 | +def _get_openai_embeddings_litellm(texts, batch_size=100, max_workers=DEFAULT_MAX_WORKERS): |
615 | 616 | """Get embeddings using OpenAI API (LiteLLM cache).""" |
616 | 617 |
|
617 | 618 | if not texts: |
|
0 commit comments