Commit 572b703

Update frontend submodule

1 parent: a36f712

26 files changed (+325, -164 lines)

docs/user-guide/basic-usage.md

Lines changed: 1 addition & 1 deletion
@@ -125,7 +125,7 @@ clustered_df, model_stats = label(
 
 **Other Parameters:**
 - `temperature`: Temperature for classification (default: `0.0`)
-- `max_workers`: Parallel workers (default: `8`)
+- `max_workers`: Parallel workers (default: `16`)
 - `verbose`: Print progress information (default: `True`)
 
 ### Example
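
For orientation, a minimal call that exercises the re-documented parameter might look like the sketch below; the `label(...)` keyword names are taken from this page and the scripts further down, while the DataFrame columns are made-up placeholders.

```python
# Hypothetical usage sketch: keyword arguments mirror the docs and scripts in
# this commit; the DataFrame columns are placeholders, not the required schema.
import pandas as pd
from stringsight import label

df = pd.DataFrame({
    "prompt": ["What is 2 + 2?"],
    "model": ["gpt-4.1"],
    "model_response": ["4"],
})

clustered_df, model_stats = label(
    df,
    taxonomy={"math_error": "The response contains an arithmetic mistake."},
    model_name="gpt-4.1",
    temperature=0.0,    # default per the docs
    max_workers=16,     # new documented default; lower it if you hit rate limits
    verbose=True,
)
```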

requirements.txt

Lines changed: 0 additions & 1 deletion
@@ -34,5 +34,4 @@ alembic>=1.13.0
 psycopg2-binary>=2.9.0
 redis>=5.0.0
 celery[redis]>=5.3.0
-boto3>=1.34.0
 pydantic-settings>=2.1.0

scripts/run_from_config.py

Lines changed: 4 additions & 3 deletions
@@ -23,6 +23,7 @@
 
 from scripts.run_full_pipeline import run_pipeline, load_dataset
 from stringsight import label
+from stringsight.constants import DEFAULT_MAX_WORKERS
 
 
 def _load_taxonomy(taxonomy_spec: Any) -> Dict[str, str]:
@@ -126,7 +127,7 @@ def run_label_pipeline(
     temperature: float = 0.0,
     top_p: float = 1.0,
     max_tokens: int = 2048,
-    max_workers: int = 64,
+    max_workers: int = DEFAULT_MAX_WORKERS,
     use_wandb: bool = True,
     verbose: bool = False,
     sample_size: Optional[int] = None,
@@ -479,7 +480,7 @@ def main() -> Tuple[Any, Any]:
         temperature=temperature,
         top_p=top_p,
         max_tokens=max_tokens,
-        max_workers=cfg.get("max_workers", 64),
+        max_workers=cfg.get("max_workers", DEFAULT_MAX_WORKERS),
         use_wandb=use_wandb_flag,
         verbose=verbose,
         sample_size=sample_size,
@@ -510,7 +511,7 @@ def main() -> Tuple[Any, Any]:
         clusterer=cfg.get("clusterer", "hdbscan"),
         min_cluster_size=cfg.get("min_cluster_size", 15),
         embedding_model=cfg.get("embedding_model", "text-embedding-3-large"),
-        max_workers=cfg.get("max_workers", 64),
+        max_workers=cfg.get("max_workers", DEFAULT_MAX_WORKERS),
         use_wandb=use_wandb_flag,
         verbose=verbose,
         sample_size=cfg.get("sample_size"),
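
The `from stringsight.constants import DEFAULT_MAX_WORKERS` lines added throughout this commit presuppose a small constants module along these lines; its contents are not shown in the diff, and the value 16 is only inferred from the documentation change above.

```python
# stringsight/constants.py (assumed shape, not part of this diff)
# Single home for the parallelism default shared by the CLI scripts,
# the API layer, and the clusterers.

# Default number of concurrent LLM / embedding requests.
# Inferred from the docs change (8 -> 16); the real value may differ.
DEFAULT_MAX_WORKERS: int = 16
```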

scripts/run_full_pipeline.py

Lines changed: 3 additions & 2 deletions
@@ -22,6 +22,7 @@
 from stringsight.core.preprocessing import sample_prompts_evenly
 from stringsight.core.data_objects import PropertyDataset
 from typing import Optional, Dict, Any, Tuple, List
+from stringsight.constants import DEFAULT_MAX_WORKERS
 
 
 def load_dataset(
@@ -320,8 +321,8 @@ def main():
                         help="Minimum cluster size (default: 15)")
     parser.add_argument("--embedding_model", type=str, default="text-embedding-3-large",
                         help="Embedding model to use (default: openai)")
-    parser.add_argument("--max_workers", type=int, default=64,
-                        help="Maximum number of workers (default: 64)")
+    parser.add_argument("--max_workers", type=int, default=DEFAULT_MAX_WORKERS,
+                        help=f"Maximum number of workers (default: {DEFAULT_MAX_WORKERS})")
     parser.add_argument("--sample_size", type=int, default=None,
                         help="Sample size to use (default: use full dataset)")
 
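
Taken on its own, the argparse pattern introduced here (one constant used as both the default and the interpolated help text) reduces to the following runnable sketch; the flag name matches the script, everything else is trimmed.

```python
# Minimal sketch of the shared-default argparse pattern used by the scripts.
import argparse

DEFAULT_MAX_WORKERS = 16  # stand-in for stringsight.constants.DEFAULT_MAX_WORKERS

parser = argparse.ArgumentParser(description="reduced example")
parser.add_argument("--max_workers", type=int, default=DEFAULT_MAX_WORKERS,
                    help=f"Maximum number of workers (default: {DEFAULT_MAX_WORKERS})")

args = parser.parse_args([])  # no CLI override, so the shared default applies
print(args.max_workers)       # -> 16
```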

scripts/run_label_pipeline.py

Lines changed: 3 additions & 1 deletion
@@ -10,6 +10,7 @@
 from stringsight import label
 import json
 from stringsight.core.preprocessing import sample_prompts_evenly
+from stringsight.constants import DEFAULT_MAX_WORKERS
 
 # -----------------------------------------------------------------------------
 # Default taxonomy – feel free to modify / replace via --taxonomy_file later.
@@ -58,7 +59,7 @@ def main() -> None:
     parser.add_argument("--output_dir", required=True, help="Directory to write results")
     parser.add_argument("--model_name", default="gpt-4.1", help="Labeling model (OpenAI)")
     parser.add_argument("--sample_size", type=int, default=None, help="Optional subsample for quick runs")
-    parser.add_argument("--max_workers", type=int, default=64, help="Parallel requests to OpenAI")
+    parser.add_argument("--max_workers", type=int, default=DEFAULT_MAX_WORKERS, help="Parallel requests to OpenAI")
     parser.add_argument("--bootstrap_samples", type=int, default=100, help="Number of bootstrap samples")
     args = parser.parse_args()
 
@@ -73,6 +74,7 @@ def main() -> None:
         df,
         taxonomy=MAST_TAXONOMY,
         model_name=args.model_name,
+        max_workers=args.max_workers,
         output_dir=args.output_dir,
         metrics_kwargs={
             "compute_bootstrap": True, # Enable bootstrap for FunctionalMetrics

scripts/run_pipeline.py

Lines changed: 3 additions & 2 deletions
@@ -12,6 +12,7 @@
 from stringsight import compute_metrics_only
 import pandas as pd
 import json
+from stringsight.constants import DEFAULT_MAX_WORKERS
 
 def main():
     """Main function for webdev dataset processing."""
@@ -39,8 +40,8 @@ def main():
                         help="Minimum cluster size (default: 8)")
     parser.add_argument("--max_coarse_clusters", type=int, default=12,
                         help="Maximum number of coarse clusters (default: 12)")
-    parser.add_argument("--max_workers", type=int, default=64,
-                        help="Maximum number of workers (default: 16)")
+    parser.add_argument("--max_workers", type=int, default=DEFAULT_MAX_WORKERS,
+                        help=f"Maximum number of workers (default: {DEFAULT_MAX_WORKERS})")
 
     # Flags
     parser.add_argument("--hierarchical", action="store_true",

stringsight/api.py

Lines changed: 2 additions & 2 deletions
@@ -37,11 +37,11 @@
 from stringsight.metrics.cluster_subset import enrich_clusters_with_metrics, compute_total_conversations_by_model, prepare_long_frame, compute_subset_metrics
 from stringsight.logging_config import get_logger
 from stringsight.schemas import ClusterRunRequest
+from stringsight.constants import DEFAULT_MAX_WORKERS
 import threading, uuid
 from dataclasses import dataclass, field
 from functools import lru_cache
 from datetime import datetime, timedelta
-from datetime import datetime, timedelta
 import hashlib
 
 logger = get_logger(__name__)
@@ -218,7 +218,7 @@ class ExtractSingleRequest(BaseModel):
     temperature: float | None = 0.7
     top_p: float | None = 0.95
     max_tokens: int | None = 16000
-    max_workers: int | None = 128
+    max_workers: int | None = DEFAULT_MAX_WORKERS
     include_scores_in_prompt: bool | None = False
     use_wandb: bool | None = False
     output_dir: str | None = None
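
A side effect of the `ExtractSingleRequest` change is that requests omitting `max_workers` now fall back to the shared constant instead of a hard-coded 128. A reduced, self-contained sketch (pydantic v2 style, constant value assumed):

```python
# Reduced sketch of the request-model default; only two fields are kept and
# DEFAULT_MAX_WORKERS stands in for the value imported from stringsight.constants.
from pydantic import BaseModel

DEFAULT_MAX_WORKERS = 16  # assumed value

class ExtractSingleRequest(BaseModel):
    temperature: float | None = 0.7
    max_workers: int | None = DEFAULT_MAX_WORKERS

print(ExtractSingleRequest().max_workers)               # falls back to 16
print(ExtractSingleRequest(max_workers=4).max_workers)  # explicit override wins
```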

stringsight/clusterers/base.py

Lines changed: 2 additions & 1 deletion
@@ -7,6 +7,7 @@
 import pandas as pd
 import litellm
 from ..core.llm_utils import parallel_completions_async, LLMConfig
+from ..constants import DEFAULT_MAX_WORKERS
 
 from ..core.stage import PipelineStage
 from ..core.data_objects import PropertyDataset, Cluster
@@ -135,7 +136,7 @@ async def prettify_labels(df: pd.DataFrame, column_name: str, config: ClusterCon
         labels_to_process,
         model=config.summary_model,
         system_prompt=system_prompt,
-        max_workers=min(getattr(config, "llm_max_workers", 64), len(labels_to_process)),
+        max_workers=min(getattr(config, "llm_max_workers", DEFAULT_MAX_WORKERS), len(labels_to_process)),
         show_progress=True,
         progress_desc="Prettifying cluster labels"
     )
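
The `min(...)` guard in `prettify_labels` just keeps the worker count from exceeding the number of labels; stripped of the surrounding pipeline it behaves like this (the config object here is a made-up stand-in):

```python
# Standalone illustration of capping workers by task count.
DEFAULT_MAX_WORKERS = 16  # stand-in for ..constants.DEFAULT_MAX_WORKERS

class FakeClusterConfig:          # made-up stand-in for ClusterConfig
    llm_max_workers = 64

labels_to_process = ["cluster a", "cluster b", "cluster c"]
max_workers = min(
    getattr(FakeClusterConfig, "llm_max_workers", DEFAULT_MAX_WORKERS),
    len(labels_to_process),
)
print(max_workers)  # -> 3, never more workers than labels
```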

stringsight/clusterers/clustering_utils.py

Lines changed: 5 additions & 4 deletions
@@ -30,6 +30,7 @@
 # sentence-transformers is optional - imported lazily when needed
 from stringsight.prompts.clustering.prompts import clustering_systems_prompt, coarse_clustering_systems_prompt, deduplication_clustering_systems_prompt
 from stringsight.logging_config import get_logger
+from stringsight.constants import DEFAULT_MAX_WORKERS
 from ..utils.validation import validate_openai_api_key
 
 logger = get_logger(__name__)
@@ -160,7 +161,7 @@ def _get_openai_embeddings_batch(batch: List[str], model: str, retries: int = 3,
             time.sleep(actual_sleep)
 
 
-def _get_openai_embeddings(texts: List[str], *, model: str = "openai/text-embedding-3-large", batch_size: int = 100, max_workers: int = 64) -> List[List[float]]:
+def _get_openai_embeddings(texts: List[str], *, model: str = "openai/text-embedding-3-large", batch_size: int = 100, max_workers: int = DEFAULT_MAX_WORKERS) -> List[List[float]]:
     """Get embeddings for *texts* from the OpenAI API whilst preserving order."""
 
     if not texts:
@@ -405,7 +406,7 @@ async def assign_fine_to_coarse(
     model: str = "gpt-4.1-mini",
     strategy: str = "llm",
     verbose: bool = True,
-    max_workers: int = 64,
+    max_workers: int = DEFAULT_MAX_WORKERS,
 ) -> Dict[str, str]:
     """Assign each fine cluster name to one of the coarse cluster names.
 
@@ -489,7 +490,7 @@ def match_label_names(label_name, label_options):
         return option
     return None
 
-async def llm_match(cluster_names, coarse_cluster_names, max_workers=16, model="gpt-4.1-mini"):
+async def llm_match(cluster_names, coarse_cluster_names, max_workers=DEFAULT_MAX_WORKERS, model="gpt-4.1-mini"):
     """Match fine-grained cluster names to coarse-grained cluster names using an LLM with parallel processing."""
     coarse_names_text = "\n".join(coarse_cluster_names)
 
@@ -611,7 +612,7 @@ def _get_openai_embeddings_batch_litellm(batch, retries=3, sleep_time=2.0):
 
 
 # NOTE: renamed to avoid overriding the DiskCache-cached version defined earlier
-def _get_openai_embeddings_litellm(texts, batch_size=100, max_workers=16):
+def _get_openai_embeddings_litellm(texts, batch_size=100, max_workers=DEFAULT_MAX_WORKERS):
     """Get embeddings using OpenAI API (LiteLLM cache)."""
 
     if not texts:
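
Only the signature of `_get_openai_embeddings` is visible in this hunk; the batching-plus-worker-pool shape it implies could be sketched as below. This is an illustration of the pattern, not the project's implementation, and `embed_batch` is a placeholder for the real API call with retries and caching.

```python
# Illustrative sketch of order-preserving, batched, parallel embedding lookups.
from concurrent.futures import ThreadPoolExecutor
from typing import List

DEFAULT_MAX_WORKERS = 16  # stand-in for stringsight.constants.DEFAULT_MAX_WORKERS

def embed_batch(batch: List[str]) -> List[List[float]]:
    # placeholder: the real code would call the OpenAI embeddings API here
    return [[float(len(text))] for text in batch]

def get_embeddings(texts: List[str], *, batch_size: int = 100,
                   max_workers: int = DEFAULT_MAX_WORKERS) -> List[List[float]]:
    if not texts:
        return []
    batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
    # executor.map preserves submission order, matching the docstring's guarantee
    with ThreadPoolExecutor(max_workers=min(max_workers, len(batches))) as pool:
        per_batch = list(pool.map(embed_batch, batches))
    return [vec for batch in per_batch for vec in batch]

print(len(get_embeddings(["a", "bb", "ccc"], batch_size=2)))  # -> 3
```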

stringsight/clusterers/config.py

Lines changed: 2 additions & 1 deletion
@@ -3,6 +3,7 @@
 from dataclasses import dataclass
 from typing import Optional, Dict, Union, List, Any
 import numpy as np
+from ..constants import DEFAULT_MAX_WORKERS
 
 
 def _cuda_available() -> bool:
@@ -57,7 +58,7 @@ class ClusterConfig:
     summary_model: str = "gpt-4.1"
     cluster_assignment_model: str = "gpt-4.1-mini"
     # Parallelism for LLM calls used during clustering (summaries, matching, prettify)
-    llm_max_workers: int = 64
+    llm_max_workers: int = DEFAULT_MAX_WORKERS
 
     # GPU acceleration (auto-detected by default)
     use_gpu: bool | None = None  # None means auto-detect; will be set in __post_init__
