Commit 572b703

Update frontend submodule

1 parent: a36f712

26 files changed (+325, -164 lines)

docs/user-guide/basic-usage.md

Lines changed: 1 addition & 1 deletion
@@ -125,7 +125,7 @@ clustered_df, model_stats = label(
 
 **Other Parameters:**
 - `temperature`: Temperature for classification (default: `0.0`)
-- `max_workers`: Parallel workers (default: `8`)
+- `max_workers`: Parallel workers (default: `16`)
 - `verbose`: Print progress information (default: `True`)
 
 ### Example
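
For orientation, a minimal call that exercises the re-documented parameter might look like the sketch below; the `label(...)` keyword names are taken from this page and the scripts further down, while the DataFrame columns are made-up placeholders.

```python
# Hypothetical usage sketch: keyword arguments mirror the docs and scripts in
# this commit; the DataFrame columns are placeholders, not the required schema.
import pandas as pd
from stringsight import label

df = pd.DataFrame({
    "prompt": ["What is 2 + 2?"],
    "model": ["gpt-4.1"],
    "model_response": ["4"],
})

clustered_df, model_stats = label(
    df,
    taxonomy={"math_error": "The response contains an arithmetic mistake."},
    model_name="gpt-4.1",
    temperature=0.0,    # default per the docs
    max_workers=16,     # new documented default; lower it if you hit rate limits
    verbose=True,
)
```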

requirements.txt

Lines changed: 0 additions & 1 deletion
@@ -34,5 +34,4 @@ alembic>=1.13.0
 psycopg2-binary>=2.9.0
 redis>=5.0.0
 celery[redis]>=5.3.0
-boto3>=1.34.0
 pydantic-settings>=2.1.0

scripts/run_from_config.py

Lines changed: 4 additions & 3 deletions
@@ -23,6 +23,7 @@
 
 from scripts.run_full_pipeline import run_pipeline, load_dataset
 from stringsight import label
+from stringsight.constants import DEFAULT_MAX_WORKERS
 
 
 def _load_taxonomy(taxonomy_spec: Any) -> Dict[str, str]:
@@ -126,7 +127,7 @@ def run_label_pipeline(
     temperature: float = 0.0,
     top_p: float = 1.0,
     max_tokens: int = 2048,
-    max_workers: int = 64,
+    max_workers: int = DEFAULT_MAX_WORKERS,
     use_wandb: bool = True,
     verbose: bool = False,
     sample_size: Optional[int] = None,
@@ -479,7 +480,7 @@ def main() -> Tuple[Any, Any]:
         temperature=temperature,
         top_p=top_p,
         max_tokens=max_tokens,
-        max_workers=cfg.get("max_workers", 64),
+        max_workers=cfg.get("max_workers", DEFAULT_MAX_WORKERS),
         use_wandb=use_wandb_flag,
         verbose=verbose,
         sample_size=sample_size,
@@ -510,7 +511,7 @@ def main() -> Tuple[Any, Any]:
         clusterer=cfg.get("clusterer", "hdbscan"),
         min_cluster_size=cfg.get("min_cluster_size", 15),
         embedding_model=cfg.get("embedding_model", "text-embedding-3-large"),
-        max_workers=cfg.get("max_workers", 64),
+        max_workers=cfg.get("max_workers", DEFAULT_MAX_WORKERS),
         use_wandb=use_wandb_flag,
         verbose=verbose,
         sample_size=cfg.get("sample_size"),
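
The `from stringsight.constants import DEFAULT_MAX_WORKERS` lines added throughout this commit presuppose a small constants module along these lines; its contents are not shown in the diff, and the value 16 is only inferred from the documentation change above.

```python
# stringsight/constants.py (assumed shape, not part of this diff)
# Single home for the parallelism default shared by the CLI scripts,
# the API layer, and the clusterers.

# Default number of concurrent LLM / embedding requests.
# Inferred from the docs change (8 -> 16); the real value may differ.
DEFAULT_MAX_WORKERS: int = 16
```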

scripts/run_full_pipeline.py

Lines changed: 3 additions & 2 deletions
@@ -22,6 +22,7 @@
 from stringsight.core.preprocessing import sample_prompts_evenly
 from stringsight.core.data_objects import PropertyDataset
 from typing import Optional, Dict, Any, Tuple, List
+from stringsight.constants import DEFAULT_MAX_WORKERS
 
 
 def load_dataset(
@@ -320,8 +321,8 @@ def main():
                         help="Minimum cluster size (default: 15)")
     parser.add_argument("--embedding_model", type=str, default="text-embedding-3-large",
                         help="Embedding model to use (default: openai)")
-    parser.add_argument("--max_workers", type=int, default=64,
-                        help="Maximum number of workers (default: 64)")
+    parser.add_argument("--max_workers", type=int, default=DEFAULT_MAX_WORKERS,
+                        help=f"Maximum number of workers (default: {DEFAULT_MAX_WORKERS})")
     parser.add_argument("--sample_size", type=int, default=None,
                         help="Sample size to use (default: use full dataset)")
 
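
Taken on its own, the argparse pattern introduced here (one constant used as both the default and the interpolated help text) reduces to the following runnable sketch; the flag name matches the script, everything else is trimmed.

```python
# Minimal sketch of the shared-default argparse pattern used by the scripts.
import argparse

DEFAULT_MAX_WORKERS = 16  # stand-in for stringsight.constants.DEFAULT_MAX_WORKERS

parser = argparse.ArgumentParser(description="reduced example")
parser.add_argument("--max_workers", type=int, default=DEFAULT_MAX_WORKERS,
                    help=f"Maximum number of workers (default: {DEFAULT_MAX_WORKERS})")

args = parser.parse_args([])  # no CLI override, so the shared default applies
print(args.max_workers)       # -> 16
```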

scripts/run_label_pipeline.py

Lines changed: 3 additions & 1 deletion
@@ -10,6 +10,7 @@
 from stringsight import label
 import json
 from stringsight.core.preprocessing import sample_prompts_evenly
+from stringsight.constants import DEFAULT_MAX_WORKERS
 
 # -----------------------------------------------------------------------------
 # Default taxonomy – feel free to modify / replace via --taxonomy_file later.
@@ -58,7 +59,7 @@ def main() -> None:
     parser.add_argument("--output_dir", required=True, help="Directory to write results")
     parser.add_argument("--model_name", default="gpt-4.1", help="Labeling model (OpenAI)")
     parser.add_argument("--sample_size", type=int, default=None, help="Optional subsample for quick runs")
-    parser.add_argument("--max_workers", type=int, default=64, help="Parallel requests to OpenAI")
+    parser.add_argument("--max_workers", type=int, default=DEFAULT_MAX_WORKERS, help="Parallel requests to OpenAI")
     parser.add_argument("--bootstrap_samples", type=int, default=100, help="Number of bootstrap samples")
     args = parser.parse_args()
 
@@ -73,6 +74,7 @@ def main() -> None:
         df,
         taxonomy=MAST_TAXONOMY,
         model_name=args.model_name,
+        max_workers=args.max_workers,
         output_dir=args.output_dir,
         metrics_kwargs={
             "compute_bootstrap": True, # Enable bootstrap for FunctionalMetrics

scripts/run_pipeline.py

Lines changed: 3 additions & 2 deletions
@@ -12,6 +12,7 @@
 from stringsight import compute_metrics_only
 import pandas as pd
 import json
+from stringsight.constants import DEFAULT_MAX_WORKERS
 
 def main():
     """Main function for webdev dataset processing."""
@@ -39,8 +40,8 @@ def main():
                         help="Minimum cluster size (default: 8)")
     parser.add_argument("--max_coarse_clusters", type=int, default=12,
                         help="Maximum number of coarse clusters (default: 12)")
-    parser.add_argument("--max_workers", type=int, default=64,
-                        help="Maximum number of workers (default: 16)")
+    parser.add_argument("--max_workers", type=int, default=DEFAULT_MAX_WORKERS,
+                        help=f"Maximum number of workers (default: {DEFAULT_MAX_WORKERS})")
 
     # Flags
     parser.add_argument("--hierarchical", action="store_true",

stringsight/api.py

Lines changed: 2 additions & 2 deletions
@@ -37,11 +37,11 @@
 from stringsight.metrics.cluster_subset import enrich_clusters_with_metrics, compute_total_conversations_by_model, prepare_long_frame, compute_subset_metrics
 from stringsight.logging_config import get_logger
 from stringsight.schemas import ClusterRunRequest
+from stringsight.constants import DEFAULT_MAX_WORKERS
 import threading, uuid
 from dataclasses import dataclass, field
 from functools import lru_cache
 from datetime import datetime, timedelta
-from datetime import datetime, timedelta
 import hashlib
 
 logger = get_logger(__name__)
@@ -218,7 +218,7 @@ class ExtractSingleRequest(BaseModel):
     temperature: float | None = 0.7
     top_p: float | None = 0.95
     max_tokens: int | None = 16000
-    max_workers: int | None = 128
+    max_workers: int | None = DEFAULT_MAX_WORKERS
     include_scores_in_prompt: bool | None = False
     use_wandb: bool | None = False
     output_dir: str | None = None
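
A side effect of the `ExtractSingleRequest` change is that requests omitting `max_workers` now fall back to the shared constant instead of a hard-coded 128. A reduced, self-contained sketch (pydantic v2 style, constant value assumed):

```python
# Reduced sketch of the request-model default; only two fields are kept and
# DEFAULT_MAX_WORKERS stands in for the value imported from stringsight.constants.
from pydantic import BaseModel

DEFAULT_MAX_WORKERS = 16  # assumed value

class ExtractSingleRequest(BaseModel):
    temperature: float | None = 0.7
    max_workers: int | None = DEFAULT_MAX_WORKERS

print(ExtractSingleRequest().max_workers)               # falls back to 16
print(ExtractSingleRequest(max_workers=4).max_workers)  # explicit override wins
```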

stringsight/clusterers/base.py

Lines changed: 2 additions & 1 deletion
@@ -7,6 +7,7 @@
 import pandas as pd
 import litellm
 from ..core.llm_utils import parallel_completions_async, LLMConfig
+from ..constants import DEFAULT_MAX_WORKERS
 
 from ..core.stage import PipelineStage
 from ..core.data_objects import PropertyDataset, Cluster
@@ -135,7 +136,7 @@ async def prettify_labels(df: pd.DataFrame, column_name: str, config: ClusterCon
         labels_to_process,
         model=config.summary_model,
         system_prompt=system_prompt,
-        max_workers=min(getattr(config, "llm_max_workers", 64), len(labels_to_process)),
+        max_workers=min(getattr(config, "llm_max_workers", DEFAULT_MAX_WORKERS), len(labels_to_process)),
         show_progress=True,
         progress_desc="Prettifying cluster labels"
     )
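
The `min(...)` guard in `prettify_labels` just keeps the worker count from exceeding the number of labels; stripped of the surrounding pipeline it behaves like this (the config object here is a made-up stand-in):

```python
# Standalone illustration of capping workers by task count.
DEFAULT_MAX_WORKERS = 16  # stand-in for ..constants.DEFAULT_MAX_WORKERS

class FakeClusterConfig:          # made-up stand-in for ClusterConfig
    llm_max_workers = 64

labels_to_process = ["cluster a", "cluster b", "cluster c"]
max_workers = min(
    getattr(FakeClusterConfig, "llm_max_workers", DEFAULT_MAX_WORKERS),
    len(labels_to_process),
)
print(max_workers)  # -> 3, never more workers than labels
```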

stringsight/clusterers/clustering_utils.py

Lines changed: 5 additions & 4 deletions
@@ -30,6 +30,7 @@
 # sentence-transformers is optional - imported lazily when needed
 from stringsight.prompts.clustering.prompts import clustering_systems_prompt, coarse_clustering_systems_prompt, deduplication_clustering_systems_prompt
 from stringsight.logging_config import get_logger
+from stringsight.constants import DEFAULT_MAX_WORKERS
 from ..utils.validation import validate_openai_api_key
 
 logger = get_logger(__name__)
@@ -160,7 +161,7 @@ def _get_openai_embeddings_batch(batch: List[str], model: str, retries: int = 3,
             time.sleep(actual_sleep)
 
 
-def _get_openai_embeddings(texts: List[str], *, model: str = "openai/text-embedding-3-large", batch_size: int = 100, max_workers: int = 64) -> List[List[float]]:
+def _get_openai_embeddings(texts: List[str], *, model: str = "openai/text-embedding-3-large", batch_size: int = 100, max_workers: int = DEFAULT_MAX_WORKERS) -> List[List[float]]:
     """Get embeddings for *texts* from the OpenAI API whilst preserving order."""
 
     if not texts:
@@ -405,7 +406,7 @@ async def assign_fine_to_coarse(
     model: str = "gpt-4.1-mini",
     strategy: str = "llm",
     verbose: bool = True,
-    max_workers: int = 64,
+    max_workers: int = DEFAULT_MAX_WORKERS,
 ) -> Dict[str, str]:
     """Assign each fine cluster name to one of the coarse cluster names.
 
@@ -489,7 +490,7 @@ def match_label_names(label_name, label_options):
         return option
     return None
 
-async def llm_match(cluster_names, coarse_cluster_names, max_workers=16, model="gpt-4.1-mini"):
+async def llm_match(cluster_names, coarse_cluster_names, max_workers=DEFAULT_MAX_WORKERS, model="gpt-4.1-mini"):
     """Match fine-grained cluster names to coarse-grained cluster names using an LLM with parallel processing."""
     coarse_names_text = "\n".join(coarse_cluster_names)
 
@@ -611,7 +612,7 @@ def _get_openai_embeddings_batch_litellm(batch, retries=3, sleep_time=2.0):
 
 
 # NOTE: renamed to avoid overriding the DiskCache-cached version defined earlier
-def _get_openai_embeddings_litellm(texts, batch_size=100, max_workers=16):
+def _get_openai_embeddings_litellm(texts, batch_size=100, max_workers=DEFAULT_MAX_WORKERS):
     """Get embeddings using OpenAI API (LiteLLM cache)."""
 
     if not texts:
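
Only the signature of `_get_openai_embeddings` is visible in this hunk; the batching-plus-worker-pool shape it implies could be sketched as below. This is an illustration of the pattern, not the project's implementation, and `embed_batch` is a placeholder for the real API call with retries and caching.

```python
# Illustrative sketch of order-preserving, batched, parallel embedding lookups.
from concurrent.futures import ThreadPoolExecutor
from typing import List

DEFAULT_MAX_WORKERS = 16  # stand-in for stringsight.constants.DEFAULT_MAX_WORKERS

def embed_batch(batch: List[str]) -> List[List[float]]:
    # placeholder: the real code would call the OpenAI embeddings API here
    return [[float(len(text))] for text in batch]

def get_embeddings(texts: List[str], *, batch_size: int = 100,
                   max_workers: int = DEFAULT_MAX_WORKERS) -> List[List[float]]:
    if not texts:
        return []
    batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
    # executor.map preserves submission order, matching the docstring's guarantee
    with ThreadPoolExecutor(max_workers=min(max_workers, len(batches))) as pool:
        per_batch = list(pool.map(embed_batch, batches))
    return [vec for batch in per_batch for vec in batch]

print(len(get_embeddings(["a", "bb", "ccc"], batch_size=2)))  # -> 3
```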

stringsight/clusterers/config.py

Lines changed: 2 additions & 1 deletion
@@ -3,6 +3,7 @@
 from dataclasses import dataclass
 from typing import Optional, Dict, Union, List, Any
 import numpy as np
+from ..constants import DEFAULT_MAX_WORKERS
 
 
 def _cuda_available() -> bool:
@@ -57,7 +58,7 @@ class ClusterConfig:
     summary_model: str = "gpt-4.1"
     cluster_assignment_model: str = "gpt-4.1-mini"
     # Parallelism for LLM calls used during clustering (summaries, matching, prettify)
-    llm_max_workers: int = 64
+    llm_max_workers: int = DEFAULT_MAX_WORKERS
 
     # GPU acceleration (auto-detected by default)
     use_gpu: bool | None = None  # None means auto-detect; will be set in __post_init__
