
Commit 328e935

added starter notebook
1 parent 1aa520e commit 328e935

25 files changed (+1472 −65 lines)

README.md

Lines changed: 69 additions & 0 deletions

````diff
@@ -35,6 +35,8 @@ export GOOGLE_API_KEY="your-google-key" # optional
 
 ## Quick Start
 
+For a comprehensive tutorial with detailed explanations, see [starter_notebook.ipynb](starter_notebook.ipynb).
+
 ### 1. Extract and Cluster Properties with `explain()`
 
 ```python
@@ -90,6 +92,63 @@ clustered_df, model_stats = explain(
 )
 ```
 
+### Using Custom Column Names
+
+If your dataframe uses different column names, map them with the column-mapping parameters:
+
+```python
+# Your dataframe has custom column names
+df = pd.DataFrame({
+    "input": ["What is ML?", "Explain QC"],
+    "llm_name": ["gpt-4", "gpt-4"],
+    "output": ["ML is...", "QC uses..."],
+    "accuracy": [0.95, 0.88],
+    "helpfulness": [4.2, 4.5]
+})
+
+# Map custom column names to the expected StringSight names
+clustered_df, model_stats = explain(
+    df,
+    prompt_column="input",           # Map "input" → "prompt"
+    model_column="llm_name",         # Map "llm_name" → "model"
+    model_response_column="output",  # Map "output" → "model_response"
+    score_columns=["accuracy", "helpfulness"],
+    output_dir="results/test"
+)
+```
+
+For side-by-side comparisons with custom column names:
+
+```python
+df = pd.DataFrame({
+    "query": ["What is ML?", "Explain QC"],
+    "model_1": ["gpt-4", "gpt-4"],
+    "model_2": ["claude-3", "claude-3"],
+    "response_1": ["ML is...", "QC uses..."],
+    "response_2": ["ML involves...", "QC leverages..."],
+    "accuracy_a": [0.95, 0.88],
+    "accuracy_b": [0.92, 0.85]
+})
+
+clustered_df, model_stats = explain(
+    df,
+    method="side_by_side",
+    prompt_column="query",                 # Map "query" → "prompt"
+    model_a_column="model_1",              # Map "model_1" → "model_a"
+    model_b_column="model_2",              # Map "model_2" → "model_b"
+    model_a_response_column="response_1",  # Map "response_1" → "model_a_response"
+    model_b_response_column="response_2",  # Map "response_2" → "model_b_response"
+    score_columns=["accuracy_a", "accuracy_b"],  # Note: score columns need _a/_b suffixes
+    output_dir="results/test"
+)
+```
+
+**Note:** The default column names are:
+- `prompt`, `model`, `model_response`, `question_id` (optional) for single_model
+- `prompt`, `model_a`, `model_b`, `model_a_response`, `model_b_response`, `question_id` (optional) for side_by_side
+
+If your columns already match these names, you don't need to specify any mapping parameters.
+
 ### 2. Fixed Taxonomy Labeling with `label()`
 
 When you know exactly which behavioral axes you care about:
@@ -140,6 +199,10 @@ Use the React frontend or other visualization tools to explore your results.
 |--------|-------------|---------|
 | `score` | Evaluation metrics dictionary | `{"accuracy": 0.85, "helpfulness": 4.2}` |
 | `score_columns` | Alternative: separate columns for each metric (e.g., `accuracy`, `helpfulness`) instead of a dict | `score_columns=["accuracy", "helpfulness"]` |
+| `prompt_column` | Name of the prompt column in your dataframe (default: `"prompt"`) | `prompt_column="input"` |
+| `model_column` | Name of the model column for single_model (default: `"model"`) | `model_column="llm_name"` |
+| `model_response_column` | Name of the model response column for single_model (default: `"model_response"`) | `model_response_column="output"` |
+| `question_id_column` | Name of the question_id column (default: `"question_id"` if the column exists) | `question_id_column="qid"` |
 
 ### Side-by-Side Comparisons
 
@@ -159,6 +222,12 @@ Use the React frontend or other visualization tools to explore your results.
 |--------|-------------|---------|
 | `score` | Winner and metrics | `{"winner": "model_a", "helpfulness_a": 4.2, "helpfulness_b": 3.8}` |
 | `score_columns` | Alternative: separate columns for each metric with `_a` and `_b` suffixes (e.g., `accuracy_a`, `accuracy_b`) | `score_columns=["accuracy_a", "accuracy_b", "helpfulness_a", "helpfulness_b"]` |
+| `prompt_column` | Name of the prompt column in your dataframe (default: `"prompt"`) | `prompt_column="query"` |
+| `model_a_column` | Name of the model_a column (default: `"model_a"`) | `model_a_column="model_1"` |
+| `model_b_column` | Name of the model_b column (default: `"model_b"`) | `model_b_column="model_2"` |
+| `model_a_response_column` | Name of the model_a_response column (default: `"model_a_response"`) | `model_a_response_column="response_1"` |
+| `model_b_response_column` | Name of the model_b_response column (default: `"model_b_response"`) | `model_b_response_column="response_2"` |
+| `question_id_column` | Name of the question_id column (default: `"question_id"` if the column exists) | `question_id_column="qid"` |
 
 **Option 2: Tidy Data (Auto-pairing)**
 
````
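The `score` / `score_columns` rows in the tables above describe two interchangeable metric layouts: one dict per row, or one column per metric. A minimal stdlib-only sketch of the conversion between them, using plain dicts as stand-ins for dataframe rows (the helper `to_score_dicts` is illustrative, not part of StringSight):

```python
# Convert per-metric columns (score_columns style) into a single
# "score" dict per row (score style). Plain dicts stand in for
# dataframe rows; explain() itself is not called here.
rows = [
    {"prompt": "What is ML?", "model": "gpt-4", "accuracy": 0.95, "helpfulness": 4.2},
    {"prompt": "Explain QC", "model": "gpt-4", "accuracy": 0.88, "helpfulness": 4.5},
]
score_columns = ["accuracy", "helpfulness"]

def to_score_dicts(rows, score_columns):
    """Move the metric fields of each row into one 'score' dict."""
    out = []
    for row in rows:
        row = dict(row)  # copy so the caller's rows are untouched
        row["score"] = {c: row.pop(c) for c in score_columns}
        out.append(row)
    return out

converted = to_score_dicts(rows, score_columns)
print(converted[0]["score"])  # {'accuracy': 0.95, 'helpfulness': 4.2}
```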
pyproject.toml

Lines changed: 1 addition & 1 deletion

```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "stringsight"
-version = "0.3.1"
+version = "0.3.2"
 authors = [
     {name = "Lisa Dunlap", email = "[email protected]"},
 ]
```
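The bump from 0.3.1 to 0.3.2 is a patch release. A small aside on comparing such dotted version strings: they order correctly only as numeric tuples, not as plain strings (the `version_key` helper below is illustrative, not part of stringsight):

```python
def version_key(version: str) -> tuple:
    """Split a dotted version string into a tuple of ints for comparison."""
    return tuple(int(part) for part in version.split("."))

# Numeric tuple comparison gives the expected ordering for this release.
assert version_key("0.3.2") > version_key("0.3.1")

# String comparison breaks once a component reaches two digits:
assert "0.10.0" < "0.9.0"                            # lexicographic: wrong order
assert version_key("0.10.0") > version_key("0.9.0")  # numeric: correct
```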

scripts/dataset_configs/aci_bench.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -3,7 +3,7 @@ output_dir: results/aci_bench
 method: single_model
 min_cluster_size: 12
 embedding_model: text-embedding-3-large
-max_workers: 64
+max_workers: 16
 groupby_column: behavior_type
 assign_outliers: false
 task_description: |
```

scripts/dataset_configs/instructeval.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -6,6 +6,6 @@ model_b: openai/gpt-5-nano-2025-08-07
 min_cluster_size: 5
 sample_size: 50
 embedding_model: text-embedding-3-small
-max_workers: 64
+max_workers: 16
 groupby_column: behavior_type
 assign_outliers: false
```

scripts/dataset_configs/koala.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -3,7 +3,7 @@ output_dir: results/koala
 method: single_model
 min_cluster_size: 5
 embedding_model: text-embedding-3-small
-max_workers: 64
+max_workers: 16
 groupby_column: behavior_type
 assign_outliers: false
 
```

scripts/dataset_configs/medi_qa.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -3,7 +3,7 @@ output_dir: results/medi_qa
 method: single_model
 min_cluster_size: 5
 embedding_model: text-embedding-3-small
-max_workers: 64
+max_workers: 16
 groupby_column: behavior_type
 assign_outliers: false
 models:
```

scripts/dataset_configs/omni_math_gpt.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -3,7 +3,7 @@ output_dir: results/omni_math_gpt
 method: single_model
 min_cluster_size: 8
 embedding_model: text-embedding-3-large
-max_workers: 64
+max_workers: 16
 groupby_column: behavior_type
 assign_outliers: false
 sample_size: 50
```

scripts/dataset_configs/omni_math_top_models.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -3,7 +3,7 @@ output_dir: results/omni_math_top_models
 method: single_model
 min_cluster_size: 8
 embedding_model: text-embedding-3-large
-max_workers: 64
+max_workers: 16
 groupby_column: behavior_type
 assign_outliers: false
 models:
```

scripts/dataset_configs/safety.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -3,7 +3,7 @@ output_dir: results/safety
 method: single_model
 min_cluster_size: 5
 embedding_model: text-embedding-3-small
-max_workers: 64
+max_workers: 16
 groupby_column: behavior_type
 assign_outliers: false
 task_description: |
```

scripts/dataset_configs/taubench_airline.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -3,7 +3,7 @@ output_dir: results/taubench_airline_data
 method: single_model
 min_cluster_size: 5
 embedding_model: text-embedding-3-small
-max_workers: 64
+max_workers: 16
 groupby_column: behavior_type
 assign_outliers: false
 system_prompt: agent_system_prompt
```
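Every config change in this commit lowers `max_workers` from 64 to 16. A minimal sketch of how such a cap is typically consumed, using the stdlib's `ThreadPoolExecutor`; the `config` dict and `process` function are illustrative stand-ins, not StringSight internals:

```python
from concurrent.futures import ThreadPoolExecutor

# Illustrative stand-in for a parsed dataset config.
config = {"max_workers": 16, "min_cluster_size": 5}

def process(item: int) -> int:
    """Stand-in for per-item work (e.g., an embedding or LLM call)."""
    return item * item

items = list(range(100))

# max_workers bounds concurrency: at most 16 tasks are in flight at
# once, which helps stay under API rate limits and memory budgets.
with ThreadPoolExecutor(max_workers=config["max_workers"]) as pool:
    results = list(pool.map(process, items))

print(results[:5])  # [0, 1, 4, 9, 16]
```

Lowering the cap trades throughput for fewer simultaneous requests; `pool.map` still returns results in input order regardless of worker count.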
