Merged
1 change: 0 additions & 1 deletion .env

This file was deleted.

12 changes: 4 additions & 8 deletions .github/workflows/test.yml
@@ -27,19 +27,15 @@ jobs:
           python -m pip install --upgrade pip
           pip install -e .
           pip install -r requirements.txt
-          pip install pytest pytest-cov black mypy
-
-      - name: Check formatting with black
-        run: |
-          black --check stringsight/
+          pip install pytest pytest-cov mypy
 
       - name: Type check with mypy
         run: |
           mypy stringsight/ --ignore-missing-imports
 
-      - name: Test with pytest
-        run: |
-          pytest --cov=stringsight --cov-report=xml --cov-report=term-missing
+      # - name: Test with pytest
+      #   run: |
+      #     pytest --cov=stringsight --cov-report=xml --cov-report=term-missing
 
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v3
2 changes: 1 addition & 1 deletion Dockerfile
@@ -45,7 +45,7 @@ COPY scripts/ ./scripts/
 RUN useradd -m appuser && chown -R appuser:appuser /app
 
 # Copy entrypoint script
-COPY docker-entrypoint.sh /usr/local/bin/
+COPY docker/docker-entrypoint.sh /usr/local/bin/
 RUN chmod +x /usr/local/bin/docker-entrypoint.sh
 
 ENTRYPOINT ["docker-entrypoint.sh"]
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -4,5 +4,6 @@ include requirements.txt
 include config.yaml
 recursive-include stringsight *.py
 recursive-include stringsight/frontend_dist *
+recursive-exclude stringsight/frontend_dist/taubench_airline_data *
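This ordering matters: setuptools processes `MANIFEST.in` commands top to bottom, so a `recursive-exclude` only strips files that an earlier `recursive-include` pulled in. The relevant fragment of the resulting file would read (a sketch of the post-merge file, not the diff itself):

```
recursive-include stringsight/frontend_dist *
recursive-exclude stringsight/frontend_dist/taubench_airline_data *
```

If the exclude were listed before the include, the demo data would still land in the sdist.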


4 changes: 1 addition & 3 deletions README.md
@@ -30,9 +30,7 @@
 <strong>Annoyed at having to look through your long model conversations or agentic traces? Fear not, StringSight has come to ease your woes. Understand and compare model behavior by automatically extracting behavioral properties from their responses, grouping similar behaviors together, and quantifying how important these behaviors are.</strong>
 </p>
 
-<p align="center">
-  <img src="assets/ui_screenshot.png" alt="StringSight Teaser" width="800">
-</p>
+https://github.com/user-attachments/assets/200d3312-0805-43f4-8ce9-401544f03db2

## Installation & Quick Start

100 changes: 0 additions & 100 deletions airline_data_demo.jsonl

This file was deleted.

Empty file modified data/airline_data_demo.jsonl
100755 → 100644
Empty file.
8 changes: 4 additions & 4 deletions docker-compose.dev.yml → docker/docker-compose.dev.yml
@@ -1,15 +1,15 @@
 # Development configuration with volume mounts for live code reloading
 # Usage:
-#   Option 1: docker-compose -f docker-compose.yml -f docker-compose.dev.yml up
-#   Option 2: cp docker-compose.dev.yml docker-compose.override.yml && docker-compose up
+#   Option 1 (recommended): docker compose -f docker-compose.yml -f docker/docker-compose.dev.yml up
+#   Option 2 (auto-loaded override): cp docker/docker-compose.dev.yml docker-compose.override.yml && docker compose up
 
 version: '3.8'
 
 services:
   api:
     volumes:
       # Mount source code for live reload
-      - .:/app
+      - ..:/app
       # Exclude Python bytecode and build artifacts to avoid permission issues
       - /app/__pycache__
       - /app/.pytest_cache
@@ -21,7 +21,7 @@ services:
   worker:
     volumes:
       # Mount source code for live reload
-      - .:/app
+      - ..:/app
       # Exclude Python bytecode and build artifacts
       - /app/__pycache__
       - /app/.pytest_cache
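Option 2 works because `docker compose` automatically layers a `docker-compose.override.yml` found next to the base file on top of it. A minimal hand-written override that only remaps the source mount could look like this (a sketch: service names and paths mirror the diff above, and the behavior assumes Compose's standard file-merging rules, under which the override's volume entries are merged with the base file's):

```yaml
# docker-compose.override.yml (auto-loaded by `docker compose up`)
services:
  api:
    volumes:
      - ..:/app   # bind-mount the repo root for live reload
  worker:
    volumes:
      - ..:/app
```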
File renamed without changes.
File renamed without changes.
20 changes: 0 additions & 20 deletions docs/advanced/performance.md
@@ -152,26 +152,6 @@ for chunk in pd.read_csv("large_file.csv", chunksize=5000):
     result, _ = explain(chunk, output_dir="results/chunk")
 ```
-
-## Cost Estimation
-
-```python
-# Estimate costs before running
-num_conversations = len(df)
-avg_response_length = 500  # tokens
-
-# Extraction cost (input + output)
-extraction_cost = num_conversations * (
-    (avg_response_length / 1_000_000) * 3.50 +  # input
-    (200 / 1_000_000) * 14.00                   # output (estimated)
-)
-
-# Embedding cost
-num_properties = num_conversations * 1.5  # estimate
-embedding_cost = (num_properties * 50 / 1_000_000) * 0.02
-
-print(f"Estimated cost: ${extraction_cost + embedding_cost:.2f}")
-```
 
 ## Benchmarks
 
 Typical performance on common hardware:
4 changes: 2 additions & 2 deletions docs/getting-started/installation.md
@@ -143,10 +143,10 @@ For active development where you want code changes to reflect immediately:
 
 ```bash
 # Option 1: Use the dev compose file explicitly
-docker compose -f docker-compose.yml -f docker-compose.dev.yml up
+docker compose -f docker-compose.yml -f docker/docker-compose.dev.yml up
 
 # Option 2: Copy to override file (auto-loaded by docker compose)
-cp docker-compose.dev.yml docker-compose.override.yml
+cp docker/docker-compose.dev.yml docker-compose.override.yml
 docker compose up
 ```

2 changes: 1 addition & 1 deletion frontend
Submodule frontend updated 55 files
+0 −1 package.json
+2 −2 public/taubench_airline_data_sbs/cluster_scores.json
+2 −2 public/taubench_airline_data_sbs/cluster_scores_df.jsonl
+2 −2 public/taubench_airline_data_sbs/clustered_results.jsonl
+2 −2 public/taubench_airline_data_sbs/clustered_results_lightweight.jsonl
+2 −2 public/taubench_airline_data_sbs/clusters.jsonl
+2 −2 public/taubench_airline_data_sbs/conversation.jsonl
+3 −0 public/taubench_airline_data_sbs/full_dataset.json
+2 −2 public/taubench_airline_data_sbs/model_cluster_scores.json
+2 −2 public/taubench_airline_data_sbs/model_cluster_scores_df.jsonl
+2 −2 public/taubench_airline_data_sbs/model_scores.json
+2 −2 public/taubench_airline_data_sbs/model_scores_df.jsonl
+2 −2 public/taubench_airline_data_sbs/parsed_properties.jsonl
+2 −2 public/taubench_airline_data_sbs/parsing_stats.json
+2 −2 public/taubench_airline_data_sbs/properties.jsonl
+2 −2 public/taubench_airline_data_sbs/summary.txt
+2 −2 public/taubench_airline_data_sbs/summary_table.jsonl
+2 −2 public/taubench_airline_data_sbs/validated_properties.jsonl
+2 −2 public/taubench_airline_data_sbs/validation_stats.json
+104 −200 public/taubench_airline_sbs.jsonl
+0 −42 src/App.css
+102 −44 src/App.tsx
+0 −207 src/components/BenchmarkChart.tsx
+5 −1 src/components/ClusterSidecard.tsx
+2 −1 src/components/ClustersTab.tsx
+0 −351 src/components/ControlSidebar.tsx
+2 −2 src/components/DataTable.tsx
+2 −2 src/components/DemoModeSelector.tsx
+1 −40 src/components/PropertyTraceHeader.tsx
+0 −277 src/components/ServerBrowserDialog.tsx
+16 −32 src/components/SideBySideTrace.tsx
+2 −1 src/components/metrics/BenchmarkTable.tsx
+1 −1 src/components/metrics/ClusterPlotsSection.tsx
+6 −34 src/components/metrics/MetricsControlPanel.tsx
+5 −3 src/components/metrics/MetricsFilterBar.tsx
+3 −14 src/components/metrics/MetricsInsightsOverview.tsx
+12 −69 src/components/metrics/MetricsMainContent.tsx
+35 −2 src/components/metrics/MetricsTab.tsx
+2 −1 src/components/metrics/ModelComparisonTab.tsx
+3 −2 src/components/metrics/TopClustersSummary.tsx
+2 −1 src/components/metrics/charts/BenchmarkChart.tsx
+0 −409 src/components/metrics/charts/ClusterScatterGrid.tsx
+4 −5 src/components/metrics/charts/FrequencyChart.tsx
+2 −1 src/components/metrics/charts/FrequencyChartAlt.tsx
+0 −225 src/components/metrics/charts/FrequencyDeltaChart.tsx
+0 −275 src/components/metrics/charts/QualityChart.tsx
+4 −13 src/components/metrics/charts/QualityDeltaChart.tsx
+4 −13 src/components/metrics/charts/QualityDeltaChartAlt.tsx
+0 −1 src/components/metrics/charts/index.ts
+0 −1 src/components/metrics/types.ts
+608 −64 src/components/sidebar-sections/PropertyExtractionPanel.tsx
+0 −189 src/hooks/metrics/useMetricsData.tsx
+56 −34 src/lib/api.ts
+8 −0 src/lib/normalize.ts
+30 −2 src/types/metrics.ts
File renamed without changes.
8 changes: 8 additions & 0 deletions scripts/run_from_config.py
@@ -432,6 +432,7 @@ def main() -> Tuple[Any, Any]:
     # Determine wandb toggle: default ON unless explicitly disabled via CLI/YAML
     use_wandb_flag = not bool(cfg.get("disable_wandb", False))
 
+
     # Route to label() or explain() based on taxonomy presence
     if use_label_mode:
         # Load taxonomy
@@ -457,6 +458,9 @@
     top_p = cfg.get("label_top_p", 1.0)
     max_tokens = cfg.get("label_max_tokens", 2048)
 
+    if verbose:
+        print(f"Label model (model_name): {model_name}")
+
     # Extract metrics_kwargs if provided
     metrics_kwargs = cfg.get("metrics_kwargs")
@@ -492,6 +496,10 @@
         # Standard explain() mode
         if verbose:
             print("Running explain() mode (clustering-based analysis)")
+            print("Effective model configuration:")
+            print(f"  - extraction_model: {cfg.get('extraction_model')}")
+            print(f"  - summary_model: {cfg.get('summary_model')}")
+            print(f"  - cluster_assignment_model: {cfg.get('cluster_assignment_model')}")
 
     clustered_df, model_stats = run_pipeline(
         data_path=data_path,
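The verbose additions above only echo values out of the parsed config mapping; the underlying `dict.get` pattern can be sketched standalone (the `cfg` contents here are hypothetical, not StringSight's real config):

```python
# Hypothetical config mapping, as if parsed from a YAML file.
cfg = {"extraction_model": "gpt-4.1", "disable_wandb": False}

# Default ON unless explicitly disabled, mirroring the wandb toggle above.
use_wandb_flag = not bool(cfg.get("disable_wandb", False))

# Missing keys fall back to None, so the printout exposes unset models.
for key in ("extraction_model", "summary_model", "cluster_assignment_model"):
    print(f"  - {key}: {cfg.get(key)}")
```

Printing the unset keys as `None` (rather than raising `KeyError`) is what makes this a safe diagnostic to run before the pipeline starts.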
17 changes: 12 additions & 5 deletions scripts/run_full_pipeline.py
@@ -291,6 +291,7 @@ def run_pipeline(
 def main():
     """Main function with command line interface."""
     parser = argparse.ArgumentParser(description="Run StringSight pipeline on full datasets")
+    default_llm_model = "openai/gpt-4.1"
 
     # Dataset and output
     parser.add_argument("--data_path", type=str, required=True,
@@ -345,11 +346,17 @@ def main():
         ))
     parser.add_argument("--score_columns", nargs="+", type=str, default=None,
                         help="Optional list of column names containing score metrics (e.g., accuracy, helpfulness)")
-    parser.add_argument("--extraction_model", type=str, default=None,
-                        help="Model for property extraction (e.g., gpt-4.1)")
-    parser.add_argument("--summary_model", type=str, default=None,
-                        help="Model for cluster summarization (e.g., gpt-4.1)")
-    parser.add_argument("--cluster_assignment_model", type=str, default=None,
+    parser.add_argument("--extraction_model", type=str, default=default_llm_model,
+                        help=(
+                            "Model for property extraction "
+                            f"(default: {default_llm_model}; e.g., gpt-4.1)"
+                        ))
+    parser.add_argument("--summary_model", type=str, default=default_llm_model,
+                        help=(
+                            "Model for cluster summarization "
+                            f"(default: {default_llm_model}; e.g., gpt-4.1)"
+                        ))
+    parser.add_argument("--cluster_assignment_model", type=str, default="gpt-5-nano",
                         help="Model for cluster matching/assignment (e.g., gpt-4.1-mini)")
 
     args = parser.parse_args()
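The change above replaces three `default=None` flags with a single shared default echoed in each help string. That pattern can be sketched in isolation; the flag names mirror the diff, but this is a standalone illustration, not StringSight's actual CLI:

```python
import argparse

# One shared constant keeps the model flags consistent and lets the
# f-string help text report the real default.
DEFAULT_LLM_MODEL = "openai/gpt-4.1"

parser = argparse.ArgumentParser(description="Model-flag default sketch")
parser.add_argument(
    "--extraction_model", type=str, default=DEFAULT_LLM_MODEL,
    help=f"Model for property extraction (default: {DEFAULT_LLM_MODEL})",
)
parser.add_argument(
    "--summary_model", type=str, default=DEFAULT_LLM_MODEL,
    help=f"Model for cluster summarization (default: {DEFAULT_LLM_MODEL})",
)
parser.add_argument(
    "--cluster_assignment_model", type=str, default="gpt-5-nano",
    help="Model for cluster matching/assignment",
)

# Defaults apply only when a flag is omitted; explicit flags override them.
args = parser.parse_args(["--summary_model", "gpt-4.1-mini"])
print(args.extraction_model)  # falls back to the shared default
print(args.summary_model)     # overridden on the command line
```

Compared with `default=None` plus downstream fallbacks, putting the default in `add_argument` means `--help` and the parsed namespace always agree on the effective model.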