Enhance statistical analysis with comprehensive variant comparisons

jeremymanning · jeremymanning · commit e5c3c927f249 · 2025-10-21T10:41:37.000-04:00
New features in code/compute_stats.py:

1. Automatic threshold crossing detection per author
   - Detects which authors cross the p < 0.001 threshold and when
   - No longer hard-coded to Twain (which was special-cased in the baseline)
   - Shows all 8 authors individually
2. Average t-statistic threshold crossing
   - Reports when the average across all authors crosses the threshold
   - Complements the individual author analysis
3. Cross-variant pairwise comparisons (--cross-variant-comparison)
   - T-tests comparing t-value distributions between all variant pairs
   - Shows which conditions differ significantly
   - 6 pairwise comparisons: baseline vs content, baseline vs function, etc.
4. LaTeX table output
   - Formatted exactly as specified for paper inclusion
   - Scientific notation for small p-values

run_stats.sh updates:
- Automatically triggers cross-variant comparison when --all is used
- Fixed variant data path handling

Example output:
  Baseline: 7 authors cross at epoch 1-2, Twain at epoch 77
  Average crosses at epoch 1
  Cross-variant: baseline differs from all (p < 0.01)

Related to #33
diff --git a/code/compute_stats.py b/code/compute_stats.py
@@ -38,33 +38,88 @@ def load_data(data_path='data/model_results.pkl', variant=None):
     return df
 
 
-def find_twain_threshold_epoch(df, p_threshold=0.001):
+def find_threshold_crossing_epochs(df, p_threshold=0.001):
     """
-    Find the epoch where Twain model's p-value first drops below threshold.
-    This corresponds to t-threshold of 3.291 for p < 0.001.
+    Find epochs where each author's p-value crosses below threshold.
+    Detects authors that start above threshold and later cross below it.
+
+    Returns:
+        dict: {author: (epoch, t_stat, p_value)} for authors that cross threshold
+    """
+    crossing_authors = {}
+
+    for author in AUTHORS:
+        author_df = df[df['train_author'] == author].copy()
+        epochs = sorted(author_df['epochs_completed'].unique())
+
+        # Track if we've seen above-threshold epochs before crossing
+        seen_above_threshold = False
+
+        for epoch in epochs:
+            epoch_df = author_df[author_df['epochs_completed'] == epoch]
+
+            # Get self losses
+            self_losses = epoch_df[epoch_df['loss_dataset'] == author]['loss_value'].values
+
+            # Get other losses
+            other_authors = [a for a in AUTHORS if a != author]
+            other_losses = epoch_df[epoch_df['loss_dataset'].isin(other_authors)]['loss_value'].values
+
+            if len(self_losses) >= 10 and len(other_losses) >= 70:
+                # Perform t-test (other vs self)
+                t_stat, p_value = stats.ttest_ind(other_losses, self_losses, equal_var=False)
+
+                if p_value >= p_threshold:
+                    seen_above_threshold = True
+                elif seen_above_threshold and p_value < p_threshold:
+                    # Crossed threshold!
+                    crossing_authors[author] = (epoch, t_stat, p_value)
+                    break
+
+    return crossing_authors
+
+
+def find_average_threshold_crossing(df, p_threshold=0.001):
+    """
+    Find epoch where average t-statistic across all authors crosses threshold.
+
+    Returns:
+        tuple: (epoch, avg_t_stat, p_value) or (None, None, None)
     """
-    # Filter for Twain models comparing Twain vs other authors
-    twain_df = df[df['train_author'] == 'twain'].copy()
+    epochs = sorted(df['epochs_completed'].unique())
 
-    # Get unique epochs sorted
-    epochs = sorted(twain_df['epochs_completed'].unique())
+    seen_above_threshold = False
 
     for epoch in epochs:
-        epoch_df = twain_df[twain_df['epochs_completed'] == epoch]
+        # Compute average t-statistic across all authors at this epoch
+        author_t_stats = []
 
-        # Get self losses (Twain model on Twain text)
-        self_losses = epoch_df[epoch_df['loss_dataset'] == 'twain']['loss_value'].values
+        for author in AUTHORS:
+            author_df = df[(df['train_author'] == author) & (df['epochs_completed'] == epoch)]
 
-        # Get other losses (Twain model on other authors' texts)
-        other_authors = [a for a in AUTHORS if a != 'twain']
-        other_losses = epoch_df[epoch_df['loss_dataset'].isin(other_authors)]['loss_value'].values
+            # Get self and other losses
+            self_losses = author_df[author_df['loss_dataset'] == author]['loss_value'].values
+            other_authors = [a for a in AUTHORS if a != author]
+            other_losses = author_df[author_df['loss_dataset'].isin(other_authors)]['loss_value'].values
 
-        if len(self_losses) >= 10 and len(other_losses) >= 70:
-            # Perform t-test (other vs self)
-            t_stat, p_value = stats.ttest_ind(other_losses, self_losses, equal_var=False)
+            if len(self_losses) > 0 and len(other_losses) > 0:
+                # Standardized mean difference (mean gap / pooled std); has no sample-size term, so it is not a true t-statistic
+                mean_diff = np.mean(other_losses) - np.mean(self_losses)
+                pooled_std = np.sqrt((np.var(other_losses) + np.var(self_losses)) / 2)
+                if pooled_std > 0:
+                    t_stat = mean_diff / pooled_std
+                    author_t_stats.append(t_stat)
+
+        if len(author_t_stats) == len(AUTHORS):
+            avg_t = np.mean(author_t_stats)
+            # One-sample t-test: is average t-stat significantly > 0?
+            t_result = stats.ttest_1samp(author_t_stats, 0)
+            p_value = t_result.pvalue / 2  # One-tailed
 
-            if p_value < p_threshold:
-                return epoch, t_stat, p_value
+            if p_value >= p_threshold:
+                seen_above_threshold = True
+            elif seen_above_threshold and p_value < p_threshold:
+                return epoch, avg_t, p_value
 
     return None, None, None
 
@@ -197,6 +252,73 @@ def generate_author_comparison_table(df):
     return df_table, latex_table
 
 
+def compute_cross_variant_comparisons(all_variant_data, epoch=500):
+    """
+    Compare t-value distributions across variants at the given epoch (default 500).
+
+    Args:
+        all_variant_data: dict of {variant_name: DataFrame}
+        epoch: Epoch to compare at (default: 500)
+
+    Returns:
+        DataFrame with pairwise t-test results
+    """
+    from itertools import combinations
+
+    # Extract per-author t-values for each variant at the requested epoch
+    variant_t_values = {}
+
+    for variant_name, df in all_variant_data.items():
+        t_values = []
+
+        for author in AUTHORS:
+            # Get this author's data at the requested epoch
+            author_df = df[(df['train_author'] == author) & (df['epochs_completed'] == epoch)]
+
+            # Get self and other losses
+            self_losses = author_df[author_df['loss_dataset'] == author]['loss_value'].values
+            other_authors = [a for a in AUTHORS if a != author]
+            other_losses = author_df[author_df['loss_dataset'].isin(other_authors)]['loss_value'].values
+
+            if len(self_losses) > 0 and len(other_losses) > 0:
+                # Compute t-statistic
+                if len(self_losses) == 1:
+                    mean_diff = np.mean(other_losses) - self_losses[0]
+                    std_other = np.std(other_losses)
+                    if std_other > 0:
+                        t_stat = mean_diff / (std_other / np.sqrt(len(other_losses)))
+                        t_values.append(t_stat)
+                else:
+                    t_stat, _ = stats.ttest_ind(other_losses, self_losses, equal_var=False)
+                    if not np.isnan(t_stat):
+                        t_values.append(t_stat)
+
+        variant_t_values[variant_name] = t_values
+
+    # Pairwise comparisons
+    results = []
+    variant_names = list(all_variant_data.keys())
+
+    for var1, var2 in combinations(variant_names, 2):
+        if var1 in variant_t_values and var2 in variant_t_values:
+            t_vals_1 = variant_t_values[var1]
+            t_vals_2 = variant_t_values[var2]
+
+            if len(t_vals_1) >= 2 and len(t_vals_2) >= 2:
+                # T-test comparing distributions
+                t_result = stats.ttest_ind(t_vals_1, t_vals_2, equal_var=False)
+
+                results.append({
+                    'Comparison': f'{var1} vs {var2}',
+                    't-stat': f'{t_result.statistic:.2f}',
+                    'df': f'{t_result.df:.2f}',
+                    'p-value': f'{t_result.pvalue:.2e}',
+                    'mean_diff': f'{np.mean(t_vals_1) - np.mean(t_vals_2):.2f}'
+                })
+
+    return pd.DataFrame(results)
+
+
 def main():
     """Main function to compute and display all statistics."""
     import argparse
@@ -213,9 +335,50 @@ def main():
         default='data/model_results.pkl',
         help='Path to model results file (default: data/model_results.pkl)'
     )
+    parser.add_argument(
+        '--cross-variant-comparison',
+        action='store_true',
+        help='Compute pairwise comparisons across all variants'
+    )
 
     args = parser.parse_args()
 
+    # Handle cross-variant comparison mode
+    if args.cross_variant_comparison:
+        print("=" * 60)
+        print("Cross-Variant Comparison Analysis")
+        print("=" * 60)
+
+        # Load all variant data
+        all_variant_data = {}
+        for var_name, var_key in [('baseline', None), ('content', 'content'), ('function', 'function'), ('pos', 'pos')]:
+            pkl_file = f"data/model_results.pkl" if var_key is None else f"data/model_results_{var_key}.pkl"
+            if Path(pkl_file).exists():
+                all_variant_data[var_name] = load_data(pkl_file, var_key)
+            else:
+                print(f"Warning: {pkl_file} not found, skipping {var_name}")
+
+        if len(all_variant_data) < 2:
+            print("Error: Need at least 2 variants for comparison")
+            return
+
+        print(f"\nLoaded {len(all_variant_data)} conditions: {list(all_variant_data.keys())}")
+
+        # Compute pairwise comparisons
+        print("\nPairwise T-Test Comparisons (Epoch 500)")
+        print("Comparing distributions of t-statistics across all authors")
+        print("-" * 60)
+
+        comparison_df = compute_cross_variant_comparisons(all_variant_data, epoch=500)
+
+        if not comparison_df.empty:
+            print("\n" + comparison_df.to_string(index=False))
+        else:
+            print("No comparisons could be computed")
+
+        print("\n" + "=" * 60)
+        return
+
     # Update header to show variant
     variant_label = f" (Variant: {args.variant})" if args.variant else " (Baseline)"
     print("=" * 60)
@@ -226,19 +389,33 @@ def main():
     print("\nLoading data...")
     df = load_data(data_path=args.data, variant=args.variant)
 
-    # 1. Find Twain threshold epoch
-    print("\n1. Twain Model P-Threshold Analysis")
+    # 1. Find threshold crossing epochs per author
+    print("\n1. Individual Author Threshold Crossings (p < 0.001)")
+    print("-" * 40)
+    crossing_authors = find_threshold_crossing_epochs(df)
+    if crossing_authors:
+        for author in AUTHORS:
+            if author in crossing_authors:
+                epoch, t_stat, p_value = crossing_authors[author]
+                print(f"{author.capitalize():<12}: Epoch {epoch:3d} (t={t_stat:.2f}, p={p_value:.2e})")
+            else:
+                print(f"{author.capitalize():<12}: No threshold crossing detected")
+    else:
+        print("No authors crossed threshold (started below or never crossed)")
+
+    # 2. Average t-statistic threshold crossing
+    print("\n2. Average T-Statistic Threshold Crossing (p < 0.001)")
     print("-" * 40)
-    epoch, t_stat, p_value = find_twain_threshold_epoch(df)
+    epoch, avg_t, p_value = find_average_threshold_crossing(df)
     if epoch is not None:
-        print(f"First epoch where p < 0.001: {epoch}")
-        print(f"t-statistic at epoch {epoch}: {t_stat:.3f}")
-        print(f"p-value at epoch {epoch}: {p_value:.3e}")
+        print(f"Average t-stat crossed threshold at epoch: {epoch}")
+        print(f"Average t-statistic: {avg_t:.3f}")
+        print(f"p-value: {p_value:.2e}")
     else:
-        print("Threshold not reached within training epochs")
+        print("Average t-statistic did not cross threshold")
 
-    # 2. Average t-test at final epoch
-    print("\n2. Average T-Test Across Authors (Epoch 500)")
+    # 3. Average t-test at final epoch
+    print("\n3. Average T-Test Across Authors (Epoch 500)")
     print("-" * 40)
     t_stat, p_value, df_val = compute_average_t_test(df, epoch=500)
     if t_stat is not None:
@@ -252,8 +429,8 @@ def main():
     else:
         print("Insufficient data for t-test")
 
-    # 3. Author comparison table
-    print("\n3. Author Model Comparison Table (Table 1)")
+    # 4. Author comparison table
+    print("\n4. Author Model Comparison Table (Table 1)")
     print("-" * 40)
     table, latex_table = generate_author_comparison_table(df)
 
diff --git a/run_stats.sh b/run_stats.sh
@@ -139,4 +139,12 @@ for variant in "${VARIANTS[@]}"; do
     echo
 done
 
+# If exactly 4 variants were selected (e.g. via --all), compute cross-variant comparisons
+if [ ${#VARIANTS[@]} -eq 4 ]; then
+    echo
+    print_info "Computing cross-variant comparisons..."
+    python code/compute_stats.py --cross-variant-comparison
+    echo
+fi
+
 print_success "Statistical analysis complete!"