Skip to content

Commit 1b36eac

Browse files
committed
updated stats, re-ran ALL models + analyses
1 parent fd3100a commit 1b36eac

File tree

256 files changed

+701829
-701179
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

256 files changed

+701829
-701179
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,4 +26,6 @@ tests/data/*.pkl
2626
!tests/data/test_model_results.pkl
2727

2828
# Temporary test files
29-
.test_credentials
29+
.test_credentials
30+
models/*/model.safetensors
31+
models/*/training_state.pt

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,9 @@ pip install -e .
110110
# Train models from scratch (requires GPU)
111111
./run_llm_stylometry.sh -t
112112

113+
# Compute statistical analyses (Table 1 and key statistics)
114+
./run_stats.sh
115+
113116
# Custom data and output paths
114117
./run_llm_stylometry.sh -d path/to/model_results.pkl -o path/to/output
115118

@@ -168,6 +171,20 @@ fig = generate_all_losses_figure(
168171
- **4**: Figure 4 - 3D MDS plot (3d_MDS_plot.pdf)
169172
- **5**: Figure 5 - Oz authorship analysis (oz_losses.pdf)
170173

174+
### Statistical Analysis
175+
176+
Generate key statistics from the paper:
177+
178+
```bash
179+
# Compute statistical analyses
180+
./run_stats.sh
181+
```
182+
183+
This produces:
184+
- **Twain p-threshold analysis**: Epoch where Twain model first achieves p < 0.001
185+
- **Average t-test**: One-sample t-test (against zero) of the per-seed average t-statistics, computed at epoch 500
186+
- **Table 1**: Individual author model t-tests comparing self vs. other losses
187+
171188
## Training Models from Scratch
172189

173190
**Note**: Training requires a CUDA-enabled GPU and takes significant time (~80 models total).

code/compute_stats.py

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
#!/usr/bin/env python
2+
"""
3+
Compute statistics for LLM stylometry paper reproduction.
4+
"""
5+
6+
import pickle
7+
import pandas as pd
8+
import numpy as np
9+
from scipy import stats
10+
from pathlib import Path
11+
from constants import AUTHORS
12+
13+
def load_data(path='data/model_results.pkl'):
    """
    Load the model results data.

    Args:
        path: Location of the pickled results file. Defaults to
            'data/model_results.pkl', which is resolved against the current
            working directory.

    Returns:
        The object stored in the pickle file. The rest of this script indexes
        it like a pandas DataFrame.
    """
    # NOTE: unpickling can execute arbitrary code, so only load result files
    # that come from a trusted source.
    with open(path, 'rb') as f:
        return pickle.load(f)
17+
18+
19+
def find_twain_threshold_epoch(df, p_threshold=0.001):
    """
    Locate the earliest epoch at which the Twain model separates its own text.

    Epochs are visited in ascending order. At each one, the losses of the
    Twain-trained models on every other author's text are compared with their
    losses on Twain text using Welch's t-test (unequal variances), and the
    first epoch whose p-value falls below *p_threshold* is reported.

    Returns:
        (epoch, t_stat, p_value) for the first qualifying epoch, or
        (None, None, None) if no epoch qualifies.

    NOTE(review): the p-value is two-sided and the sign of the t-statistic is
    not checked, so a significant difference in either direction ends the
    search -- confirm that this is intended.
    """
    target = 'twain'
    rivals = [name for name in AUTHORS if name != target]

    # Restrict to models trained on Twain, then precompute the dataset masks
    # once instead of re-deriving them for every epoch.
    rows = df[df['train_author'] == target].copy()
    on_own_text = rows['loss_dataset'] == target
    on_rival_text = rows['loss_dataset'].isin(rivals)

    for epoch in sorted(rows['epochs_completed'].unique()):
        at_epoch = rows['epochs_completed'] == epoch
        own = rows.loc[at_epoch & on_own_text, 'loss_value'].values
        rival = rows.loc[at_epoch & on_rival_text, 'loss_value'].values

        # Skip epochs without a full set of measurements.
        # Presumably 10 seeds and 10 seeds x 7 other authors -- TODO confirm.
        if len(own) < 10 or len(rival) < 70:
            continue

        # Argument order (other, self) makes t positive when the model's loss
        # is lower on Twain text than on the other authors' text.
        t_stat, p_value = stats.ttest_ind(rival, own, equal_var=False)
        if p_value < p_threshold:
            return epoch, t_stat, p_value

    return None, None, None
48+
49+
50+
def compute_average_t_test(df, epoch=500):
    """
    One-sample t-test of the per-seed average t-statistics against 0.

    For each seed (0-9) and each author in AUTHORS, the model named
    "{author}_tokenizer=gpt2_seed={seed}" is looked up. For every evaluation
    dataset, the row with the largest ``epochs_completed`` that does not exceed
    *epoch* is selected. A t-statistic comparing the model's losses on other
    authors with its loss on its own author is computed, and the statistics
    are averaged over authors within the seed. The ten per-seed averages are
    then tested against 0.

    Per the original author's note, this reproduces the test on line 230 of
    the paper.

    Args:
        df: Results table with the columns 'model_name', 'epochs_completed',
            'loss_dataset' and 'loss_value'.
        epoch: Latest epoch to consider for each model.

    Returns:
        (t_stat, p_value, degrees_of_freedom), or (None, None, None) when a
        complete average could not be computed for all ten seeds.
    """
    n_seeds = 10
    seed_avg_t_stats = []

    for seed in range(n_seeds):
        author_t_stats = []

        for author in AUTHORS:
            model_name = f"{author}_tokenizer=gpt2_seed={seed}"
            model_df = df[df['model_name'] == model_name]

            # Take, per evaluation dataset, the latest row at or before `epoch`.
            # Sort explicitly: tail(1) picks the last row in frame order, so
            # without the sort the result would depend on how df is ordered.
            eligible = model_df[model_df['epochs_completed'] <= epoch]
            epoch_data = (
                eligible.sort_values('epochs_completed', kind='stable')
                .groupby('loss_dataset')
                .tail(1)
            )

            self_losses = epoch_data[epoch_data['loss_dataset'] == author]['loss_value'].values

            other_authors = [a for a in AUTHORS if a != author]
            other_losses = epoch_data[epoch_data['loss_dataset'].isin(other_authors)]['loss_value'].values

            if len(self_losses) == 0 or len(other_losses) == 0:
                continue

            if len(self_losses) == 1:
                # A single self loss has no variance, so Welch's test is
                # undefined. Treat the self loss as a fixed reference value and
                # compute a one-sample t-statistic of the other losses
                # against it.
                if len(other_losses) < 2:
                    continue  # sample standard deviation is undefined
                # ddof=1: a t-statistic uses the sample standard deviation.
                # When every model has the same number of other-author losses
                # this rescales all per-model values by one constant, which
                # leaves the final one-sample test below unchanged.
                std_other = np.std(other_losses, ddof=1)
                if std_other > 0:
                    mean_diff = np.mean(other_losses) - self_losses[0]
                    t_stat = mean_diff / (std_other / np.sqrt(len(other_losses)))
                    author_t_stats.append(t_stat)
            else:
                t_stat, _ = stats.ttest_ind(other_losses, self_losses, equal_var=False)
                if not np.isnan(t_stat):
                    author_t_stats.append(t_stat)

        # Only keep seeds for which every author contributed a statistic.
        if len(author_t_stats) == len(AUTHORS):
            seed_avg_t_stats.append(np.mean(author_t_stats))

    # Test whether the mean of the per-seed averages differs from 0.
    if len(seed_avg_t_stats) == n_seeds:
        t_stat, p_value = stats.ttest_1samp(seed_avg_t_stats, 0)
        return t_stat, p_value, len(seed_avg_t_stats) - 1

    return None, None, None
101+
102+
103+
def generate_author_comparison_table(df):
    """
    Generate table of t-tests comparing each author's model losses.

    For every author, the final-epoch losses of that author's models on the
    other authors' texts are compared with their losses on the author's own
    text using Welch's t-test (unequal variances). Per the original author's
    note, this reproduces Table 1 in the paper.

    Args:
        df: Results table with the columns 'train_author', 'loss_dataset',
            'seed', 'epochs_completed' and 'loss_value'.

    Returns:
        A DataFrame with the string-formatted columns 'Model', 't-stat', 'df'
        and 'p-value', one row per author that has enough data. The frame is
        empty (but keeps these columns) when no author qualifies.
    """
    columns = ['Model', 't-stat', 'df', 'p-value']

    # Final-epoch row for each (training author, evaluation dataset, seed).
    # Sort explicitly: tail(1) picks the last row in frame order, so without
    # the sort "final" would depend on how df happens to be ordered.
    final_df = (
        df.sort_values('epochs_completed', kind='stable')
        .groupby(['train_author', 'loss_dataset', 'seed'])
        .tail(1)
    )

    # Use the same author order as in the figures
    author_order = ['baum', 'thompson', 'austen', 'dickens', 'fitzgerald', 'melville', 'twain', 'wells']

    results = []
    for author in author_order:
        author_df = final_df[final_df['train_author'] == author]

        # Self losses: model trained on author, tested on the same author.
        self_losses = author_df[author_df['loss_dataset'] == author]['loss_value'].values

        # Other losses: model trained on author, tested on other authors.
        other_authors = [a for a in AUTHORS if a != author]
        other_losses = author_df[author_df['loss_dataset'].isin(other_authors)]['loss_value'].values

        # Authors without a full set of measurements are left out of the table.
        if len(self_losses) < 10 or len(other_losses) < 70:
            continue

        # Argument order (other, self) makes t positive when the model's loss
        # is lower on its own author than on the others.
        t_result = stats.ttest_ind(other_losses, self_losses, equal_var=False)

        results.append({
            'Model': author.capitalize(),
            't-stat': f'{t_result.statistic:.2f}',
            'df': f'{t_result.df:.2f}',
            'p-value': f'{t_result.pvalue:.2e}',
        })

    return pd.DataFrame(results, columns=columns)
137+
138+
139+
def main():
    """
    Compute and display all statistics.

    Loads the results table, then prints three sections to stdout: the first
    epoch at which the Twain model reaches p < 0.001, the one-sample t-test of
    the per-seed average t-statistics at epoch 500, and the per-author
    comparison table (Table 1).
    """
    print("=" * 60)
    print("LLM Stylometry Statistical Analysis")
    print("=" * 60)

    # Load data
    print("\nLoading data...")
    df = load_data()

    # 1. Find Twain threshold epoch
    print("\n1. Twain Model P-Threshold Analysis")
    print("-" * 40)
    epoch, t_stat, p_value = find_twain_threshold_epoch(df)
    if epoch is not None:
        print(f"First epoch where p < 0.001: {epoch}")
        print(f"t-statistic at epoch {epoch}: {t_stat:.3f}")
        print(f"p-value at epoch {epoch}: {p_value:.3e}")
    else:
        print("Threshold not reached within training epochs")

    # 2. Average t-test at final epoch
    print("\n2. Average T-Test Across Authors (Epoch 500)")
    print("-" * 40)
    t_stat, p_value, df_val = compute_average_t_test(df, epoch=500)
    if t_stat is not None:
        print(f"t({df_val}) = {t_stat:.3f}, p = {p_value:.2e}")

        # Spell out very small p-values as mantissa x 10^exponent.
        # The lower bound matters: a p-value that underflows to exactly 0.0
        # would make log10 return -inf and int() raise OverflowError.
        if 0 < p_value < 1e-10:
            exponent = int(np.floor(np.log10(p_value)))
            mantissa = p_value / (10 ** exponent)
            print(f"(p-value in scientific notation: {mantissa:.1f} × 10^{exponent})")
    else:
        print("Insufficient data for t-test")

    # 3. Author comparison table
    print("\n3. Author Model Comparison Table (Table 1)")
    print("-" * 40)
    table = generate_author_comparison_table(df)
    print("\n" + table.to_string(index=False))

    print("\n" + "=" * 60)
182+
183+
184+
if __name__ == "__main__":
185+
main()

data/model_results.pkl

22.3 KB
Binary file not shown.

models/austen_tokenizer=gpt2_seed=0/config.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
],
66
"attn_pdrop": 0.1,
77
"bos_token_id": 50256,
8+
"dtype": "float32",
89
"embd_pdrop": 0.1,
910
"eos_token_id": 50256,
1011
"initializer_range": 0.02,
@@ -24,8 +25,7 @@
2425
"summary_proj_to_labels": true,
2526
"summary_type": "cls_index",
2627
"summary_use_proj": true,
27-
"torch_dtype": "float32",
28-
"transformers_version": "4.45.2",
28+
"transformers_version": "4.56.1",
2929
"use_cache": true,
3030
"vocab_size": 50257
3131
}

models/austen_tokenizer=gpt2_seed=0/generation_config.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@
22
"_from_model_config": true,
33
"bos_token_id": 50256,
44
"eos_token_id": 50256,
5-
"transformers_version": "4.45.2"
5+
"transformers_version": "4.56.1"
66
}

0 commit comments

Comments
 (0)