Commit 5b573bb

Add LaTeX table output and fix variant stats computation
Changes to code/compute_stats.py:
1. generate_author_comparison_table() now returns (DataFrame, LaTeX_string)
2. LaTeX table formatted exactly as specified, with proper scientific notation
3. Fixed compute_average_t_test() to work with variants (see the sketch below):
   - Was constructing baseline model names only
   - Now filters by the train_author and seed columns (works for all variants)

Changes to run_stats.sh:
- Now uses the correct data path for each variant (data/model_results_{variant}.pkl)

Results:
- Average t-test now works for all variants (was showing "Insufficient data")
- LaTeX tables generated automatically for easy paper inclusion
- All 4 conditions supported: baseline, content, function, pos

Related to #33
1 parent d58ae9a commit 5b573bb
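The core of the compute_average_t_test() fix is a change in how rows are selected. Here is a minimal before/after sketch in pandas, assuming only that the results frame carries the model_name, train_author, and seed columns used in the diff below; the toy rows and the variant model name are illustrative, not the real schema:

```python
import pandas as pd

# Toy stand-in for data/model_results*.pkl -- the real frames carry more columns.
df = pd.DataFrame({
    "model_name": ["austen_tokenizer=gpt2_seed=0", "austen_pos_seed=0"],  # second name is hypothetical
    "train_author": ["austen", "austen"],
    "seed": [0, 0],
})
author, seed = "austen", 0

# Old approach: reconstruct the baseline naming scheme -- variant models never match.
old_rows = df[df["model_name"] == f"{author}_tokenizer=gpt2_seed={seed}"]

# New approach: filter on the columns directly -- every variant is matched.
new_rows = df[(df["train_author"] == author) & (df["seed"] == seed)]

print(len(old_rows), len(new_rows))  # 1 2 -- the variant row is no longer dropped
```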

6 files changed (+79, -22 lines)

code/compute_stats.py

Lines changed: 52 additions & 6 deletions
@@ -83,8 +83,8 @@ def compute_average_t_test(df, epoch=500):
 
         for author in AUTHORS:
             # Get all data for this author-seed combination
-            model_name = f"{author}_tokenizer=gpt2_seed={seed}"
-            model_df = df[df['model_name'] == model_name]
+            # Filter by author and seed columns (works for both baseline and variants)
+            model_df = df[(df['train_author'] == author) & (df['seed'] == seed)]
 
             # Get data at the specified epoch (or closest if not exact)
             epoch_data = model_df[model_df['epochs_completed'] <= epoch].groupby('loss_dataset').tail(1)
@@ -126,6 +126,9 @@ def generate_author_comparison_table(df):
     """
     Generate table of t-tests comparing each author's model losses.
     This reproduces Table 1 in the paper.
+
+    Returns:
+        tuple: (pandas DataFrame, LaTeX string)
     """
     # Get final epoch data
     final_df = df.groupby(['train_author', 'loss_dataset', 'seed']).tail(1)
@@ -152,10 +155,46 @@ def generate_author_comparison_table(df):
             'Model': author.capitalize(),
             't-stat': f'{t_result.statistic:.2f}',
             'df': f'{t_result.df:.2f}',
-            'p-value': f'{t_result.pvalue:.2e}'
+            'p-value': f'{t_result.pvalue:.2e}',
+            't_stat_val': t_result.statistic,
+            'df_val': t_result.df,
+            'p_val': t_result.pvalue
         })
 
-    return pd.DataFrame(results)
+    df_table = pd.DataFrame(results)
+
+    # Generate LaTeX table
+    latex_lines = [
+        "\\begin{table}[h]",
+        "\\centering",
+        "\\small",
+        "\\begin{tabular}{lccc}",
+        "\\hline",
+        "\\textbf{Model} & \\textbf{$t$-stat} & \\textbf{df} & \\textbf{$p$-value}\\\\",
+        "\\hline"
+    ]
+
+    for _, row in df_table.iterrows():
+        # Format p-value in scientific notation
+        p_val = row['p_val']
+        if p_val < 0.01:
+            exponent = int(np.floor(np.log10(p_val)))
+            mantissa = p_val / (10 ** exponent)
+            p_str = f"${mantissa:.2f} \\times 10^{{{exponent}}}$"
+        else:
+            p_str = f"${p_val:.4f}$"
+
+        latex_lines.append(
+            f"{row['Model']:<12} & {row['t_stat_val']:.2f} & {row['df_val']:.2f} & {p_str} \\\\"
+        )
+
+    latex_lines.append("\\hline")
+    latex_lines.append("\\end{tabular}")
+    latex_lines.append("\\end{table}")
+
+    latex_table = "\n".join(latex_lines)
+
+    return df_table, latex_table
 
 
 def main():
@@ -216,8 +255,15 @@ def main():
     # 3. Author comparison table
     print("\n3. Author Model Comparison Table (Table 1)")
     print("-" * 40)
-    table = generate_author_comparison_table(df)
-    print("\n" + table.to_string(index=False))
+    table, latex_table = generate_author_comparison_table(df)
+
+    # Display DataFrame table
+    print("\n" + table[['Model', 't-stat', 'df', 'p-value']].to_string(index=False))
+
+    # Display LaTeX table
+    print("\n\nLaTeX Table Format:")
+    print("-" * 40)
+    print(latex_table)
 
     print("\n" + "=" * 60)
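As a sanity check on the LaTeX formatting added above, here is the mantissa/exponent split in isolation. format_p is a hypothetical standalone helper, not a function in the module, but it mirrors the p-value branch in generate_author_comparison_table():

```python
import numpy as np

def format_p(p_val: float) -> str:
    # Scientific notation below 0.01, fixed-point otherwise (as in the diff above).
    if p_val < 0.01:
        exponent = int(np.floor(np.log10(p_val)))  # e.g. 3.2e-5 -> -5
        mantissa = p_val / (10 ** exponent)        # rescaled into [1, 10)
        return f"${mantissa:.2f} \\times 10^{{{exponent}}}$"
    return f"${p_val:.4f}$"

print(format_p(3.2e-5))  # $3.20 \times 10^{-5}$
print(format_p(0.0314))  # $0.0314$
```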

paper/main.pdf

259 Bytes
Binary file not shown.

paper/main.tex

Lines changed: 12 additions & 12 deletions
@@ -25,10 +25,11 @@
 
 \title{A Stylometric Application of Large Language Models}
 
-\author{Harrison F. Stropkay, Jiayi Chen, Daniel N. Rockmore, and Jeremy R. Manning\\
+\author{Harrison F. Stropkay, Jiayi Chen, Mohammad J. L. Jabelli,\\
+Daniel N. Rockmore, and Jeremy R. Manning\\
 Dartmouth College \\
 Hanover, NH 03755, USA \\
-\texttt{\{harrison.f.stropkay.25, jiayi.chen.gr, }\\\texttt{daniel.n.rockmore, jeremy.r.manning\}@dartmouth.edu}}
+\texttt{\{harrison.f.stropkay.25, jiayi.chen.gr, mohammad.javad.latifi.jebelli}\\\texttt{daniel.n.rockmore, jeremy.r.manning\}@dartmouth.edu}}
 
 \begin{document}
 \maketitle
@@ -167,21 +168,20 @@ \subsection{Model architecture, training, and evaluation}
 and to ensure that the models are not overfitting to a specific book or random
 sample.
 
-\subsubsection{Investigating the contributions of function words, content words, and parts of speech}
+\subsubsection{Investigating the contributions of function words, content
+words, and parts of speech}
 
 In order to investigate the contributions of different types of words to the
 stylometric signatures captured by our models, we carried out additional
 analyses using modified corpora. First, we created content-word-only corpora by
 replacing all function words with a special token, \texttt{<FUNC>}. Function
-words were identified using scikit-learn's list of English stop words~\citep{PedrEtal11}.
-Next, we created function-word-only corpora by replacing all content (i.e.,
-non-function) words with a \texttt{<CONTENT>} token. Finally, we created
-part-of-speech-only corpora by using the Natural Language Toolkit~\citep[NLTK; ][]{BirdLope04} to
-replace each word with its corresponding part-of-speech tag. We then re-trained
-our models on each of these modified corpora, following the same methodology as
-described above.
-
+words were identified using scikit-learn's list of English stop
+words~\citep{PedrEtal11}. Next, we created function-word-only corpora by
+replacing all content (i.e., non-function) words with a \texttt{<CONTENT>}
+token. Finally, we created part-of-speech-only corpora by using the Natural
+Language Toolkit~\citep[NLTK; ][]{BirdLope04} to replace each word with its
+corresponding part-of-speech tag. We then re-trained our models on each of
+these modified corpora, following the same methodology as described above.
 
 \begin{figure*}[t]
 \centering

paper/supplement.pdf

261 Bytes
Binary file not shown.

paper/supplement.tex

Lines changed: 3 additions & 2 deletions
@@ -18,10 +18,11 @@
 
 \title{\textit{Supplementary materials for}: A Stylometric Application of Large Language Models}
 
-\author{Harrison F. Stropkay, Jiayi Chen, Daniel N. Rockmore, and Jeremy R. Manning\\
+\author{Harrison F. Stropkay, Jiayi Chen, Mohammad J. L. Jabelli,\\
+Daniel N. Rockmore, and Jeremy R. Manning\\
 Dartmouth College \\
 Hanover, NH 03755, USA \\
-\texttt{\{harrison.f.stropkay.25, jiayi.chen.gr, }\\\texttt{daniel.n.rockmore, jeremy.r.manning\}@dartmouth.edu}}
+\texttt{\{harrison.f.stropkay.25, jiayi.chen.gr, mohammad.javad.latifi.jebelli}\\\texttt{daniel.n.rockmore, jeremy.r.manning\}@dartmouth.edu}}
 
 \date{}

run_stats.sh

Lines changed: 12 additions & 2 deletions
@@ -121,10 +121,20 @@ for variant in "${VARIANTS[@]}"; do
     echo
     if [ "$variant" == "baseline" ]; then
         print_info "Computing baseline statistics..."
-        python code/compute_stats.py --data "$DATA_PATH"
+        VARIANT_DATA_PATH="data/model_results.pkl"
+        if [ ! -f "$VARIANT_DATA_PATH" ]; then
+            print_error "Baseline data not found: $VARIANT_DATA_PATH"
+            continue
+        fi
+        python code/compute_stats.py --data "$VARIANT_DATA_PATH"
     else
         print_info "Computing statistics for $variant variant..."
-        python code/compute_stats.py --data "$DATA_PATH" --variant "$variant"
+        VARIANT_DATA_PATH="data/model_results_${variant}.pkl"
+        if [ ! -f "$VARIANT_DATA_PATH" ]; then
+            print_error "$variant data not found: $VARIANT_DATA_PATH"
+            continue
+        fi
+        python code/compute_stats.py --data "$VARIANT_DATA_PATH" --variant "$variant"
     fi
     echo
 done
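The shell change encodes a simple path convention. A Python mirror of the same resolution logic, for reference only; the helper below is illustrative and not part of the repository:

```python
from pathlib import Path

VARIANTS = ["baseline", "content", "function", "pos"]

def variant_data_path(variant: str) -> Path:
    # Same convention run_stats.sh now uses: baseline results live in
    # data/model_results.pkl, each variant in data/model_results_{variant}.pkl.
    if variant == "baseline":
        return Path("data/model_results.pkl")
    return Path(f"data/model_results_{variant}.pkl")

for variant in VARIANTS:
    path = variant_data_path(variant)
    if not path.is_file():
        # run_stats.sh prints an error and skips the variant rather than aborting.
        print(f"{variant} data not found: {path}")
```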
