
Commit 8a7a4d7

Merge pull request #37 from jeremymanning/main
README simplification and security improvements (Issue #35)
2 parents: e4401df + 285dd05

File tree: 9 files changed (+290 −568 lines)


README.md

Lines changed: 42 additions & 541 deletions
Large diffs are not rendered by default.

check_remote_status.sh

Lines changed: 11 additions & 5 deletions
@@ -7,7 +7,7 @@
 # completed models and estimates for in-progress training.
 #
 # Usage:
-#   ./check_remote_status.sh [--cluster tensor01|tensor02]
+#   ./check_remote_status.sh --cluster CLUSTER_NAME
 #
 
 set -e
@@ -30,7 +30,7 @@ print_success() {
 }
 
 # Default cluster
-CLUSTER="tensor02"
+CLUSTER=""  # Must be specified with --cluster flag
 
 # Parse arguments
 while [[ $# -gt 0 ]]; do
@@ -45,12 +45,11 @@ while [[ $# -gt 0 ]]; do
             echo "Check training status on remote GPU server"
             echo ""
             echo "Options:"
-            echo "  --cluster NAME    Select cluster: tensor01 or tensor02 (default: tensor02)"
+            echo "  --cluster NAME    Select cluster (required)"
             echo "  -h, --help        Show this help message"
             echo ""
             echo "Examples:"
-            echo "  $0                       # Check status on tensor02 (default)"
-            echo "  $0 --cluster tensor01    # Check status on tensor01"
+            echo "  $0 --cluster mycluster   # Check status on mycluster"
             exit 0
             ;;
         *)
@@ -61,6 +60,13 @@ while [[ $# -gt 0 ]]; do
     esac
 done
 
+# Validate cluster is specified
+if [ -z "$CLUSTER" ]; then
+    print_error "Cluster must be specified with --cluster flag"
+    echo "Example: $0 --cluster mycluster"
+    exit 1
+fi
+
 # Read credentials from config file
 CRED_FILE=".ssh/credentials_${CLUSTER}.json"
 if [ -f "$CRED_FILE" ]; then
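The scripts in this commit read connection details from `.ssh/credentials_${CLUSTER}.json`, but the diff does not show that file's schema. A minimal sketch of creating one follows; the `hostname` and `username` field names are assumptions, not taken from this commit, so check how the scripts parse the file for the actual keys:

```bash
# Sketch only: the field names below are assumptions, not confirmed by this commit.
mkdir -p .ssh
cat > .ssh/credentials_mycluster.json <<'EOF'
{
  "hostname": "gpu.example.edu",
  "username": "your_user"
}
EOF
chmod 600 .ssh/credentials_mycluster.json   # keep credentials private
```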

code/README.md

Lines changed: 36 additions & 0 deletions
# Code Directory

Python scripts for model training, analysis, and figure generation.

## Main Scripts

### Training and Figures
- **generate_figures.py** - Main CLI for training and figure generation
- **main.py** - Model training orchestration with parallel GPU support

### Data Processing
- **clean.py** - Preprocess Project Gutenberg texts (remove headers/footers)
- **create_analysis_variants.py** - Generate variant-transformed texts (content, function, POS)
- **consolidate_model_results.py** - Combine model loss logs into single pkl file

### Analysis
- **compute_stats.py** - Statistical analysis: t-tests, threshold crossings, cross-variant comparisons
- **check_training_status.py** - Remote training progress monitoring

### Utilities
- **constants.py** - Shared constants (authors, hyperparameters)

## Usage

Most scripts are called through shell wrappers (see main README):
- `./run_llm_stylometry.sh` → generate_figures.py
- `./run_stats.sh` → compute_stats.py
- `./check_remote_status.sh` → check_training_status.py

Run scripts directly for advanced usage:
```bash
python code/generate_figures.py --help
python code/compute_stats.py --help
```

See main README for complete documentation and examples.
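For example, with the change in check_remote_status.sh above, the status wrapper must now be given an explicit cluster name ("mycluster" is a placeholder for your own credentials file):

```bash
# Check remote training status; requires .ssh/credentials_mycluster.json
./check_remote_status.sh --cluster mycluster
```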

code/generate_figures.py

Lines changed: 41 additions & 10 deletions
@@ -167,7 +167,7 @@ def train_models(max_gpus=None, no_confirm=False, resume=False, variant=None):
 
 
 def generate_figure(figure_name, data_path='data/model_results.pkl', output_dir='paper/figs/source', variant=None):
-    """Generate a specific figure."""
+    """Generate a specific figure (main or supplemental)."""
     from llm_stylometry.visualization import (
         generate_all_losses_figure,
         generate_stripplot_figure,
@@ -178,6 +178,7 @@ def generate_figure(figure_name, data_path='data/model_results.pkl', output_dir=
         generate_oz_losses_figure
     )
 
+    # Main figures (baseline)
     figure_map = {
         '1a': ('all_losses', generate_all_losses_figure, 'all_losses.pdf'),
         '1b': ('stripplot', generate_stripplot_figure, 'stripplot.pdf'),
@@ -188,9 +189,30 @@ def generate_figure(figure_name, data_path='data/model_results.pkl', output_dir=
         '5': ('oz', generate_oz_losses_figure, 'oz_losses.pdf'),
     }
 
+    # Supplemental figures (variants)
+    # S1-S3: Figure 1 variants, S4-S6: Figure 2 variants, S7-S8: Figures 3-4 variants
+    supplemental_map = {
+        's1a': ('1a', 'content'), 's1b': ('1b', 'content'),
+        's2a': ('1a', 'function'), 's2b': ('1b', 'function'),
+        's3a': ('1a', 'pos'), 's3b': ('1b', 'pos'),
+        's4a': ('2a', 'content'), 's4b': ('2b', 'content'),
+        's5a': ('2a', 'function'), 's5b': ('2b', 'function'),
+        's6a': ('2a', 'pos'), 's6b': ('2b', 'pos'),
+        's7a': ('3', 'content'), 's7b': ('3', 'function'), 's7c': ('3', 'pos'),
+        's8a': ('4', 'content'), 's8b': ('4', 'function'), 's8c': ('4', 'pos'),
+    }
+
+    # Check if it's a supplemental figure
+    if figure_name.lower() in supplemental_map:
+        main_fig, supp_variant = supplemental_map[figure_name.lower()]
+        # Override variant parameter
+        variant = supp_variant
+        figure_name = main_fig
+        safe_print(f"Supplemental Figure {figure_name.upper()}: {supp_variant} variant")
+
     if figure_name not in figure_map:
         safe_print(f"Unknown figure: {figure_name}")
-        safe_print(f"Available figures: {', '.join(figure_map.keys())}")
+        safe_print(f"Available: {', '.join(figure_map.keys())} or supplemental: {', '.join(supplemental_map.keys())}")
         return False
 
     # Skip Figure 5 for variants with clear message
@@ -377,14 +399,23 @@ def main():
     args = parser.parse_args()
 
     if args.list:
-        safe_print("\nAvailable figures:")
-        safe_print("  1a - Figure 1A: Training curves (all_losses.pdf)")
-        safe_print("  1b - Figure 1B: Strip plot (stripplot.pdf)")
-        safe_print("  2a - Figure 2A: Individual t-tests (t_test.pdf)")
-        safe_print("  2b - Figure 2B: Average t-test (t_test_avg.pdf)")
-        safe_print("  3  - Figure 3: Confusion matrix heatmap (average_loss_heatmap.pdf)")
-        safe_print("  4  - Figure 4: 3D MDS plot (3d_MDS_plot.pdf)")
-        safe_print("  5  - Figure 5: Oz authorship analysis (oz_losses.pdf) [baseline only]")
+        safe_print("\nMain Figures (baseline):")
+        safe_print("  1a - Figure 1A: Training curves")
+        safe_print("  1b - Figure 1B: Strip plot")
+        safe_print("  2a - Figure 2A: Individual t-tests")
+        safe_print("  2b - Figure 2B: Average t-test")
+        safe_print("  3  - Figure 3: Confusion matrix heatmap")
+        safe_print("  4  - Figure 4: 3D MDS plot")
+        safe_print("  5  - Figure 5: Oz authorship analysis")
+        safe_print("\nSupplemental Figures (variants):")
+        safe_print("  s1a, s1b - Supp. Fig. 1: Content-only (Figs 1A, 1B)")
+        safe_print("  s2a, s2b - Supp. Fig. 2: Function-only (Figs 1A, 1B)")
+        safe_print("  s3a, s3b - Supp. Fig. 3: POS (Figs 1A, 1B)")
+        safe_print("  s4a, s4b - Supp. Fig. 4: Content-only (Figs 2A, 2B)")
+        safe_print("  s5a, s5b - Supp. Fig. 5: Function-only (Figs 2A, 2B)")
+        safe_print("  s6a, s6b - Supp. Fig. 6: POS (Figs 2A, 2B)")
+        safe_print("  s7a-c    - Supp. Fig. 7: Confusion matrices (all variants)")
+        safe_print("  s8a-c    - Supp. Fig. 8: MDS plots (all variants)")
         return 0
 
     safe_print(format_header("LLM Stylometry CLI", 60))
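With the mapping above, supplemental figures can be requested by their `s*` names. A quick sketch using the repository's wrapper; the `-f` flag is the one documented in paper/README below, and the exact flag for invoking generate_figures.py directly is not shown in this diff:

```bash
# Supplemental Figure 3A: POS variant of Figure 1A (via the documented -f flag)
./run_llm_stylometry.sh -f s3a
```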

data/README.md

Lines changed: 56 additions & 0 deletions
# Data Directory

Contains text data and consolidated model results.

## Structure

```
data/
├── raw/                         # Original Project Gutenberg texts (with headers/footers)
├── cleaned/                     # Preprocessed texts (headers/footers removed)
│   ├── {author}/                # One directory per author
│   ├── content_only/            # Function words masked as FUNC
│   ├── function_only/           # Content words masked as CONTENT
│   ├── pos_only/                # Words replaced with POS tags
│   ├── contested/               # Disputed authorship texts (Baum/Thompson)
│   ├── non_oz_baum/             # Non-Oz works by Baum
│   └── non_oz_thompson/         # Non-Oz works by Thompson
├── model_results.pkl            # Consolidated baseline results
├── model_results_content.pkl    # Content-only variant results
├── model_results_function.pkl   # Function-only variant results
├── model_results_pos.pkl        # POS variant results
└── classifier_results/          # Text classification results (gitignored)
    ├── baseline.pkl
    ├── content.pkl
    ├── function.pkl
    └── pos.pkl
```

## Authors

8 authors with 7-14 books each (84 books total):
- Austen (7 books)
- Baum (14 books)
- Dickens (14 books)
- Fitzgerald (8 books)
- Melville (10 books)
- Thompson (13 books)
- Twain (6 books)
- Wells (12 books)

## Creating Variant Data

Generate variant-transformed texts:
```bash
python code/create_analysis_variants.py all
```

## Consolidating Model Results

Combine loss logs from all models into single pkl file:
```bash
python code/consolidate_model_results.py                     # Baseline
python code/consolidate_model_results.py --variant content   # Content-only
```

See main README for data sources and preprocessing details.
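A quick way to confirm a consolidated results file loads after running the consolidation step (a sketch; the object's internal structure is not documented here, so only its type is printed):

```bash
# Load the consolidated baseline results and report the top-level type
python -c "import pickle; r = pickle.load(open('data/model_results.pkl', 'rb')); print(type(r))"
```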

models/README.md

Lines changed: 38 additions & 0 deletions
# Models Directory

Contains 320 trained GPT-2 models (80 per condition: baseline, content-only, function-only, POS).

## Directory Naming

**Baseline:** `{author}_tokenizer=gpt2_seed={0-9}/`
**Variants:** `{author}_variant={variant}_tokenizer=gpt2_seed={0-9}/`

Examples:
- `baum_tokenizer=gpt2_seed=0/` (baseline)
- `austen_variant=content_tokenizer=gpt2_seed=5/` (content-only)

## File Contents

Each directory contains:
- `config.json`, `generation_config.json` - Model configuration
- `loss_logs.csv` - Training/evaluation losses per epoch
- `model.safetensors` - Model weights (~32MB, gitignored)
- `training_state.pt` - Optimizer state (~65MB, gitignored)

**Note:** Weight files are gitignored due to size. See issue #36 for downloading pre-trained weights.

## Training Models

Train locally:
```bash
./run_llm_stylometry.sh --train        # Baseline
./run_llm_stylometry.sh --train -co    # Content-only
```

Train remotely on GPU cluster:
```bash
./remote_train.sh                          # Baseline
./remote_train.sh -co --cluster tensor02   # Content-only on tensor02
```

See main README for full training documentation.
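Given the naming convention above, here is a rough sketch for counting locally available model directories per condition (it assumes the directories exist under `models/` and match the documented patterns exactly):

```bash
# Baseline directories: match the seed pattern but contain no "variant="
ls -d models/*_tokenizer=gpt2_seed=*/ 2>/dev/null | grep -vc 'variant='

# Content-only variant directories
ls -d models/*_variant=content_tokenizer=gpt2_seed=*/ 2>/dev/null | wc -l
```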

paper/README.md

Lines changed: 43 additions & 0 deletions
# Paper Directory

LaTeX source files and compiled PDFs for the paper.

## Main Files

- **main.tex** - Main paper
- **main.pdf** - Compiled main paper
- **supplement.tex** - Supplemental material
- **supplement.pdf** - Compiled supplement
- **custom.bib** - Bibliography

## Figures

- **figs/source/** - Generated figures (PDFs from analysis)
  - Main figures: all_losses.pdf, stripplot.pdf, t_test.pdf, etc.
  - Variant figures: *_content.pdf, *_function.pdf, *_pos.pdf
  - Classification: classification_accuracy.pdf, wordcloud_*.pdf
- **figs/** - Additional figures and compiled multi-panel figures

## Compilation

Compile with standard LaTeX tools:
```bash
cd paper
pdflatex main.tex
bibtex main
pdflatex main.tex
pdflatex main.tex
```
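If `latexmk` is installed, the same pdflatex/bibtex cycle can be driven with a single command (an optional alternative, not part of this commit):

```bash
cd paper
latexmk -pdf main.tex   # reruns pdflatex/bibtex until references stabilize
```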
Or use your preferred LaTeX editor (Overleaf, TeXShop, etc.).

## Figure Generation

Figures are generated from model results:
```bash
# From repository root
./run_llm_stylometry.sh          # Generate all figures (all variants)
./run_llm_stylometry.sh -f 1a    # Generate specific figure
```

See main README for complete figure generation documentation.

remote_train.sh

Lines changed: 10 additions & 2 deletions
@@ -83,15 +83,15 @@ echo "  -co, --content-only    Train content-only variant"
 echo "  -fo, --function-only   Train function-only variant"
 echo "  -pos, --part-of-speech Train part-of-speech variant"
 echo "  -g, --max-gpus NUM     Maximum number of GPUs to use (default: 4)"
-echo "  --cluster NAME         Select cluster: tensor01 or tensor02 (default: tensor02)"
+echo "  --cluster NAME         Select cluster (required: specify your cluster name)"
 echo
 
 # Parse command line arguments
 KILL_MODE=false
 RESUME_MODE=false
 VARIANT_ARG=""
 MAX_GPUS=""
-CLUSTER="tensor02"  # Default cluster
+CLUSTER=""  # Must be specified with --cluster flag
 
 while [[ $# -gt 0 ]]; do
     case $1 in
@@ -136,6 +136,14 @@ while [[ $# -gt 0 ]]; do
     esac
 done
 
+# Validate cluster is specified
+if [ -z "$CLUSTER" ]; then
+    print_error "Cluster must be specified with --cluster flag"
+    echo "Example: $0 --cluster mycluster"
+    echo "Create credentials file: .ssh/credentials_mycluster.json"
+    exit 1
+fi
+
 # Get server details - try to read from cluster-specific credentials file first
 CRED_FILE=".ssh/credentials_${CLUSTER}.json"
 if [ -f "$CRED_FILE" ]; then
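With the validation above in place, a remote training run must name its cluster explicitly. An example invocation using only the flags documented in the script's help text ("mycluster" is a placeholder for your own credentials file):

```bash
# Baseline remote training, capped at 2 GPUs
./remote_train.sh --cluster mycluster -g 2
```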

sync_models.sh

Lines changed: 13 additions & 10 deletions
@@ -22,7 +22,7 @@ SYNC_BASELINE=false
 SYNC_CONTENT=false
 SYNC_FUNCTION=false
 SYNC_POS=false
-CLUSTER="tensor02"  # Default cluster
+CLUSTER=""  # Must be specified with --cluster flag
 
 # Parse command line arguments (stackable)
 while [[ $# -gt 0 ]]; do
@@ -63,17 +63,12 @@ while [[ $# -gt 0 ]]; do
             echo "  -fo, --function-only   Sync function-only variant models"
             echo "  -pos, --part-of-speech Sync part-of-speech variant models"
             echo "  -a, --all              Sync all models (baseline + all variants)"
-            echo "  --cluster CLUSTER      Specify cluster (tensor01 or tensor02, default: tensor02)"
+            echo "  --cluster CLUSTER      Specify cluster (required)"
             echo "  -h, --help             Show this help message"
             echo ""
-            echo "Flags are stackable. Examples:"
-            echo "  $0                        # Sync baseline only (default, tensor02)"
-            echo "  $0 -b -co                 # Sync baseline and content-only"
-            echo "  $0 -fo -pos               # Sync function-only and POS"
-            echo "  $0 -a                     # Sync everything"
-            echo "  $0 -a --cluster tensor01  # Sync everything from tensor01"
-            echo ""
-            echo "Default: Sync baseline models only from tensor02"
+            echo "Examples:"
+            echo "  $0 --cluster mycluster -a        # Sync everything from mycluster"
+            echo "  $0 --cluster gpucluster -b -co   # Sync baseline and content-only"
             exit 0
             ;;
         *)
@@ -89,6 +84,14 @@ if [ "$SYNC_BASELINE" = false ] && [ "$SYNC_CONTENT" = false ] && [ "$SYNC_FUNCT
     SYNC_BASELINE=true
 fi
 
+# Validate cluster is specified
+if [ -z "$CLUSTER" ]; then
+    print_error "Cluster must be specified with --cluster flag"
+    echo "Example: $0 --cluster mycluster -a"
+    echo "Create credentials file: .ssh/credentials_mycluster.json"
+    exit 1
+fi
+
 echo "=================================================="
 echo "  LLM Stylometry Model Sync"
 echo "=================================================="
