
Commit 8a7a4d7

Merge pull request #37 from jeremymanning/main
README simplification and security improvements (Issue #35)
2 parents: e4401df + 285dd05

File tree: 9 files changed (+290 −568 lines)


README.md

Lines changed: 42 additions & 541 deletions
Large diffs are not rendered by default.

check_remote_status.sh

Lines changed: 11 additions & 5 deletions
@@ -7,7 +7,7 @@
 # completed models and estimates for in-progress training.
 #
 # Usage:
-#   ./check_remote_status.sh [--cluster tensor01|tensor02]
+#   ./check_remote_status.sh --cluster CLUSTER_NAME
 #
 
 set -e
@@ -30,7 +30,7 @@ print_success() {
 }
 
 # Default cluster
-CLUSTER="tensor02"
+CLUSTER=""  # Must be specified with --cluster flag
 
 # Parse arguments
 while [[ $# -gt 0 ]]; do
@@ -45,12 +45,11 @@ while [[ $# -gt 0 ]]; do
             echo "Check training status on remote GPU server"
             echo ""
             echo "Options:"
-            echo "  --cluster NAME    Select cluster: tensor01 or tensor02 (default: tensor02)"
+            echo "  --cluster NAME    Select cluster (required)"
             echo "  -h, --help        Show this help message"
             echo ""
             echo "Examples:"
-            echo "  $0                       # Check status on tensor02 (default)"
-            echo "  $0 --cluster tensor01    # Check status on tensor01"
+            echo "  $0 --cluster mycluster   # Check status on mycluster"
             exit 0
             ;;
         *)
@@ -61,6 +60,13 @@ while [[ $# -gt 0 ]]; do
     esac
 done
 
+# Validate cluster is specified
+if [ -z "$CLUSTER" ]; then
+    print_error "Cluster must be specified with --cluster flag"
+    echo "Example: $0 --cluster mycluster"
+    exit 1
+fi
+
 # Read credentials from config file
 CRED_FILE=".ssh/credentials_${CLUSTER}.json"
 if [ -f "$CRED_FILE" ]; then
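The scripts in this commit read connection details from `.ssh/credentials_${CLUSTER}.json`, but the diff does not show that file's schema. A minimal sketch of creating one follows; the `hostname` and `username` field names are assumptions, not taken from this commit, so check how the scripts parse the file for the actual keys:

```bash
# Sketch only: the field names below are assumptions, not confirmed by this commit.
mkdir -p .ssh
cat > .ssh/credentials_mycluster.json <<'EOF'
{
  "hostname": "gpu.example.edu",
  "username": "your_user"
}
EOF
chmod 600 .ssh/credentials_mycluster.json   # keep credentials private
```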

code/README.md

Lines changed: 36 additions & 0 deletions
# Code Directory

Python scripts for model training, analysis, and figure generation.

## Main Scripts

### Training and Figures
- **generate_figures.py** - Main CLI for training and figure generation
- **main.py** - Model training orchestration with parallel GPU support

### Data Processing
- **clean.py** - Preprocess Project Gutenberg texts (remove headers/footers)
- **create_analysis_variants.py** - Generate variant-transformed texts (content, function, POS)
- **consolidate_model_results.py** - Combine model loss logs into single pkl file

### Analysis
- **compute_stats.py** - Statistical analysis: t-tests, threshold crossings, cross-variant comparisons
- **check_training_status.py** - Remote training progress monitoring

### Utilities
- **constants.py** - Shared constants (authors, hyperparameters)

## Usage

Most scripts are called through shell wrappers (see main README):
- `./run_llm_stylometry.sh` → generate_figures.py
- `./run_stats.sh` → compute_stats.py
- `./check_remote_status.sh` → check_training_status.py

Run scripts directly for advanced usage:
```bash
python code/generate_figures.py --help
python code/compute_stats.py --help
```

See main README for complete documentation and examples.
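For example, with the change in check_remote_status.sh above, the status wrapper must now be given an explicit cluster name ("mycluster" is a placeholder for your own credentials file):

```bash
# Check remote training status; requires .ssh/credentials_mycluster.json
./check_remote_status.sh --cluster mycluster
```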

code/generate_figures.py

Lines changed: 41 additions & 10 deletions
@@ -167,7 +167,7 @@ def train_models(max_gpus=None, no_confirm=False, resume=False, variant=None):
 
 
 def generate_figure(figure_name, data_path='data/model_results.pkl', output_dir='paper/figs/source', variant=None):
-    """Generate a specific figure."""
+    """Generate a specific figure (main or supplemental)."""
     from llm_stylometry.visualization import (
         generate_all_losses_figure,
         generate_stripplot_figure,
@@ -178,6 +178,7 @@ def generate_figure(figure_name, data_path='data/model_results.pkl', output_dir=
         generate_oz_losses_figure
     )
 
+    # Main figures (baseline)
     figure_map = {
         '1a': ('all_losses', generate_all_losses_figure, 'all_losses.pdf'),
         '1b': ('stripplot', generate_stripplot_figure, 'stripplot.pdf'),
@@ -188,9 +189,30 @@ def generate_figure(figure_name, data_path='data/model_results.pkl', output_dir=
         '5': ('oz', generate_oz_losses_figure, 'oz_losses.pdf'),
     }
 
+    # Supplemental figures (variants)
+    # S1-S3: Figure 1 variants, S4-S6: Figure 2 variants, S7-S8: Figures 3-4 variants
+    supplemental_map = {
+        's1a': ('1a', 'content'), 's1b': ('1b', 'content'),
+        's2a': ('1a', 'function'), 's2b': ('1b', 'function'),
+        's3a': ('1a', 'pos'), 's3b': ('1b', 'pos'),
+        's4a': ('2a', 'content'), 's4b': ('2b', 'content'),
+        's5a': ('2a', 'function'), 's5b': ('2b', 'function'),
+        's6a': ('2a', 'pos'), 's6b': ('2b', 'pos'),
+        's7a': ('3', 'content'), 's7b': ('3', 'function'), 's7c': ('3', 'pos'),
+        's8a': ('4', 'content'), 's8b': ('4', 'function'), 's8c': ('4', 'pos'),
+    }
+
+    # Check if it's a supplemental figure
+    if figure_name.lower() in supplemental_map:
+        main_fig, supp_variant = supplemental_map[figure_name.lower()]
+        # Override variant parameter
+        variant = supp_variant
+        figure_name = main_fig
+        safe_print(f"Supplemental Figure {figure_name.upper()}: {supp_variant} variant")
+
     if figure_name not in figure_map:
         safe_print(f"Unknown figure: {figure_name}")
-        safe_print(f"Available figures: {', '.join(figure_map.keys())}")
+        safe_print(f"Available: {', '.join(figure_map.keys())} or supplemental: {', '.join(supplemental_map.keys())}")
         return False
 
     # Skip Figure 5 for variants with clear message
@@ -377,14 +399,23 @@ def main():
     args = parser.parse_args()
 
     if args.list:
-        safe_print("\nAvailable figures:")
-        safe_print("  1a - Figure 1A: Training curves (all_losses.pdf)")
-        safe_print("  1b - Figure 1B: Strip plot (stripplot.pdf)")
-        safe_print("  2a - Figure 2A: Individual t-tests (t_test.pdf)")
-        safe_print("  2b - Figure 2B: Average t-test (t_test_avg.pdf)")
-        safe_print("  3  - Figure 3: Confusion matrix heatmap (average_loss_heatmap.pdf)")
-        safe_print("  4  - Figure 4: 3D MDS plot (3d_MDS_plot.pdf)")
-        safe_print("  5  - Figure 5: Oz authorship analysis (oz_losses.pdf) [baseline only]")
+        safe_print("\nMain Figures (baseline):")
+        safe_print("  1a - Figure 1A: Training curves")
+        safe_print("  1b - Figure 1B: Strip plot")
+        safe_print("  2a - Figure 2A: Individual t-tests")
+        safe_print("  2b - Figure 2B: Average t-test")
+        safe_print("  3  - Figure 3: Confusion matrix heatmap")
+        safe_print("  4  - Figure 4: 3D MDS plot")
+        safe_print("  5  - Figure 5: Oz authorship analysis")
+        safe_print("\nSupplemental Figures (variants):")
+        safe_print("  s1a, s1b - Supp. Fig. 1: Content-only (Figs 1A, 1B)")
+        safe_print("  s2a, s2b - Supp. Fig. 2: Function-only (Figs 1A, 1B)")
+        safe_print("  s3a, s3b - Supp. Fig. 3: POS (Figs 1A, 1B)")
+        safe_print("  s4a, s4b - Supp. Fig. 4: Content-only (Figs 2A, 2B)")
+        safe_print("  s5a, s5b - Supp. Fig. 5: Function-only (Figs 2A, 2B)")
+        safe_print("  s6a, s6b - Supp. Fig. 6: POS (Figs 2A, 2B)")
+        safe_print("  s7a-c    - Supp. Fig. 7: Confusion matrices (all variants)")
+        safe_print("  s8a-c    - Supp. Fig. 8: MDS plots (all variants)")
         return 0
 
     safe_print(format_header("LLM Stylometry CLI", 60))
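With the mapping above, supplemental figures can be requested by their `s*` names. A quick sketch using the repository's wrapper; the `-f` flag is the one documented in paper/README below, and the exact flag for invoking generate_figures.py directly is not shown in this diff:

```bash
# Supplemental Figure 3A: POS variant of Figure 1A (via the documented -f flag)
./run_llm_stylometry.sh -f s3a
```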

data/README.md

Lines changed: 56 additions & 0 deletions
# Data Directory

Contains text data and consolidated model results.

## Structure

```
data/
├── raw/                         # Original Project Gutenberg texts (with headers/footers)
├── cleaned/                     # Preprocessed texts (headers/footers removed)
│   ├── {author}/                # One directory per author
│   ├── content_only/            # Function words masked as FUNC
│   ├── function_only/           # Content words masked as CONTENT
│   ├── pos_only/                # Words replaced with POS tags
│   ├── contested/               # Disputed authorship texts (Baum/Thompson)
│   ├── non_oz_baum/             # Non-Oz works by Baum
│   └── non_oz_thompson/         # Non-Oz works by Thompson
├── model_results.pkl            # Consolidated baseline results
├── model_results_content.pkl    # Content-only variant results
├── model_results_function.pkl   # Function-only variant results
├── model_results_pos.pkl        # POS variant results
└── classifier_results/          # Text classification results (gitignored)
    ├── baseline.pkl
    ├── content.pkl
    ├── function.pkl
    └── pos.pkl
```

## Authors

8 authors with 7-14 books each (84 books total):
- Austen (7 books)
- Baum (14 books)
- Dickens (14 books)
- Fitzgerald (8 books)
- Melville (10 books)
- Thompson (13 books)
- Twain (6 books)
- Wells (12 books)

## Creating Variant Data

Generate variant-transformed texts:
```bash
python code/create_analysis_variants.py all
```

## Consolidating Model Results

Combine loss logs from all models into single pkl file:
```bash
python code/consolidate_model_results.py                     # Baseline
python code/consolidate_model_results.py --variant content   # Content-only
```

See main README for data sources and preprocessing details.
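A quick way to confirm a consolidated results file loads after running the consolidation step (a sketch; the object's internal structure is not documented here, so only its type is printed):

```bash
# Load the consolidated baseline results and report the top-level type
python -c "import pickle; r = pickle.load(open('data/model_results.pkl', 'rb')); print(type(r))"
```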

models/README.md

Lines changed: 38 additions & 0 deletions
# Models Directory

Contains 320 trained GPT-2 models (80 per condition: baseline, content-only, function-only, POS).

## Directory Naming

**Baseline:** `{author}_tokenizer=gpt2_seed={0-9}/`
**Variants:** `{author}_variant={variant}_tokenizer=gpt2_seed={0-9}/`

Examples:
- `baum_tokenizer=gpt2_seed=0/` (baseline)
- `austen_variant=content_tokenizer=gpt2_seed=5/` (content-only)

## File Contents

Each directory contains:
- `config.json`, `generation_config.json` - Model configuration
- `loss_logs.csv` - Training/evaluation losses per epoch
- `model.safetensors` - Model weights (~32MB, gitignored)
- `training_state.pt` - Optimizer state (~65MB, gitignored)

**Note:** Weight files are gitignored due to size. See issue #36 for downloading pre-trained weights.

## Training Models

Train locally:
```bash
./run_llm_stylometry.sh --train        # Baseline
./run_llm_stylometry.sh --train -co    # Content-only
```

Train remotely on GPU cluster:
```bash
./remote_train.sh                          # Baseline
./remote_train.sh -co --cluster tensor02   # Content-only on tensor02
```

See main README for full training documentation.
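Given the naming convention above, here is a rough sketch for counting locally available model directories per condition (it assumes the directories exist under `models/` and match the documented patterns exactly):

```bash
# Baseline directories: match the seed pattern but contain no "variant="
ls -d models/*_tokenizer=gpt2_seed=*/ 2>/dev/null | grep -vc 'variant='

# Content-only variant directories
ls -d models/*_variant=content_tokenizer=gpt2_seed=*/ 2>/dev/null | wc -l
```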

paper/README.md

Lines changed: 43 additions & 0 deletions
# Paper Directory

LaTeX source files and compiled PDFs for the paper.

## Main Files

- **main.tex** - Main paper
- **main.pdf** - Compiled main paper
- **supplement.tex** - Supplemental material
- **supplement.pdf** - Compiled supplement
- **custom.bib** - Bibliography

## Figures

- **figs/source/** - Generated figures (PDFs from analysis)
  - Main figures: all_losses.pdf, stripplot.pdf, t_test.pdf, etc.
  - Variant figures: *_content.pdf, *_function.pdf, *_pos.pdf
  - Classification: classification_accuracy.pdf, wordcloud_*.pdf
- **figs/** - Additional figures and compiled multi-panel figures

## Compilation

Compile with standard LaTeX tools:
```bash
cd paper
pdflatex main.tex
bibtex main
pdflatex main.tex
pdflatex main.tex
```
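If `latexmk` is installed, the same pdflatex/bibtex cycle can be driven with a single command (an optional alternative, not part of this commit):

```bash
cd paper
latexmk -pdf main.tex   # reruns pdflatex/bibtex until references stabilize
```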
Or use your preferred LaTeX editor (Overleaf, TeXShop, etc.).

## Figure Generation

Figures are generated from model results:
```bash
# From repository root
./run_llm_stylometry.sh          # Generate all figures (all variants)
./run_llm_stylometry.sh -f 1a    # Generate specific figure
```

See main README for complete figure generation documentation.

remote_train.sh

Lines changed: 10 additions & 2 deletions
@@ -83,15 +83,15 @@ echo "  -co, --content-only    Train content-only variant"
 echo "  -fo, --function-only   Train function-only variant"
 echo "  -pos, --part-of-speech Train part-of-speech variant"
 echo "  -g, --max-gpus NUM     Maximum number of GPUs to use (default: 4)"
-echo "  --cluster NAME         Select cluster: tensor01 or tensor02 (default: tensor02)"
+echo "  --cluster NAME         Select cluster (required: specify your cluster name)"
 echo
 
 # Parse command line arguments
 KILL_MODE=false
 RESUME_MODE=false
 VARIANT_ARG=""
 MAX_GPUS=""
-CLUSTER="tensor02"  # Default cluster
+CLUSTER=""  # Must be specified with --cluster flag
 
 while [[ $# -gt 0 ]]; do
     case $1 in
@@ -136,6 +136,14 @@ while [[ $# -gt 0 ]]; do
     esac
 done
 
+# Validate cluster is specified
+if [ -z "$CLUSTER" ]; then
+    print_error "Cluster must be specified with --cluster flag"
+    echo "Example: $0 --cluster mycluster"
+    echo "Create credentials file: .ssh/credentials_mycluster.json"
+    exit 1
+fi
+
 # Get server details - try to read from cluster-specific credentials file first
 CRED_FILE=".ssh/credentials_${CLUSTER}.json"
 if [ -f "$CRED_FILE" ]; then
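With the validation above in place, a remote training run must name its cluster explicitly. An example invocation using only the flags documented in the script's help text ("mycluster" is a placeholder for your own credentials file):

```bash
# Baseline remote training, capped at 2 GPUs
./remote_train.sh --cluster mycluster -g 2
```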

sync_models.sh

Lines changed: 13 additions & 10 deletions
@@ -22,7 +22,7 @@ SYNC_BASELINE=false
 SYNC_CONTENT=false
 SYNC_FUNCTION=false
 SYNC_POS=false
-CLUSTER="tensor02"  # Default cluster
+CLUSTER=""  # Must be specified with --cluster flag
 
 # Parse command line arguments (stackable)
 while [[ $# -gt 0 ]]; do
@@ -63,17 +63,12 @@ while [[ $# -gt 0 ]]; do
             echo "  -fo, --function-only   Sync function-only variant models"
             echo "  -pos, --part-of-speech Sync part-of-speech variant models"
             echo "  -a, --all              Sync all models (baseline + all variants)"
-            echo "  --cluster CLUSTER      Specify cluster (tensor01 or tensor02, default: tensor02)"
+            echo "  --cluster CLUSTER      Specify cluster (required)"
             echo "  -h, --help             Show this help message"
             echo ""
-            echo "Flags are stackable. Examples:"
-            echo "  $0                        # Sync baseline only (default, tensor02)"
-            echo "  $0 -b -co                 # Sync baseline and content-only"
-            echo "  $0 -fo -pos               # Sync function-only and POS"
-            echo "  $0 -a                     # Sync everything"
-            echo "  $0 -a --cluster tensor01  # Sync everything from tensor01"
-            echo ""
-            echo "Default: Sync baseline models only from tensor02"
+            echo "Examples:"
+            echo "  $0 --cluster mycluster -a        # Sync everything from mycluster"
+            echo "  $0 --cluster gpucluster -b -co   # Sync baseline and content-only"
             exit 0
             ;;
         *)
@@ -89,6 +84,14 @@ if [ "$SYNC_BASELINE" = false ] && [ "$SYNC_CONTENT" = false ] && [ "$SYNC_FUNCT
     SYNC_BASELINE=true
 fi
 
+# Validate cluster is specified
+if [ -z "$CLUSTER" ]; then
+    print_error "Cluster must be specified with --cluster flag"
+    echo "Example: $0 --cluster mycluster -a"
+    echo "Create credentials file: .ssh/credentials_mycluster.json"
+    exit 1
+fi
+
 echo "=================================================="
 echo "  LLM Stylometry Model Sync"
 echo "=================================================="
