
Commit fd3100a

Merge pull request #10 from jeremymanning/main
Add --resume flag for training continuation
2 parents: f945450 + 4d73595

7 files changed: +245 −35 lines


README.md

Lines changed: 19 additions & 1 deletion
@@ -131,6 +131,9 @@ python generate_figures.py --figure 1a
 # Train models from scratch
 python generate_figures.py --train

+# Resume training from existing checkpoints
+python generate_figures.py --train --resume
+
 # List available figures
 python generate_figures.py --list
 ```
@@ -175,6 +178,9 @@ fig = generate_all_losses_figure(
 # Using the CLI (recommended - handles all steps automatically)
 ./run_llm_stylometry.sh --train

+# Resume training from existing checkpoints
+./run_llm_stylometry.sh --train --resume
+
 # Limit GPU usage if needed
 ./run_llm_stylometry.sh --train --max-gpus 4
 ```
@@ -184,6 +190,12 @@ This command will:
 2. Train all 80 models (8 authors × 10 seeds)
 3. Consolidate results into `data/model_results.pkl`

+**Resume Training**: The `--resume` flag allows you to continue training from existing checkpoints:
+- Models that have already met training criteria are automatically skipped
+- Partially trained models with saved weights resume from their last checkpoint
+- Models without weights are trained from scratch (even if loss logs exist)
+- Random states are restored from checkpoints to ensure consistent training continuation
+
 The training pipeline automatically handles data preparation, model training across available GPUs, and result consolidation. Individual model checkpoints and loss logs are saved in the `models/` directory.

 ### Remote Training on GPU Server
@@ -226,15 +238,21 @@ Once Git credentials are configured on your server, run `remote_train.sh` **from
 # From your local machine, start training on the remote GPU server
 ./remote_train.sh

+# Resume training from existing checkpoints
+./remote_train.sh --resume # or -r
+
 # Kill existing training sessions and optionally start new one
 ./remote_train.sh --kill # or -k

+# Kill and resume (restart interrupted training)
+./remote_train.sh --kill --resume
+
 # You'll be prompted for:
 # - Server address (hostname or IP)
 # - Username
 ```

-**What this script does:** The `remote_train.sh` script connects to your GPU server via SSH and executes `run_llm_stylometry.sh --train -y` in a `screen` session. This allows you to disconnect your local machine while the GPU server continues training.
+**What this script does:** The `remote_train.sh` script connects to your GPU server via SSH and executes `run_llm_stylometry.sh --train -y` (or `--train --resume -y` if resuming) in a `screen` session. This allows you to disconnect your local machine while the GPU server continues training.

 The script will:
 1. SSH into your GPU server
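For readers who want to script the workflow documented above, here is a minimal sketch that drives the same CLI from Python instead of the shell. The `--train` and `--resume` flags and the `generate_figures.py` entry point are taken from this diff; the wrapper function itself is hypothetical and assumes it is run from the directory containing `generate_figures.py`, as in the README examples.

```python
import subprocess
import sys

def run_training(resume: bool = False) -> int:
    """Invoke the documented CLI; pass resume=True to continue from checkpoints."""
    cmd = [sys.executable, "generate_figures.py", "--train"]
    if resume:
        cmd.append("--resume")
    # Stream output directly so training progress stays visible.
    return subprocess.run(cmd, check=False).returncode

if __name__ == "__main__":
    # Do a fresh run first; rerun with resume=True after an interruption.
    raise SystemExit(run_training(resume=True))
```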

code/generate_figures.py

Lines changed: 41 additions & 14 deletions
@@ -21,10 +21,13 @@
 from llm_stylometry.cli_utils import safe_print, format_header, is_windows


-def train_models(max_gpus=None, no_confirm=False):
-    """Train all models from scratch."""
+def train_models(max_gpus=None, no_confirm=False, resume=False):
+    """Train all models from scratch or resume from checkpoints."""
     safe_print("\n" + "=" * 60)
-    safe_print("Training Models from Scratch")
+    if resume:
+        safe_print("Resuming Model Training from Checkpoints")
+    else:
+        safe_print("Training Models from Scratch")
     safe_print("=" * 60)
     warning = "[WARNING]" if is_windows() else "⚠️"
     # Check device availability
@@ -51,19 +54,29 @@ def train_models(max_gpus=None, no_confirm=False):
         safe_print("\nSkipping confirmation (--no-confirm flag set)")
         safe_print("Starting training...")

-    # Remove existing models directory to train from scratch
+    # Handle models directory based on resume flag
     import shutil
     models_dir = Path('models')
-    if models_dir.exists():
-        safe_print("\nRemoving existing models directory...")
-        shutil.rmtree(models_dir)
-        safe_print("Existing models removed.")

-    # Also remove existing model results file
-    model_results_path = Path('data/model_results.pkl')
-    if model_results_path.exists():
-        safe_print("Removing existing model_results.pkl...")
-        model_results_path.unlink()
+    if not resume:
+        # Remove existing models directory to train from scratch
+        if models_dir.exists():
+            safe_print("\nRemoving existing models directory...")
+            shutil.rmtree(models_dir)
+            safe_print("Existing models removed.")
+
+        # Also remove existing model results file
+        model_results_path = Path('data/model_results.pkl')
+        if model_results_path.exists():
+            safe_print("Removing existing model_results.pkl...")
+            model_results_path.unlink()
+    else:
+        # When resuming, keep existing models and check their status
+        if models_dir.exists():
+            safe_print("\nResuming from existing models directory...")
+        else:
+            safe_print("\nNo existing models found. Starting fresh training...")
+            resume = False # Fall back to fresh training if no models exist

     # Prepare data if needed
     if not Path('data/cleaned').exists():
@@ -98,6 +111,9 @@ def train_models(max_gpus=None, no_confirm=False):
     if max_gpus:
         env['MAX_GPUS'] = str(max_gpus)
         safe_print(f"Limiting to {max_gpus} GPU(s)")
+    # Pass through resume flag if specified
+    if resume:
+        env['RESUME_TRAINING'] = '1'
     # Run without capturing output so we can see progress
     result = subprocess.run([sys.executable, 'code/main.py'], env=env, check=False)
     if result.returncode != 0:
@@ -227,6 +243,12 @@ def main():
         help='Skip confirmation prompts (useful for non-interactive mode)'
     )

+    parser.add_argument(
+        '--resume', '-r',
+        action='store_true',
+        help='Resume training from existing checkpoints (use with --train)'
+    )
+
     args = parser.parse_args()

     if args.list:
@@ -242,9 +264,14 @@ def main():

     safe_print(format_header("LLM Stylometry CLI", 60))

+    # Validate --resume flag usage
+    if args.resume and not args.train:
+        safe_print("\nWarning: --resume flag is ignored without --train flag")
+        args.resume = False
+
     # Train models if requested
     if args.train:
-        if not train_models(max_gpus=args.max_gpus, no_confirm=args.no_confirm):
+        if not train_models(max_gpus=args.max_gpus, no_confirm=args.no_confirm, resume=args.resume):
             return 1
         # Update data path to use newly generated results
         args.data = 'data/model_results.pkl'
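The hand-off between this CLI and the training driver is a single environment variable: `train_models()` sets `RESUME_TRAINING=1` before launching `code/main.py`, which reads it back as `resume_mode`. Below is a self-contained sketch of that contract, using a throwaway child process as a stand-in for the real training script:

```python
import os
import subprocess
import sys

# Parent side: mirror what train_models() does before launching code/main.py.
env = os.environ.copy()
env["RESUME_TRAINING"] = "1"

# Child side: the same check code/main.py performs, run here in a tiny stand-in process.
child_code = "import os; print(os.environ.get('RESUME_TRAINING', '0') == '1')"
result = subprocess.run(
    [sys.executable, "-c", child_code],
    env=env, capture_output=True, text=True, check=True,
)
print("resume_mode seen by child:", result.stdout.strip())  # -> True
```

Passing the flag through the environment rather than `sys.argv` leaves the child's argument handling untouched, and any process spawned with this environment inherits the setting.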

code/main.py

Lines changed: 93 additions & 0 deletions
@@ -36,6 +36,52 @@ def tqdm(iterable, *args, **kwargs):
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

+def check_model_complete(model_name, stop_train_loss=3.0, min_epochs=0):
+    """
+    Check if a model has completed training based on loss logs and weights.
+
+    Returns:
+        tuple: (is_complete, has_weights, epochs_completed)
+            - is_complete: True if model has met stop criteria
+            - has_weights: True if model weights exist
+            - epochs_completed: Number of epochs completed (0 if no logs)
+    """
+    model_dir = MODELS_DIR / model_name
+
+    # Check if model weights exist
+    weights_file = model_dir / "model.safetensors"
+    config_file = model_dir / "config.json"
+    training_state_file = model_dir / "training_state.pt"
+    has_weights = weights_file.exists() and config_file.exists() and training_state_file.exists()
+
+    # Check loss logs
+    loss_log_file = model_dir / "loss_logs.csv"
+    if not loss_log_file.exists():
+        return False, has_weights, 0
+
+    # Read loss logs to check training status
+    import pandas as pd
+    try:
+        df = pd.read_csv(loss_log_file)
+        if df.empty:
+            return False, has_weights, 0
+
+        # Get the last training loss for this model
+        train_losses = df[df['loss_dataset'] == 'train'].sort_values('epochs_completed')
+        if train_losses.empty:
+            return False, has_weights, 0
+
+        last_epoch = train_losses['epochs_completed'].max()
+        last_train_loss = train_losses[train_losses['epochs_completed'] == last_epoch]['loss_value'].iloc[0]
+
+        # Check if model has met stop criteria
+        is_complete = (last_train_loss <= stop_train_loss and last_epoch >= min_epochs)
+
+        return is_complete, has_weights, int(last_epoch)
+    except Exception as e:
+        logger.warning(f"Error reading loss logs for {model_name}: {e}")
+        return False, has_weights, 0
+
 # Detect available devices
 def get_device_info():
     """Detect and return device configuration."""
@@ -51,6 +97,9 @@ def get_device_info():
 device_type, device_count = get_device_info()
 logger.info(f"Device type: {device_type}, Count: {device_count}")

+# Check if we're in resume mode
+resume_mode = os.environ.get('RESUME_TRAINING', '0') == '1'
+
 experiments = []
 for seed in range(10):
     for author in AUTHORS:
@@ -59,6 +108,7 @@ def get_device_info():
                 train_author=author,
                 seed=seed,
                 tokenizer_name="gpt2",
+                resume_training=resume_mode,
             )
         )

@@ -298,6 +348,49 @@ def run_experiment(exp: Experiment, device_queue, device_type="cuda"):
 # Check if we should run sequentially (for subprocess compatibility)
 USE_MULTIPROCESSING = os.environ.get('NO_MULTIPROCESSING', '0') != '1'

+# Filter experiments based on resume mode
+if resume_mode:
+    logger.info("Checking existing models for resume...")
+    experiments_to_run = []
+    import shutil
+
+    for exp in experiments:
+        is_complete, has_weights, epochs_done = check_model_complete(
+            exp.name,
+            exp.stop_criteria["train_loss"],
+            exp.stop_criteria["min_epochs"]
+        )
+
+        if is_complete:
+            # Model has completed training - skip it
+            logger.info(f"Skipping {exp.name} - already complete (epochs: {epochs_done})")
+        elif has_weights:
+            # Model has weights and can be resumed
+            logger.info(f"Resuming {exp.name} from epoch {epochs_done}")
+            experiments_to_run.append(exp)
+        elif epochs_done > 0:
+            # Loss logs exist but no weights (e.g., after cloning repo) - need to restart
+            logger.info(f"Starting {exp.name} from scratch - no weights available (removing existing logs)")
+            model_dir = MODELS_DIR / exp.name
+            if model_dir.exists():
+                # Remove only this specific model's directory to start fresh
+                shutil.rmtree(model_dir)
+            exp.resume_training = False # Force fresh start for this model
+            experiments_to_run.append(exp)
+        else:
+            # No logs or weights - start fresh for this model
+            logger.info(f"Starting fresh: {exp.name} (no existing logs or weights)")
+            exp.resume_training = False # No checkpoint to resume from
+            experiments_to_run.append(exp)
+
+    experiments = experiments_to_run
+    total_models = 80 # 8 authors × 10 seeds
+    logger.info(f"Models to train: {len(experiments)} out of {total_models} total")
+
+    if not experiments:
+        logger.info("All models are complete. Nothing to train.")
+        sys.exit(0)
+
 # Use already detected device configuration
 if device_type == "cuda":
     # Check for MAX_GPUS environment variable to optionally limit GPU usage
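To make the resume decision concrete: `check_model_complete` keys off the last `train` row in a model's `loss_logs.csv` plus the presence of weight files, and the filter above then skips, resumes, or restarts each experiment accordingly. Below is a small standalone illustration of the same stopping rule against an in-memory table; the column names come from this diff, while the example values and thresholds are made up for the demo:

```python
import pandas as pd

# Columns mirror the loss_logs.csv schema read by check_model_complete.
df = pd.DataFrame({
    "loss_dataset":     ["train", "val", "train", "val"],
    "epochs_completed": [1,        1,     2,       2],
    "loss_value":       [4.2,      4.5,   2.9,     3.3],
})

stop_train_loss, min_epochs = 3.0, 2  # example stop criteria

# Same logic as check_model_complete: take the latest training loss...
train_losses = df[df["loss_dataset"] == "train"].sort_values("epochs_completed")
last_epoch = train_losses["epochs_completed"].max()
last_train_loss = train_losses[
    train_losses["epochs_completed"] == last_epoch
]["loss_value"].iloc[0]

# ...and compare it against the stop criteria.
is_complete = last_train_loss <= stop_train_loss and last_epoch >= min_epochs
print(f"epoch {last_epoch}: train loss {last_train_loss} -> complete={is_complete}")
# epoch 2: train loss 2.9 -> complete=True
```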

code/model_utils.py

Lines changed: 30 additions & 1 deletion
@@ -3,6 +3,8 @@
 import logging
 from torch.optim import AdamW
 from constants import MODELS_DIR
+import random
+import numpy as np

 logger = logging.getLogger(__name__)

@@ -18,10 +20,19 @@ def save_checkpoint(

     model.save_pretrained(save_directory=checkpoint_dir)

+    # Save training state including random states for deterministic resume
     training_state = {
         "optimizer_state_dict": optimizer.state_dict(),
         "epochs_completed": epochs_completed,
+        "random_state": random.getstate(),
+        "np_random_state": np.random.get_state(),
+        "torch_random_state": torch.get_rng_state(),
     }
+
+    # Also save CUDA random state if available
+    if torch.cuda.is_available():
+        training_state["cuda_random_state"] = torch.cuda.get_rng_state_all()
+
     torch.save(obj=training_state, f=checkpoint_dir / "training_state.pt")
     logger.info(
         f"Checkpoint saved for {model_name} at epochs_completed={epochs_completed}"
@@ -42,11 +53,29 @@ def load_checkpoint(model_class, model_name, device):
     if not training_state_path.exists():
         raise FileNotFoundError(f"Training state file not found for {model_name}")

-    training_state = torch.load(f=training_state_path)
+    training_state = torch.load(f=training_state_path, map_location=device)

     optimizer = AdamW(params=model.parameters(), lr=0)
     optimizer.load_state_dict(state_dict=training_state["optimizer_state_dict"])
     epochs_completed = training_state["epochs_completed"]
+
+    # Restore random states for deterministic resume (if available)
+    if "random_state" in training_state:
+        random.setstate(training_state["random_state"])
+        logger.info("Restored Python random state")
+
+    if "np_random_state" in training_state:
+        np.random.set_state(training_state["np_random_state"])
+        logger.info("Restored NumPy random state")
+
+    if "torch_random_state" in training_state:
+        torch.set_rng_state(training_state["torch_random_state"])
+        logger.info("Restored PyTorch random state")
+
+    if "cuda_random_state" in training_state and torch.cuda.is_available():
+        torch.cuda.set_rng_state_all(training_state["cuda_random_state"])
+        logger.info("Restored CUDA random state")
+
     logger.info(
         f"Checkpoint loaded for {model_name} from epochs_completed={epochs_completed}"
     )
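The reason the checkpoint carries RNG states is that a resumed run should draw the same random numbers it would have drawn had training never stopped. Here is a self-contained round trip demonstrating that property for the three CPU-side generators saved above (CUDA state omitted; the temp-file path and the `weights_only=False` argument are specific to this demo, which must deserialize plain Python objects on newer PyTorch):

```python
import random
import tempfile
from pathlib import Path

import numpy as np
import torch

# Capture the generator states, in the same spirit as save_checkpoint.
state = {
    "random_state": random.getstate(),
    "np_random_state": np.random.get_state(),
    "torch_random_state": torch.get_rng_state(),
}
ckpt = Path(tempfile.mkdtemp()) / "training_state.pt"
torch.save(obj=state, f=ckpt)

# Draw once "before the interruption"...
expected = (random.random(), np.random.rand(), torch.rand(1).item())

# ...then restore the saved states and draw again: the values match exactly.
restored = torch.load(f=ckpt, weights_only=False)
random.setstate(restored["random_state"])
np.random.set_state(restored["np_random_state"])
torch.set_rng_state(restored["torch_random_state"])
resumed = (random.random(), np.random.rand(), torch.rand(1).item())

assert resumed == expected
print("RNG streams identical after restore:", resumed == expected)
```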
