1+ """
2+ Module for scraping Reddit Q/A pairs, cleaning text, splitting data,
3+ and tokenizing for model training.
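+
+Typical usage (illustrative sketch; the subreddit name, sample size, and output
+directory below are placeholders):
+
+    qa_raw = scrape({"AskReddit": 500})
+    cleaned = preprocess(qa_raw)
+    split_and_save(pd.DataFrame(cleaned), "data/processed")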
4+ """
5+
 import os
 import re
 import time
 from pathlib import Path
 from praw import Reddit
 from transformers import AutoTokenizer
-from typing import Union, Tuple
+from typing import Dict, List, Any, Union, Tuple
+import pandas as pd
+
+
+def init_reddit() -> Reddit:
+    """
+    Initialize and return a Reddit client using environment variables.
 
+    Environment Variables:
+        REDDIT_CLIENT_ID: Reddit API client ID.
+        REDDIT_CLIENT_SECRET: Reddit API client secret.
+        REDDIT_USER_AGENT: User agent string for Reddit API.
 
-def init_reddit() -> None:
+    Returns:
+        An authenticated praw.Reddit instance.
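+
+    Example (illustrative; assumes the three variables above are set in the environment):
+        reddit = init_reddit()
+        print(reddit.read_only)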
+    """
     return Reddit(
         client_id=os.environ["REDDIT_CLIENT_ID"],
         client_secret=os.environ["REDDIT_CLIENT_SECRET"],
@@ -19,6 +36,15 @@ def init_reddit() -> None:
 
 
 def clean_text(txt: str) -> str:
+    """
+    Clean a text string by removing HTML, code fences, URLs, emojis, quotes, and extra whitespace.
+
+    Args:
+        txt: Raw text to be cleaned.
+
+    Returns:
+        A cleaned text string.
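+
+    Example (illustrative; the exact output depends on the regex rules below):
+        clean_text("<p>See https://example.com for the **answer**</p>")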
+    """
     # strip HTML/Markdown
     txt = BeautifulSoup(txt, "html.parser").get_text()
     # remove code fences
@@ -36,7 +62,16 @@ def clean_text(txt: str) -> str:
     return txt
 
 
-def scrape(sub_size_map):
+def scrape(sub_size_map: Dict[str, int]) -> List[Dict[str, Any]]:
+    """
+    Scrape top posts and their highest-quality comments from specified subreddits.
+
+    Args:
+        sub_size_map: Mapping of subreddit names to sample sizes.
+
+    Returns:
+        List of dicts each containing 'id', 'subreddit', 'question', 'answer', and 'url'.
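+
+    Example (illustrative; subreddit names and sample sizes are placeholders):
+        qa_raw = scrape({"AskReddit": 500, "explainlikeimfive": 300})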
+    """
     reddit = init_reddit()
     qa = []  # the Q/A posts to train the model
 
@@ -149,7 +184,16 @@ def _comment_quality(c):
     return qa
 
 
-def preprocess(qa_raw):
+def preprocess(qa_raw: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """
+    Apply text cleaning to raw Q/A entries.
+
+    Args:
+        qa_raw: List of dicts with raw 'question' and 'answer' fields.
+
+    Returns:
+        A list of dicts with the same keys and cleaned 'question' and 'answer' text.
+    """
     cleaned = []
     for item in qa_raw:
         q = clean_text(item["question"])
@@ -166,7 +210,14 @@ def preprocess(qa_raw):
     return cleaned
 
 
-def split_and_save(df, out_dir: Union[str, Path]):
+def split_and_save(df: pd.DataFrame, out_dir: Union[str, Path]) -> None:
+    """
+    Shuffle, split, and save a DataFrame into train/validation/test CSV files.
+
+    Args:
+        df: DataFrame containing Q/A data.
+        out_dir: Directory path to save the CSV files.
+    """
     # create the dir path if not existing
     out_dir = Path(out_dir)
     out_dir.mkdir(parents=True, exist_ok=True)
@@ -195,6 +246,18 @@ def tokenize_and_format(
     max_input_length: int = 512,  # max 1024 1024
     max_target_length: int = 128,  # max 1024 800
 ) -> Tuple[DatasetDict, AutoTokenizer]:
+    """
+    Tokenize and format a DatasetDict for model training.
+
+    Args:
+        ds: DatasetDict with 'question' and 'answer' columns.
+        checkpoint: Pretrained tokenizer checkpoint identifier.
+        max_input_length: Maximum input token length.
+        max_target_length: Maximum target token length.
+
+    Returns:
+        Tuple of tokenized DatasetDict and the tokenizer.
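+
+    Example (illustrative; the checkpoint name is a placeholder):
+        tokenized, tok = tokenize_and_format(ds, checkpoint="facebook/bart-base")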
+    """
     tok = AutoTokenizer.from_pretrained(checkpoint)
 
     def _preprocess_batch(examples):