1+ """
2+ Module for scraping Reddit Q/A pairs, cleaning text, splitting data,
3+ and tokenizing for model training.
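+
+Typical usage (illustrative sketch; the subreddit name, sample size, and output
+directory below are placeholders):
+
+    qa_raw = scrape({"AskReddit": 500})
+    cleaned = preprocess(qa_raw)
+    split_and_save(pd.DataFrame(cleaned), "data/processed")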
4+ """
5+
 import os
 import re
 import time
 from pathlib import Path
 from praw import Reddit
 from transformers import AutoTokenizer
-from typing import Union, Tuple
+from typing import Dict, List, Any, Union, Tuple
+import pandas as pd
+
+
+def init_reddit() -> Reddit:
+    """
+    Initialize and return a Reddit client using environment variables.
 
+    Environment Variables:
+        REDDIT_CLIENT_ID: Reddit API client ID.
+        REDDIT_CLIENT_SECRET: Reddit API client secret.
+        REDDIT_USER_AGENT: User agent string for Reddit API.
 
-def init_reddit() -> None:
+    Returns:
+        An authenticated praw.Reddit instance.
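+
+    Example (illustrative; assumes the three variables above are set in the environment):
+        reddit = init_reddit()
+        print(reddit.read_only)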
+    """
     return Reddit(
         client_id=os.environ["REDDIT_CLIENT_ID"],
         client_secret=os.environ["REDDIT_CLIENT_SECRET"],
@@ -19,6 +36,15 @@ def init_reddit() -> None:
 
 
 def clean_text(txt: str) -> str:
+    """
+    Clean a text string by removing HTML, code fences, URLs, emojis, quotes, and extra whitespace.
+
+    Args:
+        txt: Raw text to be cleaned.
+
+    Returns:
+        A cleaned text string.
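+
+    Example (illustrative; the exact output depends on the regex rules below):
+        clean_text("<p>See https://example.com for the **answer**</p>")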
+    """
     # strip HTML/Markdown
     txt = BeautifulSoup(txt, "html.parser").get_text()
     # remove code fences
@@ -36,7 +62,16 @@ def clean_text(txt: str) -> str:
     return txt
 
 
-def scrape(sub_size_map):
+def scrape(sub_size_map: Dict[str, int]) -> List[Dict[str, Any]]:
+    """
+    Scrape top posts and their highest-quality comments from specified subreddits.
+
+    Args:
+        sub_size_map: Mapping of subreddit names to sample sizes.
+
+    Returns:
+        List of dicts each containing 'id', 'subreddit', 'question', 'answer', and 'url'.
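+
+    Example (illustrative; subreddit names and sample sizes are placeholders):
+        qa_raw = scrape({"AskReddit": 500, "explainlikeimfive": 300})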
+    """
     reddit = init_reddit()
     qa = []  # the Q/A posts to train the model
 
@@ -149,7 +184,16 @@ def _comment_quality(c):
     return qa
 
 
-def preprocess(qa_raw):
+def preprocess(qa_raw: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """
+    Apply text cleaning to raw Q/A entries.
+
+    Args:
+        qa_raw: List of dicts with raw 'question' and 'answer' fields.
+
+    Returns:
+        A list of dicts with the same keys and cleaned 'question' and 'answer' text.
+    """
     cleaned = []
     for item in qa_raw:
         q = clean_text(item["question"])
@@ -166,7 +210,14 @@ def preprocess(qa_raw):
     return cleaned
 
 
-def split_and_save(df, out_dir: Union[str, Path]):
+def split_and_save(df: pd.DataFrame, out_dir: Union[str, Path]) -> None:
+    """
+    Shuffle, split, and save a DataFrame into train/validation/test CSV files.
+
+    Args:
+        df: DataFrame containing Q/A data.
+        out_dir: Directory path to save the CSV files.
+    """
     # create the dir path if not existing
     out_dir = Path(out_dir)
     out_dir.mkdir(parents=True, exist_ok=True)
@@ -195,6 +246,18 @@ def tokenize_and_format(
     max_input_length: int = 512,  # max 1024 1024
     max_target_length: int = 128,  # max 1024 800
 ) -> Tuple[DatasetDict, AutoTokenizer]:
+    """
+    Tokenize and format a DatasetDict for model training.
+
+    Args:
+        ds: DatasetDict with 'question' and 'answer' columns.
+        checkpoint: Pretrained tokenizer checkpoint identifier.
+        max_input_length: Maximum input token length.
+        max_target_length: Maximum target token length.
+
+    Returns:
+        Tuple of tokenized DatasetDict and the tokenizer.
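+
+    Example (illustrative; the checkpoint name is a placeholder):
+        tokenized, tok = tokenize_and_format(ds, checkpoint="facebook/bart-base")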
+    """
     tok = AutoTokenizer.from_pretrained(checkpoint)
 
     def _preprocess_batch(examples):