
Commit 7b238ed

initial (0 parents)

File tree

19 files changed: 921 additions, 0 deletions

.github/workflows/ci.yml

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
name: CI

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Versions are quoted so YAML does not parse 3.10 as the number 3.1
        python-version: ["3.9", "3.10", "3.11"]

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run pre-commit hooks
        run: |
          pre-commit install
          pre-commit run --all-files

      - name: Run tests
        run: |
          pytest --cov=. --cov-report=xml

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        with:
          file: ./coverage.xml
          fail_ci_if_error: true
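
The workflow installs dependencies only from `requirements.txt`, so for the pre-commit and coverage steps to pass, that file presumably pins at least the tools invoked above. The file is part of this commit but not expanded in this view; a plausible minimal set (an assumption, not the actual file contents):

```
click
python-dotenv
pydantic
pytest
pytest-cov
pre-commit
```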

.pre-commit-config.yaml

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.4.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files
      - id: check-ast
      - id: check-json
      - id: check-merge-conflict
      - id: detect-private-key

  - repo: https://github.com/psf/black
    rev: 23.3.0
    hooks:
      - id: black
        language_version: python3

  - repo: https://github.com/pycqa/isort
    rev: 5.12.0
    hooks:
      - id: isort
        args: ["--profile", "black"]

  - repo: https://github.com/pycqa/flake8
    rev: 6.0.0
    hooks:
      - id: flake8
        additional_dependencies: [flake8-docstrings]
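
A note on usage: beyond `pre-commit run --all-files`, individual hooks from this config can be run by their id, which helps when iterating on a single tool:

```bash
pre-commit run black --all-files
```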

README.md

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
# Reddit Scraper

A simple tool to scrape posts and comments from Reddit subreddits.

## What it does

- Scrapes top posts and their comments from specified subreddits
- Supports monthly or yearly time periods
- Can limit the number of posts scraped per subreddit
- Saves data in JSON format for easy analysis

## Installation

### From source

```bash
git clone https://github.com/yourusername/reddit-scraper.git
cd reddit-scraper
pip install -e .
```

### Using pip

```bash
pip install reddit-scraper
```

## Usage

1. Create a `subreddits.json` file with your target subreddits:

   ```json
   [
     "programming",
     "python",
     "physics",
     "biology"
   ]
   ```

2. Run the scraper:

   ```bash
   reddit-scraper -d month -s subreddits.json
   ```

3. To limit the number of posts scraped per subreddit:

   ```bash
   reddit-scraper -d month -s subreddits.json -l 50
   ```

## Options

- `-d, --duration`: Time period to scrape (`month` or `year`)
- `-l, --post-limit`: Maximum number of posts per subreddit
- `-s, --subreddits-file`: Path to the subreddits config file

## Output

Data is saved as JSON files under the `data/` directory, one file per subreddit.
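
Each file contains the scraped posts serialized from the `Post` and `Comment` models, with timestamps rendered as ISO 8601 strings by the project's custom JSON encoder. Roughly (a sketch of the shape, with placeholder values):

```json
[
  {
    "post_body": "Post title",
    "post_user": "author_name",
    "post_time": "2024-01-15T12:34:56",
    "comments": [
      {
        "body": "Comment text",
        "user": "commenter_name",
        "time": "2024-01-15T13:00:00",
        "replies": []
      }
    ]
  }
]
```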
## Development

### Project Structure

```
reddit-scraper/
├── reddit_scraper/            # Main package
│   ├── core/                  # Core functionality
│   │   ├── models.py          # Data models
│   │   ├── scraper.py         # Scraping logic
│   │   └── data_processor.py  # Data processing
│   ├── utils/                 # Utilities
│   │   └── config.py          # Configuration
│   ├── __init__.py            # Package initialization
│   └── __main__.py            # Entry point
├── tests/                     # Test suite
├── setup.py                   # Package setup
├── requirements.txt           # Dependencies
└── README.md                  # Documentation
```

### Pre-commit Hooks

This project uses pre-commit hooks to ensure code quality. To set them up:

```bash
pre-commit install
```

The hooks run automatically on commit, or you can run them manually:

```bash
pre-commit run --all-files
```

### Testing

Run the tests with:

```bash
pytest
```

For coverage information:

```bash
pytest --cov=. --cov-report=term-missing
```

pytest.ini

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
[pytest]
testpaths = tests
python_files = test_*.py
python_classes = Test*
python_functions = test_*
addopts = --cov=. --cov-report=term-missing

reddit_scraper/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
"""Reddit Scraper - A simple tool to scrape posts and comments from Reddit subreddits."""

__version__ = "0.1.0"

reddit_scraper/__main__.py

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
"""Reddit Scraper - A simple tool to scrape posts and comments from Reddit subreddits."""

from typing import Optional

import click
from dotenv import load_dotenv

from reddit_scraper.core.data_processor import DataProcessor
from reddit_scraper.core.models import SubredditConfig
from reddit_scraper.core.scraper import RedditScraper

load_dotenv()


def process_subreddit(
    subreddit_config: SubredditConfig, post_limit: Optional[int] = None
) -> None:
    """Process a single subreddit."""
    print(f"Scraping subreddit: {subreddit_config.name}")
    scraper = RedditScraper(subreddit=subreddit_config.url, post_limit=post_limit)
    processor = DataProcessor()

    try:
        scraper.get_posts()
        # Keep fetching post details until the scraper reports no more checkpoints.
        while True:
            checkpointed = scraper.get_post_details()
            if not checkpointed:
                break

        processor.save_to_json(scraper.posts, subreddit_config.name)
    except Exception as e:
        print(f"Error processing subreddit {subreddit_config.name}: {e}")
    finally:
        scraper.destroy()


@click.command()
@click.option(
    "-d",
    "--duration",
    prompt="Scrape Duration",
    help="Duration to scrape for (month/year)",
)
@click.option(
    "-s",
    "--subreddits-file",
    default="subreddits.json",
    help="Path to the subreddits JSON file",
)
@click.option(
    "-l",
    "--post-limit",
    type=int,
    help="Maximum number of posts to scrape per subreddit",
)
def main(duration: str, subreddits_file: str, post_limit: Optional[int] = None) -> None:
    """Main entry point for the Reddit scraper."""
    if duration not in ["month", "year"]:
        raise ValueError("Duration must be either 'month' or 'year'")

    processor = DataProcessor()
    subreddits = processor.read_subreddits_from_json(subreddits_file, duration)

    for subreddit_config in subreddits:
        process_subreddit(subreddit_config, post_limit)


if __name__ == "__main__":
    main()
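
Because the package ships a `__main__.py`, the CLI can also be invoked directly as a module, equivalent to the `reddit-scraper` console script described in the README:

```bash
python -m reddit_scraper --duration month --subreddits-file subreddits.json --post-limit 50
```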

reddit_scraper/core/__init__.py

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
"""Core functionality for the Reddit Scraper."""

from reddit_scraper.core.data_processor import DataProcessor, DateTimeEncoder
from reddit_scraper.core.models import Comment, Post, ScraperState, SubredditConfig
from reddit_scraper.core.scraper import RedditScraper

__all__ = [
    "Comment",
    "Post",
    "SubredditConfig",
    "ScraperState",
    "RedditScraper",
    "DataProcessor",
    "DateTimeEncoder",
]
reddit_scraper/core/data_processor.py

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
import json
import os
from datetime import datetime
from typing import Any, Dict, List

from reddit_scraper.core.models import Comment, Post, SubredditConfig
from reddit_scraper.utils.config import get_scraper_config


class DateTimeEncoder(json.JSONEncoder):
    """Custom JSON encoder for datetime objects."""

    def default(self, obj):
        if isinstance(obj, datetime):
            return obj.isoformat()
        return super().default(obj)


class DataProcessor:
    """Handles data processing and storage operations."""

    def __init__(self):
        """Initialize the data processor."""
        self.config = get_scraper_config()

    def parse_post_data(self, json_data: Dict[str, Any]) -> Post:
        """Parse raw JSON data into a Post model."""
        # A Reddit post endpoint returns a two-element listing:
        # the post itself, followed by its comment tree.
        post = json_data[0]["data"]["children"][0]["data"]
        comments_data = json_data[1]["data"]["children"]

        return Post(
            post_body=post["title"],
            post_user=post["author"],
            post_time=datetime.fromtimestamp(post["created_utc"]),
            comments=self._parse_comments(comments_data),
        )

    def _parse_comments(self, comment_data: List[Dict[str, Any]]) -> List[Comment]:
        """Parse comment data into Comment models, recursing into replies."""
        comments = []
        for comment in comment_data:
            # Skip non-comment nodes (only kind "t1" is a comment).
            if comment["kind"] != "t1":
                continue

            comment_dict = comment["data"]
            comments.append(
                Comment(
                    body=comment_dict["body"],
                    user=comment_dict["author"],
                    time=datetime.fromtimestamp(comment_dict["created_utc"]),
                    replies=self._parse_comments(
                        comment_dict["replies"]["data"]["children"]
                    )
                    if comment_dict.get("replies")
                    else [],
                )
            )
        return comments

    def save_to_json(self, data: List[Post], subreddit: str) -> str:
        """Save processed posts to one JSON file per subreddit; return the path."""
        directory = self.config.data_dir
        os.makedirs(directory, exist_ok=True)
        filename = f"{directory}/{subreddit}.json"

        with open(filename, "w") as f:
            json.dump([post.dict() for post in data], f, cls=DateTimeEncoder)
        return filename

    def read_subreddits_from_json(
        self, filename: str, duration: str
    ) -> List[SubredditConfig]:
        """Read subreddit configurations from a JSON file."""
        with open(filename) as f:
            subreddits_list = json.load(f)

        return [
            SubredditConfig(
                name=subreddit,
                url=f"https://www.reddit.com/r/{subreddit}/top/?t={duration}",
            )
            for subreddit in subreddits_list
        ]
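
`models.py` is among the 19 files in this commit but is not expanded in this view. A minimal sketch of what it plausibly contains, with field names inferred from how `data_processor.py` and `__main__.py` construct these objects, and Pydantic v1 assumed from the `post.dict()` call above; `ScraperState` is re-exported by `core/__init__.py` but its fields are not visible here:

```python
# Hypothetical reconstruction of reddit_scraper/core/models.py -- field names
# come from call sites in this commit; Pydantic v1 is an assumption.
from datetime import datetime
from typing import List

from pydantic import BaseModel


class Comment(BaseModel):
    body: str
    user: str
    time: datetime
    replies: List["Comment"] = []


# Resolve the self-referencing "Comment" annotation (Pydantic v1 idiom).
Comment.update_forward_refs()


class Post(BaseModel):
    post_body: str
    post_user: str
    post_time: datetime
    comments: List[Comment] = []


class SubredditConfig(BaseModel):
    name: str
    url: str
```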
