Skip to content

Commit 486c798

Browse files
authored
feat: Add Reddit ingest connector (#293)
Add Reddit data connector for ingest. The connector can process a subreddit, either via a search query or via hot posts. The texts in the submissions are converted to markdown files including the post title and the text body, if any (i.e. no images or videos). The number of posts to fetch can be changed with the CLI.
1 parent 0a51f28 commit 486c798

File tree

11 files changed

+405
-5
lines changed

11 files changed

+405
-5
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.4.16-dev2
1+
## 0.4.16-dev3
22

33
### Enhancements
44

@@ -7,6 +7,7 @@
77
### Features
88

99
* Added setup script for Ubuntu
10+
* Added Reddit connector for ingest cli.
1011

1112
### Fixes
1213

Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ install-build:
5454
install-ingest-s3:
5555
pip install -r requirements/ingest-s3.txt
5656

57+
.PHONY: install-ingest-reddit
58+
install-ingest-reddit:
59+
pip install -r requirements/ingest-reddit.txt
60+
5761
.PHONY: install-unstructured-inference
5862
install-unstructured-inference:
5963
pip install -r requirements/local-inference.txt
@@ -83,6 +87,7 @@ pip-compile:
8387
# sphinx docs looks for additional requirements
8488
cp requirements/build.txt docs/requirements.txt
8589
pip-compile --upgrade requirements/ingest-s3.in requirements/base.txt --output-file requirements/ingest-s3.txt
90+
pip-compile --upgrade --extra=reddit --output-file=requirements/ingest-reddit.txt requirements/base.txt setup.py
8691

8792
## install-project-local: install unstructured into your local python environment
8893
.PHONY: install-project-local

examples/ingest/reddit/ingest.sh

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#!/usr/bin/env bash
2+
3+
# Processes the Unstructured-IO/unstructured repository
4+
# through Unstructured's library in 2 processes.
5+
6+
# Structured outputs are stored in reddit-ingest-output/
7+
8+
# NOTE, this script is not ready-to-run!
9+
# You must enter a client ID and a client secret before running.
10+
# You can find out how to get them here:
11+
# https://github.com/reddit-archive/reddit/wiki/OAuth2-Quick-Start-Example#first-steps
12+
# It is quite easy and very quick.
13+
14+
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
15+
cd "$SCRIPT_DIR"/../../.. || exit 1
16+
17+
PYTHONPATH=. ./unstructured/ingest/main.py \
18+
--subreddit-name machinelearning \
19+
--reddit-client-id "<client id here>" \
20+
--reddit-client-secret "<client secret here>" \
21+
--reddit-user-agent "Unstructured Ingest Subreddit fetcher by \u\..." \
22+
--reddit-search-query "Unstructured" \
23+
--reddit-num-posts 10 \
24+
--structured-output-dir reddit-ingest-output \
25+
--num-processes 2 \
26+
--verbose
27+
28+
# Alternatively, you can call it using:
29+
# unstructured-ingest --subreddit-name ...

requirements/ingest-reddit.txt

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
#
2+
# This file is autogenerated by pip-compile with Python 3.8
3+
# by the following command:
4+
#
5+
# pip-compile --extra=reddit --output-file=requirements/ingest-reddit.txt requirements/base.txt setup.py
6+
#
7+
anyio==3.6.2
8+
# via
9+
# -r requirements/base.txt
10+
# httpcore
11+
argilla==1.3.0
12+
# via
13+
# -r requirements/base.txt
14+
# unstructured (setup.py)
15+
backoff==2.2.1
16+
# via
17+
# -r requirements/base.txt
18+
# argilla
19+
certifi==2022.12.7
20+
# via
21+
# -r requirements/base.txt
22+
# httpcore
23+
# httpx
24+
# requests
25+
# unstructured (setup.py)
26+
charset-normalizer==3.0.1
27+
# via
28+
# -r requirements/base.txt
29+
# requests
30+
click==8.1.3
31+
# via
32+
# -r requirements/base.txt
33+
# nltk
34+
colorama==0.4.6
35+
# via
36+
# click
37+
# tqdm
38+
deprecated==1.2.13
39+
# via
40+
# -r requirements/base.txt
41+
# argilla
42+
et-xmlfile==1.1.0
43+
# via
44+
# -r requirements/base.txt
45+
# openpyxl
46+
h11==0.14.0
47+
# via
48+
# -r requirements/base.txt
49+
# httpcore
50+
httpcore==0.16.3
51+
# via
52+
# -r requirements/base.txt
53+
# httpx
54+
httpx==0.23.3
55+
# via
56+
# -r requirements/base.txt
57+
# argilla
58+
idna==3.4
59+
# via
60+
# -r requirements/base.txt
61+
# anyio
62+
# requests
63+
# rfc3986
64+
joblib==1.2.0
65+
# via
66+
# -r requirements/base.txt
67+
# nltk
68+
lxml==4.9.2
69+
# via
70+
# -r requirements/base.txt
71+
# python-docx
72+
# python-pptx
73+
# unstructured (setup.py)
74+
monotonic==1.6
75+
# via
76+
# -r requirements/base.txt
77+
# argilla
78+
nltk==3.8.1
79+
# via
80+
# -r requirements/base.txt
81+
# unstructured (setup.py)
82+
numpy==1.23.5
83+
# via
84+
# -r requirements/base.txt
85+
# argilla
86+
# pandas
87+
openpyxl==3.1.1
88+
# via
89+
# -r requirements/base.txt
90+
# unstructured (setup.py)
91+
packaging==23.0
92+
# via
93+
# -r requirements/base.txt
94+
# argilla
95+
pandas==1.5.3
96+
# via
97+
# -r requirements/base.txt
98+
# argilla
99+
# unstructured (setup.py)
100+
pillow==9.4.0
101+
# via
102+
# -r requirements/base.txt
103+
# python-pptx
104+
# unstructured (setup.py)
105+
praw==7.7.0
106+
# via unstructured (setup.py)
107+
prawcore==2.3.0
108+
# via praw
109+
pydantic==1.10.4
110+
# via
111+
# -r requirements/base.txt
112+
# argilla
113+
python-dateutil==2.8.2
114+
# via
115+
# -r requirements/base.txt
116+
# pandas
117+
python-docx==0.8.11
118+
# via
119+
# -r requirements/base.txt
120+
# unstructured (setup.py)
121+
python-magic==0.4.27
122+
# via
123+
# -r requirements/base.txt
124+
# unstructured (setup.py)
125+
python-pptx==0.6.21
126+
# via
127+
# -r requirements/base.txt
128+
# unstructured (setup.py)
129+
pytz==2022.7.1
130+
# via
131+
# -r requirements/base.txt
132+
# pandas
133+
regex==2022.10.31
134+
# via
135+
# -r requirements/base.txt
136+
# nltk
137+
requests==2.28.2
138+
# via
139+
# -r requirements/base.txt
140+
# prawcore
141+
# unstructured (setup.py)
142+
# update-checker
143+
rfc3986[idna2008]==1.5.0
144+
# via
145+
# -r requirements/base.txt
146+
# httpx
147+
six==1.16.0
148+
# via
149+
# -r requirements/base.txt
150+
# python-dateutil
151+
sniffio==1.3.0
152+
# via
153+
# -r requirements/base.txt
154+
# anyio
155+
# httpcore
156+
# httpx
157+
tqdm==4.64.1
158+
# via
159+
# -r requirements/base.txt
160+
# argilla
161+
# nltk
162+
typing-extensions==4.4.0
163+
# via
164+
# -r requirements/base.txt
165+
# pydantic
166+
update-checker==0.18.0
167+
# via praw
168+
urllib3==1.26.14
169+
# via
170+
# -r requirements/base.txt
171+
# requests
172+
websocket-client==1.5.1
173+
# via praw
174+
wrapt==1.14.1
175+
# via
176+
# -r requirements/base.txt
177+
# argilla
178+
# deprecated
179+
xlsxwriter==3.0.8
180+
# via
181+
# -r requirements/base.txt
182+
# python-pptx

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@
7474
],
7575
"local-inference": ["unstructured-inference>=0.2.4"],
7676
"s3": ["boto3"],
77+
"reddit": ["praw"],
7778
},
7879
package_dir={"unstructured": "unstructured"},
7980
package_data={"unstructured": ["nlp/*.txt"]},

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.16-dev2" # pragma: no cover
1+
__version__ = "0.4.16-dev3" # pragma: no cover

unstructured/file_utils/filetype.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,6 @@ def detect_filetype(
126126
_, extension = os.path.splitext(filename)
127127
extension = extension.lower()
128128
if LIBMAGIC_AVAILABLE:
129-
mime_type = None
130129
mime_type = magic.from_file(filename, mime=True)
131130
else:
132131
return EXT_TO_FILETYPE.get(extension.lower(), FileType.UNK)
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import json
2+
import os
3+
from dataclasses import dataclass, field
4+
from pathlib import Path
5+
from typing import TYPE_CHECKING
6+
7+
from unstructured.ingest.interfaces import (
8+
BaseConnector,
9+
BaseConnectorConfig,
10+
BaseIngestDoc,
11+
)
12+
13+
if TYPE_CHECKING:
14+
from praw.models import Submission
15+
16+
17+
@dataclass
18+
class SimpleRedditConfig(BaseConnectorConfig):
19+
subreddit_name: str
20+
client_id: str
21+
client_secret: str
22+
user_agent: str
23+
search_query: str
24+
num_posts: int
25+
26+
# Standard Connector options
27+
download_dir: str
28+
# where to write structured data
29+
output_dir: str
30+
preserve_downloads: bool = False
31+
re_download: bool = False
32+
verbose: bool = False
33+
34+
def __post_init__(self):
35+
if self.num_posts <= 0:
36+
raise ValueError("The number of Reddit posts to fetch must be positive.")
37+
38+
39+
@dataclass
40+
class RedditIngestDoc(BaseIngestDoc):
41+
config: SimpleRedditConfig = field(repr=False)
42+
post: "Submission"
43+
44+
@property
45+
def filename(self) -> Path:
46+
return (Path(self.config.download_dir) / f"{self.post.id}.md").resolve()
47+
48+
def _output_filename(self):
49+
return Path(self.config.output_dir) / f"{self.post.id}.json"
50+
51+
def _create_full_tmp_dir_path(self):
52+
self.filename.parent.mkdir(parents=True, exist_ok=True)
53+
54+
def cleanup_file(self):
55+
"""Removes the local copy the file (or anything else) after successful processing."""
56+
if not self.config.preserve_downloads:
57+
if self.config.verbose:
58+
print(f"cleaning up {self}")
59+
os.unlink(self.filename)
60+
61+
def get_file(self):
62+
"""Fetches the "remote" doc and stores it locally on the filesystem."""
63+
self._create_full_tmp_dir_path()
64+
if not self.config.re_download and self.filename.is_file() and self.filename.stat():
65+
if self.config.verbose:
66+
print(f"File exists: {self.filename}, skipping download")
67+
return
68+
69+
if self.config.verbose:
70+
print(f"fetching {self} - PID: {os.getpid()}")
71+
# Write the title plus the body, if any
72+
text_to_write = f"# {self.post.title}\n{self.post.selftext}"
73+
with open(self.filename, "w", encoding="utf8") as f:
74+
f.write(text_to_write)
75+
76+
def has_output(self):
77+
"""Determine if structured output for this doc already exists."""
78+
output_filename = self._output_filename()
79+
return output_filename.is_file() and output_filename.stat()
80+
81+
def write_result(self):
82+
"""Write the structured json result for this doc. result must be json serializable."""
83+
output_filename = self._output_filename()
84+
output_filename.parent.mkdir(parents=True, exist_ok=True)
85+
with open(output_filename, "w", encoding="utf8") as output_f:
86+
json.dump(self.isd_elems_no_filename, output_f, ensure_ascii=False, indent=2)
87+
print(f"Wrote {output_filename}")
88+
89+
90+
class RedditConnector(BaseConnector):
91+
def __init__(self, config: SimpleRedditConfig):
92+
from praw import Reddit
93+
94+
self.config = config
95+
self.reddit = Reddit(
96+
client_id=config.client_id,
97+
client_secret=config.client_secret,
98+
user_agent=config.user_agent,
99+
)
100+
self.cleanup_files = not config.preserve_downloads
101+
102+
def cleanup(self, cur_dir=None):
103+
if not self.cleanup_files:
104+
return
105+
106+
if cur_dir is None:
107+
cur_dir = self.config.download_dir
108+
sub_dirs = os.listdir(cur_dir)
109+
os.chdir(cur_dir)
110+
for sub_dir in sub_dirs:
111+
# don't traverse symlinks, not that there every should be any
112+
if os.path.isdir(sub_dir) and not os.path.islink(sub_dir):
113+
self.cleanup(sub_dir)
114+
os.chdir("..")
115+
if len(os.listdir(cur_dir)) == 0:
116+
os.rmdir(cur_dir)
117+
118+
def initialize(self):
119+
pass
120+
121+
def get_ingest_docs(self):
122+
subreddit = self.reddit.subreddit(self.config.subreddit_name)
123+
if self.config.search_query:
124+
posts = subreddit.search(self.config.search_query, limit=self.config.num_posts)
125+
else:
126+
posts = subreddit.hot(limit=self.config.num_posts)
127+
return [RedditIngestDoc(self.config, post) for post in posts]

0 commit comments

Comments
 (0)