Skip to content

Commit 486c798

Browse files
authored
feat: Add Reddit ingest connector (#293)
Add Reddit data connector for ingest. The connector can process a subreddit, either via a search query or via hot posts. The texts in the submissions are converted to markdown files including the post title and the text body, if any (i.e. no images or videos). The number of posts to fetch can be changed with the CLI.
1 parent 0a51f28 commit 486c798

File tree

11 files changed

+405
-5
lines changed

11 files changed

+405
-5
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.4.16-dev2
1+
## 0.4.16-dev3
22

33
### Enhancements
44

@@ -7,6 +7,7 @@
77
### Features
88

99
* Added setup script for Ubuntu
10+
* Added Reddit connector for ingest cli.
1011

1112
### Fixes
1213

Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ install-build:
5454
install-ingest-s3:
5555
pip install -r requirements/ingest-s3.txt
5656

57+
.PHONY: install-ingest-reddit
58+
install-ingest-reddit:
59+
pip install -r requirements/ingest-reddit.txt
60+
5761
.PHONY: install-unstructured-inference
5862
install-unstructured-inference:
5963
pip install -r requirements/local-inference.txt
@@ -83,6 +87,7 @@ pip-compile:
8387
# sphinx docs looks for additional requirements
8488
cp requirements/build.txt docs/requirements.txt
8589
pip-compile --upgrade requirements/ingest-s3.in requirements/base.txt --output-file requirements/ingest-s3.txt
90+
pip-compile --upgrade --extra=reddit --output-file=requirements/ingest-reddit.txt requirements/base.txt setup.py
8691

8792
## install-project-local: install unstructured into your local python environment
8893
.PHONY: install-project-local

examples/ingest/reddit/ingest.sh

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#!/usr/bin/env bash
2+
3+
# Processes the Unstructured-IO/unstructured repository
4+
# through Unstructured's library in 2 processes.
5+
6+
# Structured outputs are stored in reddit-ingest-output/
7+
8+
# NOTE, this script is not ready-to-run!
9+
# You must enter a client ID and a client secret before running.
10+
# You can find out how to get them here:
11+
# https://github.com/reddit-archive/reddit/wiki/OAuth2-Quick-Start-Example#first-steps
12+
# It is quite easy and very quick.
13+
14+
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
15+
cd "$SCRIPT_DIR"/../../.. || exit 1
16+
17+
PYTHONPATH=. ./unstructured/ingest/main.py \
18+
--subreddit-name machinelearning \
19+
--reddit-client-id "<client id here>" \
20+
--reddit-client-secret "<client secret here>" \
21+
--reddit-user-agent "Unstructured Ingest Subreddit fetcher by \u\..." \
22+
--reddit-search-query "Unstructured" \
23+
--reddit-num-posts 10 \
24+
--structured-output-dir reddit-ingest-output \
25+
--num-processes 2 \
26+
--verbose
27+
28+
# Alternatively, you can call it using:
29+
# unstructured-ingest --subreddit-name ...

requirements/ingest-reddit.txt

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
#
2+
# This file is autogenerated by pip-compile with Python 3.8
3+
# by the following command:
4+
#
5+
# pip-compile --extra=reddit --output-file=requirements/ingest-reddit.txt requirements/base.txt setup.py
6+
#
7+
anyio==3.6.2
8+
# via
9+
# -r requirements/base.txt
10+
# httpcore
11+
argilla==1.3.0
12+
# via
13+
# -r requirements/base.txt
14+
# unstructured (setup.py)
15+
backoff==2.2.1
16+
# via
17+
# -r requirements/base.txt
18+
# argilla
19+
certifi==2022.12.7
20+
# via
21+
# -r requirements/base.txt
22+
# httpcore
23+
# httpx
24+
# requests
25+
# unstructured (setup.py)
26+
charset-normalizer==3.0.1
27+
# via
28+
# -r requirements/base.txt
29+
# requests
30+
click==8.1.3
31+
# via
32+
# -r requirements/base.txt
33+
# nltk
34+
colorama==0.4.6
35+
# via
36+
# click
37+
# tqdm
38+
deprecated==1.2.13
39+
# via
40+
# -r requirements/base.txt
41+
# argilla
42+
et-xmlfile==1.1.0
43+
# via
44+
# -r requirements/base.txt
45+
# openpyxl
46+
h11==0.14.0
47+
# via
48+
# -r requirements/base.txt
49+
# httpcore
50+
httpcore==0.16.3
51+
# via
52+
# -r requirements/base.txt
53+
# httpx
54+
httpx==0.23.3
55+
# via
56+
# -r requirements/base.txt
57+
# argilla
58+
idna==3.4
59+
# via
60+
# -r requirements/base.txt
61+
# anyio
62+
# requests
63+
# rfc3986
64+
joblib==1.2.0
65+
# via
66+
# -r requirements/base.txt
67+
# nltk
68+
lxml==4.9.2
69+
# via
70+
# -r requirements/base.txt
71+
# python-docx
72+
# python-pptx
73+
# unstructured (setup.py)
74+
monotonic==1.6
75+
# via
76+
# -r requirements/base.txt
77+
# argilla
78+
nltk==3.8.1
79+
# via
80+
# -r requirements/base.txt
81+
# unstructured (setup.py)
82+
numpy==1.23.5
83+
# via
84+
# -r requirements/base.txt
85+
# argilla
86+
# pandas
87+
openpyxl==3.1.1
88+
# via
89+
# -r requirements/base.txt
90+
# unstructured (setup.py)
91+
packaging==23.0
92+
# via
93+
# -r requirements/base.txt
94+
# argilla
95+
pandas==1.5.3
96+
# via
97+
# -r requirements/base.txt
98+
# argilla
99+
# unstructured (setup.py)
100+
pillow==9.4.0
101+
# via
102+
# -r requirements/base.txt
103+
# python-pptx
104+
# unstructured (setup.py)
105+
praw==7.7.0
106+
# via unstructured (setup.py)
107+
prawcore==2.3.0
108+
# via praw
109+
pydantic==1.10.4
110+
# via
111+
# -r requirements/base.txt
112+
# argilla
113+
python-dateutil==2.8.2
114+
# via
115+
# -r requirements/base.txt
116+
# pandas
117+
python-docx==0.8.11
118+
# via
119+
# -r requirements/base.txt
120+
# unstructured (setup.py)
121+
python-magic==0.4.27
122+
# via
123+
# -r requirements/base.txt
124+
# unstructured (setup.py)
125+
python-pptx==0.6.21
126+
# via
127+
# -r requirements/base.txt
128+
# unstructured (setup.py)
129+
pytz==2022.7.1
130+
# via
131+
# -r requirements/base.txt
132+
# pandas
133+
regex==2022.10.31
134+
# via
135+
# -r requirements/base.txt
136+
# nltk
137+
requests==2.28.2
138+
# via
139+
# -r requirements/base.txt
140+
# prawcore
141+
# unstructured (setup.py)
142+
# update-checker
143+
rfc3986[idna2008]==1.5.0
144+
# via
145+
# -r requirements/base.txt
146+
# httpx
147+
six==1.16.0
148+
# via
149+
# -r requirements/base.txt
150+
# python-dateutil
151+
sniffio==1.3.0
152+
# via
153+
# -r requirements/base.txt
154+
# anyio
155+
# httpcore
156+
# httpx
157+
tqdm==4.64.1
158+
# via
159+
# -r requirements/base.txt
160+
# argilla
161+
# nltk
162+
typing-extensions==4.4.0
163+
# via
164+
# -r requirements/base.txt
165+
# pydantic
166+
update-checker==0.18.0
167+
# via praw
168+
urllib3==1.26.14
169+
# via
170+
# -r requirements/base.txt
171+
# requests
172+
websocket-client==1.5.1
173+
# via praw
174+
wrapt==1.14.1
175+
# via
176+
# -r requirements/base.txt
177+
# argilla
178+
# deprecated
179+
xlsxwriter==3.0.8
180+
# via
181+
# -r requirements/base.txt
182+
# python-pptx

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@
7474
],
7575
"local-inference": ["unstructured-inference>=0.2.4"],
7676
"s3": ["boto3"],
77+
"reddit": ["praw"],
7778
},
7879
package_dir={"unstructured": "unstructured"},
7980
package_data={"unstructured": ["nlp/*.txt"]},

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.16-dev2" # pragma: no cover
1+
__version__ = "0.4.16-dev3" # pragma: no cover

unstructured/file_utils/filetype.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,6 @@ def detect_filetype(
126126
_, extension = os.path.splitext(filename)
127127
extension = extension.lower()
128128
if LIBMAGIC_AVAILABLE:
129-
mime_type = None
130129
mime_type = magic.from_file(filename, mime=True)
131130
else:
132131
return EXT_TO_FILETYPE.get(extension.lower(), FileType.UNK)
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import json
2+
import os
3+
from dataclasses import dataclass, field
4+
from pathlib import Path
5+
from typing import TYPE_CHECKING
6+
7+
from unstructured.ingest.interfaces import (
8+
BaseConnector,
9+
BaseConnectorConfig,
10+
BaseIngestDoc,
11+
)
12+
13+
if TYPE_CHECKING:
14+
from praw.models import Submission
15+
16+
17+
@dataclass
18+
class SimpleRedditConfig(BaseConnectorConfig):
19+
subreddit_name: str
20+
client_id: str
21+
client_secret: str
22+
user_agent: str
23+
search_query: str
24+
num_posts: int
25+
26+
# Standard Connector options
27+
download_dir: str
28+
# where to write structured data
29+
output_dir: str
30+
preserve_downloads: bool = False
31+
re_download: bool = False
32+
verbose: bool = False
33+
34+
def __post_init__(self):
35+
if self.num_posts <= 0:
36+
raise ValueError("The number of Reddit posts to fetch must be positive.")
37+
38+
39+
@dataclass
40+
class RedditIngestDoc(BaseIngestDoc):
41+
config: SimpleRedditConfig = field(repr=False)
42+
post: "Submission"
43+
44+
@property
45+
def filename(self) -> Path:
46+
return (Path(self.config.download_dir) / f"{self.post.id}.md").resolve()
47+
48+
def _output_filename(self):
49+
return Path(self.config.output_dir) / f"{self.post.id}.json"
50+
51+
def _create_full_tmp_dir_path(self):
52+
self.filename.parent.mkdir(parents=True, exist_ok=True)
53+
54+
def cleanup_file(self):
55+
"""Removes the local copy the file (or anything else) after successful processing."""
56+
if not self.config.preserve_downloads:
57+
if self.config.verbose:
58+
print(f"cleaning up {self}")
59+
os.unlink(self.filename)
60+
61+
def get_file(self):
62+
"""Fetches the "remote" doc and stores it locally on the filesystem."""
63+
self._create_full_tmp_dir_path()
64+
if not self.config.re_download and self.filename.is_file() and self.filename.stat():
65+
if self.config.verbose:
66+
print(f"File exists: {self.filename}, skipping download")
67+
return
68+
69+
if self.config.verbose:
70+
print(f"fetching {self} - PID: {os.getpid()}")
71+
# Write the title plus the body, if any
72+
text_to_write = f"# {self.post.title}\n{self.post.selftext}"
73+
with open(self.filename, "w", encoding="utf8") as f:
74+
f.write(text_to_write)
75+
76+
def has_output(self):
77+
"""Determine if structured output for this doc already exists."""
78+
output_filename = self._output_filename()
79+
return output_filename.is_file() and output_filename.stat()
80+
81+
def write_result(self):
82+
"""Write the structured json result for this doc. result must be json serializable."""
83+
output_filename = self._output_filename()
84+
output_filename.parent.mkdir(parents=True, exist_ok=True)
85+
with open(output_filename, "w", encoding="utf8") as output_f:
86+
json.dump(self.isd_elems_no_filename, output_f, ensure_ascii=False, indent=2)
87+
print(f"Wrote {output_filename}")
88+
89+
90+
class RedditConnector(BaseConnector):
91+
def __init__(self, config: SimpleRedditConfig):
92+
from praw import Reddit
93+
94+
self.config = config
95+
self.reddit = Reddit(
96+
client_id=config.client_id,
97+
client_secret=config.client_secret,
98+
user_agent=config.user_agent,
99+
)
100+
self.cleanup_files = not config.preserve_downloads
101+
102+
def cleanup(self, cur_dir=None):
103+
if not self.cleanup_files:
104+
return
105+
106+
if cur_dir is None:
107+
cur_dir = self.config.download_dir
108+
sub_dirs = os.listdir(cur_dir)
109+
os.chdir(cur_dir)
110+
for sub_dir in sub_dirs:
111+
# don't traverse symlinks, not that there every should be any
112+
if os.path.isdir(sub_dir) and not os.path.islink(sub_dir):
113+
self.cleanup(sub_dir)
114+
os.chdir("..")
115+
if len(os.listdir(cur_dir)) == 0:
116+
os.rmdir(cur_dir)
117+
118+
def initialize(self):
119+
pass
120+
121+
def get_ingest_docs(self):
122+
subreddit = self.reddit.subreddit(self.config.subreddit_name)
123+
if self.config.search_query:
124+
posts = subreddit.search(self.config.search_query, limit=self.config.num_posts)
125+
else:
126+
posts = subreddit.hot(limit=self.config.num_posts)
127+
return [RedditIngestDoc(self.config, post) for post in posts]

0 commit comments

Comments
 (0)