forked from pakagronglb/reddit-scraper
main.py
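"""Streamlit app for scraping Reddit via PRAW: fetch the top posts of a
subreddit, or a single post with all of its comments, and download the
results as CSV."""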
import praw
import pandas as pd
import os
from dotenv import load_dotenv
import streamlit as st

# Load environment variables only in development
if os.path.exists('.env'):
    load_dotenv()


# Initialize Reddit API
def init_reddit():
    # Try getting from environment first (for production),
    client_id = os.environ.get("REDDIT_CLIENT_ID") or st.secrets["REDDIT_CLIENT_ID"]
    client_secret = os.environ.get("REDDIT_CLIENT_SECRET") or st.secrets["REDDIT_CLIENT_SECRET"]
    user_agent = os.environ.get("REDDIT_USER_AGENT") or st.secrets["REDDIT_USER_AGENT"]
    return praw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=user_agent
    )


@st.cache_data(ttl=3600)  # Cache data for 1 hour
def get_subreddit_posts(_reddit, subreddit_name, post_limit, time_filter="month"):
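    # The leading underscore on _reddit tells st.cache_data not to hash this
    # argument (the PRAW client isn't hashable); the remaining parameters
    # form the cache key.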
    try:
        subreddit = _reddit.subreddit(subreddit_name)
        posts_dict = {
            "Title": [],
            "Post Text": [],
            "ID": [],
            "Score": [],
            "Total Comments": [],
            "Post URL": []
        }

        for post in subreddit.top(limit=post_limit, time_filter=time_filter):
            posts_dict["Title"].append(post.title)
            posts_dict["Post Text"].append(post.selftext)
            posts_dict["ID"].append(post.id)
            posts_dict["Score"].append(post.score)
            posts_dict["Total Comments"].append(post.num_comments)
            posts_dict["Post URL"].append(post.url)

        return pd.DataFrame(posts_dict)
    except Exception as e:
        st.error(f"Error fetching subreddit posts: {str(e)}")
        return pd.DataFrame()


@st.cache_data(ttl=3600)
def get_post_by_url(_reddit, url):
    try:
        submission = _reddit.submission(url=url)
        post_data = {
            "Title": submission.title,
            "Post Text": submission.selftext,
            "ID": submission.id,
            "Score": submission.score,
            "Total Comments": submission.num_comments,
            "Post URL": submission.url
        }

        post_comments = []
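        # limit=None resolves every "load more comments" stub so the full
        # comment tree is fetched; on large threads this can mean many API
        # calls and a long wait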
        submission.comments.replace_more(limit=None)
        for comment in submission.comments.list():
            comment_data = {
                "Comment Text": comment.body,
                "Score": comment.score,
                "Author": str(comment.author),
                "Created UTC": comment.created_utc
            }
            post_comments.append(comment_data)

        return pd.DataFrame([post_data]), pd.DataFrame(post_comments)
    except Exception as e:
        st.error(f"Error fetching post: {str(e)}")
        return pd.DataFrame(), pd.DataFrame()


def main():
    st.title("Reddit Data Scraper")

    # Initialize Reddit instance
    reddit = init_reddit()

    # Sidebar for input parameters
    st.sidebar.header("Settings")
    scrape_option = st.sidebar.radio(
        "Choose scraping option:",
        ["Subreddit Posts", "Specific Post by URL"]
    )

    if scrape_option == "Subreddit Posts":
        st.header("Subreddit Posts Scraper")

        # Input fields
        subreddit_name = st.text_input("Enter subreddit name:", "selfhosted")
        post_limit = st.slider("Number of posts to scrape:", 1, 100, 10)
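        # slider positional args above: min=1, max=100, default=10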
        time_filter = st.selectbox(
            "Time filter:",
            ["day", "week", "month", "year", "all"]
        )

        if st.button("Scrape Subreddit"):
            with st.spinner("Scraping posts..."):
                try:
                    df = get_subreddit_posts(reddit, subreddit_name, post_limit, time_filter)
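                    # Note: get_subreddit_posts returns an empty DataFrame on
                    # fetch errors, so this success message can report 0 posts
                    # beneath the error banner shown by the cached call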
                    st.success(f"Successfully scraped {len(df)} posts!")

                    # Display the data
                    st.dataframe(df)

                    # Download button
                    csv = df.to_csv(index=False).encode('utf-8')
                    st.download_button(
                        "Download CSV",
                        csv,
                        f"{subreddit_name}_posts.csv",
                        "text/csv",
                        key='download-csv'
                    )
                except Exception as e:
                    st.error(f"An error occurred: {str(e)}")

    else:
        st.header("Post URL Scraper")

        post_url = st.text_input("Enter Reddit post URL:")

        if st.button("Scrape Post"):
            with st.spinner("Scraping post and comments..."):
                try:
                    post_df, comments_df = get_post_by_url(reddit, post_url)

                    st.subheader("Post Details")
                    st.dataframe(post_df)

                    st.subheader(f"Comments ({len(comments_df)} total)")
                    st.dataframe(comments_df)

                    # Download buttons
                    post_csv = post_df.to_csv(index=False).encode('utf-8')
                    comments_csv = comments_df.to_csv(index=False).encode('utf-8')

                    col1, col2 = st.columns(2)
                    with col1:
                        st.download_button(
                            "Download Post CSV",
                            post_csv,
                            "post_details.csv",
                            "text/csv",
                            key='download-post'
                        )
                    with col2:
                        st.download_button(
                            "Download Comments CSV",
                            comments_csv,
                            "comments.csv",
                            "text/csv",
                            key='download-comments'
                        )
                except Exception as e:
                    st.error(f"An error occurred: {str(e)}")


if __name__ == "__main__":
    main()