-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathextract_submissions_via_search.py
More file actions
105 lines (87 loc) · 3.91 KB
/
extract_submissions_via_search.py
File metadata and controls
105 lines (87 loc) · 3.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#Import modules
#--------------------------------------
import praw #to access Reddit
import requests #Might not need this
from praw.models import MoreComments #access Reddit comments
import sys #for .translate bmp code
import regex #regex matching
import datetime #converting timestamps
import redcreds as creds #Reddit credentials
# Assign Reddit credentials
# ---------------------------------------
# All credential values are read from the local `redcreds` module (imported as `creds`).
r = praw.Reddit(username=creds.username,
                password=creds.password,
                client_id=creds.client_id,
                client_secret=creds.client_secret,
                user_agent=creds.user_agent)
# NOTE(review): PRAW appears to authenticate lazily, so this message most likely
# only confirms that the client object was built, not that Reddit accepted the
# credentials -- confirm against the PRAW documentation.
print("Credentials have been accepted")

# Assign subreddit
# ------------------------------
print("What subreddit are we mining through today?")
# Strip stray whitespace so an accidental trailing space does not end up in the
# subreddit name.
chosen_sub = input().strip()
subreddit = r.subreddit(chosen_sub)
print("Thanks")

# Containers shared with the rest of the script:
#   subs        -- submission ids, in the order the search returned them
#   subCount    -- number of ids collected
#   sub_entries -- submission id -> extracted data (filled in later)
subs = []
subCount = 0
sub_entries = {}
print("Placeholders have been assigned")

# Collect submission ids
# ------------------------------
# sort        - one of: relevance, hot, top, new, comments (default: relevance).
# time_filter - one of: all, day, hour, month, week, year (default: all).
print("And what are we searching for in this subreddit?")
sub_query = input()

# For most endpoints a request returns 100 items; limit=None asks for as many
# results as can be retrieved.
first_found = None
last_found = None
for submission in subreddit.search(sub_query, sort='new', time_filter='week', limit=None):
    subs.append(submission.id)
    # Remember the first and last listing items so the preview below does not
    # have to fetch them again by id.
    if first_found is None:
        first_found = submission
    last_found = submission
subCount = len(subs)

# Preview the collected ids
print(str(subCount) + " submissions have added to list")
if subs:
    # fromtimestamp() renders the epoch value in this machine's local timezone.
    print("1st entry is:")
    print(first_found.title + " created: "
          + str(datetime.datetime.fromtimestamp(first_found.created_utc)))
    print("Last entry is:")
    print(last_found.title + " created on: "
          + str(datetime.datetime.fromtimestamp(last_found.created_utc)))
else:
    # An empty result previously crashed with IndexError on subs[0].
    print("No submissions matched the search; nothing to preview")
# Build / extract key submission data
def collectSubData(submission):
    """Fetch one submission by id and record its key fields in ``sub_entries``.

    Args:
        submission: Reddit submission id, as stored in the ``subs`` list.

    Returns:
        The one-element list stored under ``sub_entries[<post id>]``. Its only
        item is the tuple ``(id, title, url, author, score, created,
        upvote ratio, top-level comment count, loaded comment count,
        flair text)``, in the same order as the CSV header used when saving.
    """
    post = r.submission(id=submission)  # access the post by its submission id

    # Drop unexpanded "load more comments" placeholders so they are not counted
    # as comments; limit=0 removes them instead of expanding them.
    post.comments.replace_more(limit=0)

    row = (
        post.id,
        post.title,
        post.url,
        post.author,  # NOTE(review): presumably None for deleted accounts -- confirm
        post.score,
        # Epoch seconds rendered in this machine's local timezone.
        datetime.datetime.fromtimestamp(post.created_utc),
        post.upvote_ratio,
        len(post.comments),         # top-level comments
        len(post.comments.list()),  # every loaded comment, flattened
        # Flair text; `post.flair` is a helper object rather than the label.
        post.link_flair_text,
    )

    # Kept as a one-element list because the CSV writer reads entry[0].
    subData = [row]
    sub_entries[post.id] = subData
    return subData
# Run submission data extraction
# ---------------------------------------
# Walk the collected ids in order and let collectSubData() fill sub_entries.
print("Shall begin collecting data on Submissions...")
for submission_id in subs:
    collectSubData(submission_id)
print("Submissions have been collected")

# Report how many distinct submissions ended up in the dictionary.
entry_total = len(sub_entries)
print(str(entry_total) + " entries have been added to the dictionary")
# Save submission data in file
# ---------------------------------------------
def updateSubs_file(location=""):
    """Write every entry of ``sub_entries`` to a CSV file named by the user.

    The file name is read from standard input; the file is created (or
    overwritten) inside ``location``.

    Args:
        location: Folder to store the file in. The default empty string
            resolves the file name against the current working directory.
    """
    import csv
    import os

    print("input filename of submission file, please add .csv")  # don't forget to assign filetype
    filename = input()
    # os.path.join supplies the separator, so `location` works with or without
    # a trailing slash.
    target_path = os.path.join(location, filename)

    headers = ["Post ID", "Title", "Url", "Author", "Score", "Publish Date",
               "Upvote Ratio", "Total No. of Top Comments",
               "Total No. of Comments", "Flair"]

    upload_count = 0
    # newline='' is what the csv module expects; explicit UTF-8 avoids encoding
    # errors on titles with non-ASCII characters.
    with open(target_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(headers)
        for entry in sub_entries.values():
            # Each entry is a one-element list holding the row tuple.
            writer.writerow(entry[0])
            upload_count += 1
    print(str(upload_count) + " submissions have been uploaded")


updateSubs_file()