Skip to content

Commit 13439e2

Browse files
committed
Use pushshift when searching comments, decode percent encoded URLs in comments
1 parent c82dda6 commit 13439e2

File tree

4 files changed

+148
-75
lines changed

4 files changed

+148
-75
lines changed

core/parse.py

Lines changed: 75 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
import re
22
import praw.exceptions
33
from datetime import datetime
4-
4+
from core.pushshift import get_original_comment_from_psaw, get_comment_from_psaw
5+
import urllib.parse
6+
import praw.models
57
"""
68
Provides different methods to parse Reddit data
79
"""
@@ -130,12 +132,13 @@ def parse_comment(text):
130132
if matches:
131133
# Now check for a reddit link
132134
for i in matches:
133-
match = REPatterns.reddit_detect.findall(i[1])
135+
url = urllib.parse.unquote(i[1]).replace(" ", "")
136+
match = REPatterns.reddit_detect.findall(url)
134137
if match:
135-
return RedditURL(i[1].strip())
136-
match = REPatterns.short_reddit_detect.findall(i[1])
138+
return RedditURL(url.strip())
139+
match = REPatterns.short_reddit_detect.findall(url)
137140
if match:
138-
return RedditURL(i[1].strip())
141+
return RedditURL(url.strip())
139142
# Search for just general URLs in the comment
140143
matches = REPatterns.reddit_detect.findall(text)
141144
if matches:
@@ -204,22 +207,55 @@ def only_reddit_url(text):
204207
def find_roo_recursive(comment, starting_depth, depth):
    """Depth-first search of a comment's reply tree for a switcharoo link.

    Walks ``comment`` and its replies up to ``depth`` levels deep, looking
    for a body containing a link to another reddit comment. Deleted or
    removed bodies are recovered through PushShift before parsing.

    :param comment: praw Comment to start from
    :param starting_depth: the depth the top-level call began with; used to
        compute the ``context`` URL parameter
    :param depth: remaining levels to descend; 0 stops the recursion
    :return: RedditURL of the matching comment (with ``context`` set), or
        None if nothing matched within the depth limit
    """
    if not depth:
        return None
    if comment.body == "[deleted]" or comment.body == "[removed]":
        # Body is gone from reddit; try the PushShift archive instead.
        print("Comment was deleted")
        url = search_pushshift(comment)
    else:
        url = parse_comment(comment.body)
    if url.comment_id:
        # Found a roo link: point at this comment's permalink and record
        # how many levels down it was found.
        url = RedditURL(f"https://reddit.com{comment.permalink}")
        url.params['context'] = starting_depth - depth
        return url
    # No link here; descend into the replies.
    try:
        comment.refresh()
    except praw.exceptions.ClientException:
        # Comment could not be refreshed (e.g. it no longer exists);
        # abandon this branch.
        return None
    for reply in comment.replies:
        url = find_roo_recursive(reply, starting_depth, depth - 1)
        if url:
            return url
    return None
219230

220231

232+
def find_roo_parent_recursive(comment, starting_depth, depth):
    """Walk *up* the parent chain looking for a switcharoo link.

    Mirror of find_roo_recursive, but follows parents instead of replies.
    Stops when the submission (top of the chain) is reached or after
    ``depth`` hops. Returns a RedditURL for the matching comment, or None.
    """
    if not depth:
        return None
    # Reached the top of the thread without finding a roo.
    if isinstance(comment, praw.models.Submission):
        return None
    url = parse_comment(comment.body)
    if url.comment_id:
        return RedditURL(f"https://reddit.com{comment.permalink}")
    try:
        comment.refresh()
    except praw.exceptions.ClientException:
        # Comment could not be refreshed; abandon the climb.
        return None
    found = find_roo_parent_recursive(comment.parent(), starting_depth, depth - 1)
    if found:
        return found
    return None
251+
221252
def find_roo_comment(comment):
    """Locate the switcharoo link associated with ``comment``.

    Searches downward through the replies first, then, failing that,
    upward through the parent chain. Returns a RedditURL or None.
    """
    for finder, span in ((find_roo_recursive, 4), (find_roo_parent_recursive, 3)):
        roo = finder(comment, span, span)
        if roo:
            return roo
    return None
223259

224260

225261
# If we have only responded to this in the past, then pretend we already have it in FixRequests
@@ -241,5 +277,36 @@ def has_responded_to_post(submission):
241277
return response
242278

243279

280+
def search_pushshift(comment, last_url=None):
    """Recover a deleted/removed comment's body via PushShift and parse it.

    Fetches both the edited and the original archived copies of the
    comment, parses each for a reddit URL, and returns the resulting
    RedditURL. When the two copies disagree, the operator is asked
    interactively which one to keep. Returns an empty RedditURL when
    PushShift has no record of the comment.
    """
    if not last_url:
        last_url = RedditURL(f"https://reddit.com{comment.permalink}")
    print("Searching PushShift for", last_url.comment_id)
    # psaw leaves a little to be desired in default functionality
    edited = get_comment_from_psaw(comment.parent_id[3:], last_url.comment_id)
    if edited:
        edited = parse_comment(edited['body'])
    original = get_original_comment_from_psaw(last_url.comment_id)
    if original:
        original = parse_comment(original['body'])
    if edited and original:
        if edited == original:
            return edited
        # The archive holds two differing versions; let the operator pick.
        print("Two versions of comment, which one to use? (1/2)")
        print(original.to_link(comment._reddit), edited.to_link(comment._reddit))
        option = input()
        return original if option == "1" else edited
    if edited:
        return edited
    if original:
        return original
    return RedditURL("")
309+
310+
244311
if __name__ == '__main__':
245312
print(parse_comment("""[](/kderpymeow-90-f) Ah, the ol' reddit [third-world-aroo!](https://old.reddit.com/r/interestingasfuck/comments/a9xt2g/you_may_be_cool_but_you_will_never_be_as_cool_as/ecnjvlf/?context=2)"""))

core/pushshift.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import requests
2+
import urllib.parse
3+
4+
5+
# Weird other way to get the data but it returns the edited version?
6+
# Weird other way to get the data but it returns the edited version?
def get_comment_from_psaw(parent_id, comment_id):
    """Fetch the (possibly edited) archived copy of a comment from PushShift.

    Searches PushShift for all children of ``parent_id`` (a bare id without
    the "t1_" prefix) and returns the result dict whose id matches
    ``comment_id``.

    :param parent_id: base-36 id of the comment's parent, prefix stripped
    :param comment_id: base-36 id of the comment to find
    :return: the matching result dict, or None if PushShift has no record
    """
    params = {'parent_id': f"t1_{parent_id}", "filter": "id,created_utc,edited,body"}
    # Come on PushShift, percent coding is a standard
    payload_str = urllib.parse.urlencode(params, safe=",")
    r = requests.get("https://api.pushshift.io/reddit/comment/search/",
                     params=payload_str)
    try:
        j = r.json()
    except Exception:
        # Dump the raw response for debugging, then re-raise with the
        # original traceback intact.
        print(r.status_code, r.text)
        raise
    # Error payloads may lack 'data'; treat that the same as an empty
    # result set (as get_original_comment_from_psaw already does).
    for i in j.get('data', []):
        if i['id'] == comment_id:
            return i
    return None
21+
22+
23+
def get_original_comment_from_psaw(comment_id):
    """Fetch the original (pre-edit) archived copy of a comment from PushShift.

    :param comment_id: base-36 id of the comment to look up
    :return: the first matching result dict, or None if PushShift has no
        record of the comment
    """
    params = {'ids': comment_id, "filter": "id,created_utc,body"}
    # Come on PushShift, percent coding is a standard
    payload_str = urllib.parse.urlencode(params, safe=",")
    r = requests.get("https://api.pushshift.io/reddit/comment/search/",
                     params=payload_str)
    try:
        j = r.json()
    except Exception:
        # Same debugging aid as get_comment_from_psaw: show the raw
        # response before propagating the parse error.
        print(r.status_code, r.text)
        raise
    data = j.get('data')
    if data:
        return data[0]
    return None
34+

tools/db.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,11 @@ def roo_id_to_submission(id):
2727
print(f"https://reddit.com{roo.submission.permalink}")
2828

2929

30+
def roo_id_to_comment(id):
    """Print the reddit permalink of the comment for the roo with this id."""
    roo = last_switcharoo.get_roo(id)
    permalink = roo.comment.permalink
    print(f"https://reddit.com{permalink}")
33+
34+
3035
def roo_id_to_issues(id):
3136
roo = last_switcharoo.get_roo(id)
3237
issues = last_switcharoo.get_issues(roo)

tracer.py

Lines changed: 34 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
1-
import requests
2-
import urllib.parse
31
import psaw
42
import praw.exceptions
53
import time
64
import pendulum
5+
import webbrowser
76
from datetime import datetime, timedelta
87

98
import prawcore.exceptions
@@ -13,6 +12,7 @@
1312
from core import parse
1413
from core.history import SwitcharooLog
1514
from core.arguments import tracer as argparser
15+
from core.pushshift import get_comment_from_psaw, get_original_comment_from_psaw
1616

1717
credentials = CredentialsLoader.get_credentials()['reddit']
1818

@@ -30,6 +30,7 @@
3030

3131
args = argparser.parse_args()
3232

33+
3334
def get_newest_id(subreddit, index=0):
3435
"""Retrieves the newest post's id. Used for starting the last switcharoo history trackers"""
3536
return [i for i in subreddit.new(params={"limit": "1"})][index].url
@@ -55,33 +56,6 @@ def get_newest_id(subreddit, index=0):
5556
print("SwitcharooHelper Tracer v{} Ctrl+C to stop".format(consts.version))
5657

5758

58-
# Weird other way to get the data but it returns the edited version?
59-
def get_comment_from_psaw(parent_id, comment_id):
60-
params = {'parent_id': f"t1_{parent_id}", "filter": "id,created_utc,edited,body"}
61-
# Come on PushShift, percent coding is a standard
62-
payload_str = urllib.parse.urlencode(params, safe=",")
63-
r = requests.get("https://api.pushshift.io/reddit/comment/search/",
64-
params=payload_str)
65-
j = r.json()
66-
for i in j['data']:
67-
if i['id'] == comment_id:
68-
return i
69-
return None
70-
71-
72-
def get_original_comment_from_psaw(comment_id):
73-
params = {'ids': comment_id, "filter": "id,created_utc,body"}
74-
# Come on PushShift, percent coding is a standard
75-
payload_str = urllib.parse.urlencode(params, safe=",")
76-
r = requests.get("https://api.pushshift.io/reddit/comment/search/",
77-
params=payload_str)
78-
j = r.json()
79-
if j.get('data', None):
80-
if len(j['data']) > 0:
81-
return j['data'][0]
82-
return None
83-
84-
8559
def unable_to_find_link(url: parse.RedditURL, last_url: parse.RedditURL):
8660
print("Unable to find a link in this roo.")
8761
print(last_url.to_link(reddit))
@@ -100,35 +74,6 @@ def unable_to_find_link(url: parse.RedditURL, last_url: parse.RedditURL):
10074
return parse.RedditURL(url)
10175

10276

103-
def search_pushshift(last_url):
104-
print("Searching PushShift for", last_url.comment_id)
105-
# psaw leaves a little to be desired in default functionality
106-
ps_comment = get_comment_from_psaw(comment.parent_id[3:], last_url.comment_id)
107-
if ps_comment:
108-
ps_comment = parse.parse_comment(ps_comment['body'])
109-
pso_comment = get_original_comment_from_psaw(last_url.comment_id)
110-
if pso_comment:
111-
pso_comment = parse.parse_comment(pso_comment['body'])
112-
if ps_comment and pso_comment:
113-
if ps_comment == pso_comment:
114-
url = ps_comment
115-
else:
116-
print("Two versions of comment, which one to use? (1/2)")
117-
print(pso_comment.to_link(reddit), ps_comment.to_link(reddit))
118-
option = input()
119-
if option == "1":
120-
url = pso_comment
121-
else:
122-
url = ps_comment
123-
elif ps_comment:
124-
url = ps_comment
125-
elif pso_comment:
126-
url = pso_comment
127-
else:
128-
url = parse.RedditURL("")
129-
return url
130-
131-
13277
def add_comment(url: parse.RedditURL, start_url: parse.RedditURL = None):
13378
# Double check it's not already there
13479
q = log.search(comment_id=url.comment_id)
@@ -142,7 +87,13 @@ def add_comment(url: parse.RedditURL, start_url: parse.RedditURL = None):
14287
if q:
14388
print("Adjusting roo time")
14489
comment_time = q.time - timedelta(seconds=1)
145-
log.add_comment(url.thread_id, url.comment_id, url.params.get("context", 0), comment_time)
90+
try:
91+
context = int(url.params.get("context", 0))
92+
except ValueError:
93+
print(f"Got {url.params['context']} for url {url}, what should it be?")
94+
context = int(input())
95+
96+
log.add_comment(url.thread_id, url.comment_id, context, comment_time)
14697

14798

14899

@@ -171,9 +122,9 @@ def add_comment(url: parse.RedditURL, start_url: parse.RedditURL = None):
171122
roo_count += 1
172123

173124
last_url = url
174-
if comment.body == "[deleted]":
125+
if comment.body == "[deleted]" or comment.body == "[removed]":
175126
print("Comment was deleted")
176-
url = search_pushshift(last_url)
127+
url = parse.search_pushshift(comment, last_url)
177128
else:
178129
url = parse.parse_comment(comment.body)
179130

@@ -182,15 +133,31 @@ def add_comment(url: parse.RedditURL, start_url: parse.RedditURL = None):
182133
print("Roo linked incorrectly, searching thread for link")
183134
new_last_url = parse.find_roo_comment(comment)
184135
if new_last_url and last_url:
185-
new_last_url.params['context'] = str(int(new_last_url.params.get('context', 0)) +
186-
int(last_url.params.get('context', 0)))
136+
try:
137+
new_last_url.params['context'] = str(int(new_last_url.params.get('context', 0)) +
138+
int(last_url.params.get('context', 0)))
139+
except ValueError:
140+
print(f"Got {new_last_url.params['context']} and {last_url.params['context']}, what should it be?")
141+
new_last_url.params['context'] = int(input())
187142
if new_last_url:
188-
print(last_url.to_link(reddit), "should actually be", new_last_url.to_link(reddit))
189-
last_url = new_last_url
143+
if args.discover:
144+
print("Should", last_url.to_link(reddit), "actually be", new_last_url.to_link(reddit), "?")
145+
print("(y/n)")
146+
webbrowser.open(last_url.to_link(reddit))
147+
webbrowser.open(new_last_url.to_link(reddit))
148+
option = input()
149+
if option == "y":
150+
last_url = new_last_url
151+
else:
152+
last_url = new_last_url
190153
comment = reddit.comment(last_url.comment_id)
191-
url = parse.parse_comment(comment.body)
154+
if comment.body == "[deleted]" or comment.body == "[removed]":
155+
print("Comment was deleted")
156+
url = parse.search_pushshift(comment, last_url)
157+
else:
158+
url = parse.parse_comment(comment.body)
192159
else:
193-
url = search_pushshift(last_url)
160+
url = parse.search_pushshift(comment, last_url)
194161

195162
if args.discover:
196163
add_comment(last_url, start_url=start_url)

0 commit comments

Comments
 (0)