-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfind_effective_comments.py
More file actions
116 lines (76 loc) · 3.24 KB
/
find_effective_comments.py
File metadata and controls
116 lines (76 loc) · 3.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
'''
Uses the method proposed by Bosu et al. to find effective comments, and tag
them as such. Due to the complexity of this calculation we run this once and
store the results, instead of computing it on the fly.
'''
from json import JSONEncoder
from time import sleep
import github
import gh_tokens
import datetime
import traceback
from math import ceil
from pymongo import MongoClient
import queue
from analysis.effective_comments.find_effective import process_pr as find_effective
'''
Take a chunk of pull_requests, and find the effective comments in those pull
requests
'''
def process_pr_chunk(chunk):
    """Tag the effective review comments for every PR in one chunk.

    Each worker process opens its own Mongo connection (MongoClient is not
    fork-safe to share), then delegates per-PR work to ``process_pr``.
    """
    print("Starting processing a chunk of size {}".format(len(chunk)))
    client = MongoClient()
    graduation_db = client["graduation"]
    commits = graduation_db["commits"]
    pull_requests = graduation_db["pull_requests"]
    for pull_request in chunk:
        # A PR with no commits cannot have effective comments; skip it.
        if len(pull_request["commits"]) == 0:
            continue
        process_pr(pull_request, commits, pull_requests)
def process_pr(pr, commits_collection, pull_requests_collection):
    """Mark each review comment of one pull request as effective or not.

    Loads the full PR document (minus the heavyweight bigram/raw fields),
    inlines the full commit documents that ``find_effective`` needs, flags
    the matching review comments, and persists the updated comment list.

    Any failure is caught and logged so one bad PR cannot kill the worker.
    """
    try:
        full_pr = pull_requests_collection.find_one(
            {'_id': pr["_id"]},
            {'bigrams': 0, 'raw_comments': 0, 'review_comments.bigrams': 0})
        # The stored PR only holds commit SHAs; replace them with the full
        # commit documents, which find_effective expects.
        full_pr["commits"] = [commits_collection.find_one({'sha': sha})
                              for sha in full_pr["commits"]]
        effective_comments = find_effective(full_pr)
        for comment in full_pr["review_comments"]:
            comment["is_effective"] = False
        # Index comments by URL once instead of scanning the list per match;
        # a missing URL is skipped rather than aborting the whole PR (the old
        # [...][0] lookup raised IndexError and dropped the entire update).
        comments_by_url = {c["url"]: c for c in full_pr["review_comments"]}
        for effective in effective_comments:
            match = comments_by_url.get(effective[0]["url"])
            if match is not None:
                match["is_effective"] = True
        # Collection.update() is deprecated (removed in PyMongo 4);
        # update_one is the equivalent single-document call.
        pull_requests_collection.update_one(
            {'_id': full_pr["_id"]},
            {'$set': {'review_comments': full_pr["review_comments"]}})
    except Exception as e:
        # format_tb returns the traceback as text; the old print_tb call
        # printed it out-of-band and returned None, so the logged message
        # always ended in "None".
        print("Failed PR {}/{}:{} with {}\n{}".format(
            pr["project_owner"], pr["project_name"], pr["number"],
            e, "".join(traceback.format_tb(e.__traceback__))
        ))
def chunkIt(seq, num):
avg = len(seq) / float(num)
out = []
last = 0.0
while last < len(seq):
out.append(seq[int(last):int(last + avg)])
last += avg
return out
import multiprocessing


def _load_pr_list():
    """Return minimal PR documents for every qualifying project.

    Only projects that were mined successfully and whose oldest CI service
    is Travis are considered; each PR carries just the fields the workers
    need to re-fetch the full document themselves.
    """
    mongo_client = MongoClient()
    database = mongo_client["graduation"]
    pull_requests_collection = database["pull_requests"]
    projects_collection = database["projects"]
    pr_list = []
    query = {'succeeded': True, 'travis_is_oldest_ci': True}
    for proj in projects_collection.find(query):
        owner, name = proj["full_name"].split("/", 1)
        pr_list.extend(pull_requests_collection.find(
            {'project_name': name, 'project_owner': owner},
            {'project_owner': 1, 'project_name': 1, 'commits': 1, 'number': 1}))
    return pr_list


def main(num_workers=8):
    """Chunk the PR list and fan the chunks out to a process pool."""
    todo_prs = chunkIt(_load_pr_list(), num_workers)
    print("Loaded and chunked the commit list")
    print("Starting to split of threads \n")
    # No partial() wrapper needed: process_pr_chunk takes exactly one
    # argument, which map() supplies.
    with multiprocessing.Pool(num_workers) as p:
        p.map(process_pr_chunk, todo_prs)


# The __main__ guard is required for multiprocessing correctness: under the
# spawn start method each worker re-imports this module, and without the
# guard the top-level code would recursively spawn new pools.
if __name__ == "__main__":
    main()