-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathngramize_prs.py
More file actions
149 lines (101 loc) · 5.26 KB
/
ngramize_prs.py
File metadata and controls
149 lines (101 loc) · 5.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 18 12:41:03 2018
@author: natha
"""
# Process all pull requests in the dataset and run them through the ngramizer.
# After the bigrams are generated, they are stored in a MongoDB instance.
import analysis.ngramizer as ngramizer
from pymongo import MongoClient
from collections import Counter
def _collect_usernames(prs):
    """Return the distinct usernames found in the given pull requests.

    Usernames come from two places: the login of each PR / comment author
    and whatever ``ngramizer.given_text_extract_usernames`` extracts from
    the PR body and comment bodies.

    Args:
        prs: list of pull request documents, each with ``body``, ``user``,
            ``raw_comments`` and ``review_comments`` keys.

    Returns:
        A list of unique usernames (order is unspecified).
    """
    usernames = set()
    for pr in prs:
        usernames.update(ngramizer.given_text_extract_usernames(pr["body"]))
        usernames.add(pr["user"]["login"])

        for comment in pr["raw_comments"]:
            usernames.update(
                ngramizer.given_text_extract_usernames(comment["body"]))
            # NOTE(review): unlike review comments below, a missing user is
            # not guarded against here -- confirm raw comments always have one.
            usernames.add(comment["user"]["login"])

        for comment in pr["review_comments"]:
            usernames.update(
                ngramizer.given_text_extract_usernames(comment["body"]))
            # Review comments may have no user attached.
            if comment["user"] is not None:
                usernames.add(comment["user"]["login"])
    return list(usernames)


def _bigram_records(text, url, ngram_length, usernames):
    """Count the n-grams of *text* and return them as storable records.

    Args:
        text: the body text handed to the ngramizer.
        url: URL of the PR / comment the text belongs to ("" when unknown).
        ngram_length: n-gram length passed to the ngramizer.
        usernames: project usernames passed to the ngramizer.

    Returns:
        A list of ``{'bigram': [first, second], 'occurrence': count}`` dicts,
        one per distinct n-gram the ngramizer added to the counter.
    """
    counter = Counter()
    # The fifth argument is a fresh, throw-away dict on every call; its
    # meaning is defined by ngramizer.add_text_ngrams_to_counter.
    ngramizer.add_text_ngrams_to_counter(
        text, url, ngram_length, counter, {}, usernames)
    return [
        {'bigram': [ngram[0], ngram[1]], 'occurrence': count}
        for ngram, count in counter.items()
    ]


def ngramize_project_prs(project):
    """Compute and store bigrams for every pull request of one project.

    Loads the project's pull requests from the ``pull_requests`` collection
    of the ``graduation`` database, gathers all usernames seen in them,
    attaches a ``bigrams`` list to each PR, to each raw comment and to each
    review comment that has a user, and writes every PR back with
    ``replace_one``.

    Args:
        project: project document with a ``full_name`` of the form
            ``"owner/name"``. It is mutated: ``project["usernames"]`` is set
            to the list of usernames found in the project's pull requests.

    Returns:
        None.
    """
    ngram_length = 2

    # Each call owns its client and closes it when done, so worker processes
    # that handle many projects do not accumulate open connections.
    mongo_client = MongoClient()
    try:
        pull_requests_collection = mongo_client["graduation"]["pull_requests"]

        print("Doing {}".format(project["full_name"]))

        name_parts = project["full_name"].split("/")
        prs = list(pull_requests_collection.find(
            {
                'project_name': name_parts[1],
                'project_owner': name_parts[0],
            }))

        project["usernames"] = _collect_usernames(prs)
        usernames = project["usernames"]

        # NOTE: bot filtering (ngramizer.is_bot_comment) is currently
        # disabled for PRs, raw comments and review comments alike.
        for pr in prs:
            pr["bigrams"] = _bigram_records(
                pr["body"], pr.get("html_url", ""), ngram_length, usernames)

            for comment in pr["raw_comments"]:
                comment["bigrams"] = _bigram_records(
                    comment["body"], comment.get("html_url", ""),
                    ngram_length, usernames)

            for comment in pr["review_comments"]:
                # Review comments without a user are left without bigrams.
                if comment["user"] is not None:
                    # Review comments are identified by "url", not "html_url".
                    comment["bigrams"] = _bigram_records(
                        comment["body"], comment.get("url", ""),
                        ngram_length, usernames)

            # Persist the enriched PR document in place of the stored one.
            pull_requests_collection.replace_one({"_id": pr["_id"]}, pr)
    finally:
        mongo_client.close()
# Process the selected projects in parallel with a pool of 8 worker
# processes (multiprocessing uses processes, not threads).
import multiprocessing

# The __main__ guard is required for multiprocessing: with the "spawn" start
# method every worker re-imports this module, and without the guard each
# import would try to query the database and create a pool of its own.
if __name__ == "__main__":
    mongo_client = MongoClient()
    try:
        database = mongo_client["graduation"]
        projects_collection = database["projects"]
        # Only projects flagged as succeeded and with Travis as oldest CI.
        projects = list(projects_collection.find(
            {'succeeded': True, 'travis_is_oldest_ci': True}))
    finally:
        # Close the parent's client before the workers are started; each
        # worker opens its own client inside ngramize_project_prs.
        mongo_client.close()

    with multiprocessing.Pool(8) as p:
        p.map(ngramize_project_prs, projects)