-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfind_effective_comments.py
More file actions
116 lines (76 loc) · 3.24 KB
/
find_effective_comments.py
File metadata and controls
116 lines (76 loc) · 3.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
'''
Uses the method proposed by Bosu et al. to find effective comments, and tag
them as such. Due to the complexity of this calculation we run this once and
store the results, instead of computing it on the fly.
'''
from json import JSONEncoder
from time import sleep
import github
import gh_tokens
import datetime
import traceback
from math import ceil
from pymongo import MongoClient
import queue
from analysis.effective_comments.find_effective import process_pr as find_effective
'''
Take a chunk of pull_requests, and find the effective comments in those pull
requests
'''
def process_pr_chunk(chunk):
    """Tag the effective review comments for every PR in one chunk.

    Each worker process opens its own Mongo connection (MongoClient is not
    fork-safe to share), then delegates per-PR work to ``process_pr``.
    """
    print("Starting processing a chunk of size {}".format(len(chunk)))
    client = MongoClient()
    graduation_db = client["graduation"]
    commits = graduation_db["commits"]
    pull_requests = graduation_db["pull_requests"]
    for pull_request in chunk:
        # A PR with no commits cannot have effective comments; skip it.
        if len(pull_request["commits"]) == 0:
            continue
        process_pr(pull_request, commits, pull_requests)
def process_pr(pr, commits_collection, pull_requests_collection):
    """Mark each review comment of one pull request as effective or not.

    Loads the full PR document (minus the heavyweight bigram/raw fields),
    inlines the full commit documents that ``find_effective`` needs, flags
    the matching review comments, and persists the updated comment list.

    Any failure is caught and logged so one bad PR cannot kill the worker.
    """
    try:
        full_pr = pull_requests_collection.find_one(
            {'_id': pr["_id"]},
            {'bigrams': 0, 'raw_comments': 0, 'review_comments.bigrams': 0})
        # The stored PR only holds commit SHAs; replace them with the full
        # commit documents, which find_effective expects.
        full_pr["commits"] = [commits_collection.find_one({'sha': sha})
                              for sha in full_pr["commits"]]
        effective_comments = find_effective(full_pr)
        for comment in full_pr["review_comments"]:
            comment["is_effective"] = False
        # Index comments by URL once instead of scanning the list per match;
        # a missing URL is skipped rather than aborting the whole PR (the old
        # [...][0] lookup raised IndexError and dropped the entire update).
        comments_by_url = {c["url"]: c for c in full_pr["review_comments"]}
        for effective in effective_comments:
            match = comments_by_url.get(effective[0]["url"])
            if match is not None:
                match["is_effective"] = True
        # Collection.update() is deprecated (removed in PyMongo 4);
        # update_one is the equivalent single-document call.
        pull_requests_collection.update_one(
            {'_id': full_pr["_id"]},
            {'$set': {'review_comments': full_pr["review_comments"]}})
    except Exception as e:
        # format_tb returns the traceback as text; the old print_tb call
        # printed it out-of-band and returned None, so the logged message
        # always ended in "None".
        print("Failed PR {}/{}:{} with {}\n{}".format(
            pr["project_owner"], pr["project_name"], pr["number"],
            e, "".join(traceback.format_tb(e.__traceback__))
        ))
def chunkIt(seq, num):
avg = len(seq) / float(num)
out = []
last = 0.0
while last < len(seq):
out.append(seq[int(last):int(last + avg)])
last += avg
return out
import multiprocessing


def _load_pr_list():
    """Return minimal PR documents for every qualifying project.

    Only projects that were mined successfully and whose oldest CI service
    is Travis are considered; each PR carries just the fields the workers
    need to re-fetch the full document themselves.
    """
    mongo_client = MongoClient()
    database = mongo_client["graduation"]
    pull_requests_collection = database["pull_requests"]
    projects_collection = database["projects"]
    pr_list = []
    query = {'succeeded': True, 'travis_is_oldest_ci': True}
    for proj in projects_collection.find(query):
        owner, name = proj["full_name"].split("/", 1)
        pr_list.extend(pull_requests_collection.find(
            {'project_name': name, 'project_owner': owner},
            {'project_owner': 1, 'project_name': 1, 'commits': 1, 'number': 1}))
    return pr_list


def main(num_workers=8):
    """Chunk the PR list and fan the chunks out to a process pool."""
    todo_prs = chunkIt(_load_pr_list(), num_workers)
    print("Loaded and chunked the commit list")
    print("Starting to split of threads \n")
    # No partial() wrapper needed: process_pr_chunk takes exactly one
    # argument, which map() supplies.
    with multiprocessing.Pool(num_workers) as p:
        p.map(process_pr_chunk, todo_prs)


# The __main__ guard is required for multiprocessing correctness: under the
# spawn start method each worker re-imports this module, and without the
# guard the top-level code would recursively spawn new pools.
if __name__ == "__main__":
    main()