Skip to content

Commit 3e370d6

Browse files
Merge pull request #5 from IanOlin/affiliation
Final Polished Output for Affiliation Metrics
2 parents 7a38043 + 3c2dfd1 commit 3e370d6

11 files changed

+451
-329
lines changed

company-affiliation/CNTK_frequentcommitters.json

Lines changed: 0 additions & 1 deletion
This file was deleted.

company-affiliation/Theano_frequentcommitters.json

Lines changed: 0 additions & 1 deletion
This file was deleted.
Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
# This Python file uses the following encoding: utf-8
2+
#-*- coding: utf-8 -*-
3+
import os, sys
4+
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
5+
# Work on a private copy: DesiredCapabilities.FIREFOX is a class-level dict
# shared by everything that imports selenium, so editing it in place would
# leak these settings into any other driver created in the same process.
firefox_capabilities = DesiredCapabilities.FIREFOX.copy()
# Turn on the 'marionette' capability for the Firefox driver.
firefox_capabilities['marionette'] = True
# Firefox executable the driver should launch.
firefox_capabilities['binary'] = '/usr/bin/firefox'
8+
from selenium import webdriver
9+
from selenium.common.exceptions import NoSuchElementException
10+
from selenium.webdriver.common.keys import Keys
11+
from datetime import datetime, time
12+
from pattern.web import *
13+
from pattern.web import URL, extension, download
14+
from sets import Set
15+
import json
16+
import re
17+
import csv
18+
import unicodedata
19+
20+
# Repo name -> committer-count CSV filename, for the cloud-stack projects.
stack_committercount_dict = {
    "cinder": "cinder-openstack-dict.csv",
    "glance": "glance-openstack-dict.csv",
    "horizon": "horizon-openstack-dict.csv",
    "keystone": "keystone-openstack-dict.csv",
    "neutron": "neutron-openstack-dict.csv",
    "nova": "nova-openstack-dict.csv",
    "swift": "swift-openstack-dict.csv",
    "cloudstack": "cloudstack-apache-dict.csv",
}

# Repo name -> committer-count CSV filename, for the machine-learning projects.
ml_committercount_dict = {
    "Theano": "Theano-Theano-dict.csv",
    "CNTK": "CNTK-Microsoft-dict.csv",
    "caffe": "caffe-BVLC-dict.csv",
    "deeplearning4j": "deeplearning4j-deeplearning4j-dict.csv",
    "tensorflow": "tensorflow-tensorflow-dict.csv",
}
22+
23+
24+
"""
25+
Opens the csv file specified and loads the result into a dictionary
26+
27+
input: a filename
28+
output: a python dictionary with names as keys and commit counts as values
29+
"""
30+
def obtainCommittersandCount(repocommitfile, filepath="/home/anne/github-research/committer_csvs"):
    """Load a committer-count CSV into a dictionary.

    Every file called ``repocommitfile`` anywhere below ``filepath`` is read.
    Each row is expected to look like ``name,commit_count``; rows that do not
    (missing column, non-numeric count, undecodable name) are reported and
    skipped without discarding the rest of the file.

    input: repocommitfile (string) - base name of the csv file to load
           filepath (string, optional) - directory tree to search; defaults
           to the location the script has always used
    output: a python dictionary with names (unicode text) as keys and commit
            counts (int) as values; empty if no matching file is found
    """
    name_commits_dict = {}
    for root, _, files in os.walk(filepath):
        if repocommitfile not in files:
            continue
        fullpath = os.path.join(root, repocommitfile)
        with open(fullpath, "rt") as f_obj:
            for row in csv.reader(f_obj):
                if not row:
                    # Blank line in the csv: nothing to record.
                    continue
                try:
                    name = row[0]
                    # Python 2's csv module yields byte strings; normalise the
                    # key to text so lookups behave the same for every caller.
                    if isinstance(name, bytes):
                        name = name.decode('utf-8')
                    name_commits_dict[name] = int(row[1])
                except (ValueError, TypeError, IndexError) as error:
                    # UnicodeDecodeError is a ValueError, so bad names land
                    # here too. Report the actual exception, not its class.
                    print("{} has this error: {!r}".format(fullpath, error))
    return name_commits_dict
49+
50+
"""
51+
Calculates how many committers make up the given percentage of a repo's committers
52+
53+
input: a filename (string), a percentage (int or float, like 10 or 10.0 for 10%)
54+
output: the requested percentage of the # of committers, truncated to an int (0 if the percentage is invalid)
55+
"""
56+
def num_of_percent(repocommitfile, percent):
    """Return how many committers make up ``percent`` percent of a repo.

    input: repocommitfile (string) - the repo's committer count csv filename
           percent (int or float between 0 and 100, e.g. 10 or 10.0 for 10%)
    output: that share of the number of committers, truncated to an int;
            0 (after printing a warning) when ``percent`` is not a number in
            the range 0..100
    """
    # Check the type before comparing, so a non-numeric argument is rejected
    # cleanly. bool is excluded explicitly because it is a subclass of int.
    is_number = isinstance(percent, (int, float)) and not isinstance(percent, bool)
    # The chained comparison is also False for NaN, which is therefore rejected.
    if not is_number or not (0 <= percent <= 100):
        print("\ninvalid percentage. Try again\n")
        return 0

    # Only read the csv once the arguments are known to be usable.
    name_commits_dict = obtainCommittersandCount(repocommitfile)
    # Multiply first, divide last: len * (percent * .01) loses precision
    # (100 * (29 * .01) is 28.999999999999996, which truncates to 28).
    return int(len(name_commits_dict) * percent / 100.0)
62+
63+
"""
64+
Gets the number of commits a person has for this particular project
65+
66+
input: name of committer (string), the repo's committer count filename (string representing .csv file)
67+
output: number of commits by committer, if any
68+
"""
69+
def findNumCommits(name, repocommitfile):
    """Return the number of commits ``name`` has in one project.

    input: name of committer (byte string or unicode text), the repo's
           committer count filename (string representing .csv file)
    output: number of commits by the committer (int); 0 when the name is not
            in the file or cannot be interpreted as a name
    """
    name_commits_dict = obtainCommittersandCount(repocommitfile)

    # The loader stores names as decoded text, so byte strings must be decoded
    # before the lookup and text must be used as-is. (Calling .decode on text
    # fails for any non-ASCII name.)
    if isinstance(name, bytes):
        try:
            name = name.decode("utf-8")
        except UnicodeDecodeError:
            return 0

    try:
        return name_commits_dict.get(name, 0)
    except TypeError:
        # Unhashable argument (e.g. a list) can never be a committer name.
        return 0
78+
79+
"""
80+
Finds the available company affiliation of the committer
81+
82+
input: committer's name (string)
83+
output: organizations this person worked at (list)
84+
"""
85+
def findHistory(name, affiliation_file='companyaffiliation.json'):
    """Look up the stored company affiliations of a committer.

    The lookup key is the committer's name with every space removed
    ("Jane Q Doe" -> "JaneQDoe"), for names of any number of parts.

    input: name (byte string or unicode text) - the committer's name
           affiliation_file (string, optional) - JSON file mapping name keys
           to work histories; defaults to companyaffiliation.json in the
           current directory
    output: the entry stored for this person (a list of organizations), or
            None when the person is unknown, the name cannot be decoded, or
            the JSON file cannot be opened
    """
    # Normalise to text up front; JSON keys are text once loaded.
    if isinstance(name, bytes):
        try:
            name = name.decode('utf-8')
        except UnicodeDecodeError as error:
            print("we can't decode {!r} {}".format(name, error))
            return None

    name_key = name.replace(" ", "")

    try:
        with open(affiliation_file, 'r') as data_file:
            data = json.load(data_file)
    except IOError as error:
        print("{} doesn't exist yet {}".format(affiliation_file, error))
        return None

    # Keys may carry a trailing newline (getLinkedInInfo has written them that
    # way), so try that spelling as a fallback before giving up.
    for key in (name_key, name_key + "\n"):
        if key in data:
            return data[key]
    return None
115+
116+
"""
117+
Gets a list of the most prolific committers in the repo
118+
119+
input: repocommitfile (string representing name of csv file input), percent (int or float, the top percent of committers we want)
120+
output: a list of the most prolific committers in top 'percent' percent
121+
"""
122+
def frequentcommitters(repocommitfile, percent):
    """Return the most prolific committers of a repo.

    The cut-off is the commit count of the N-th most active committer, where
    N is ``percent`` percent of all committers. Everybody at or above that
    count is returned, so ties at the cut-off can make the list longer than N.

    input: repocommitfile (string representing name of csv file input),
           percent (int or float, the top percent of committers we want)
    output: a list of committer names (in dictionary order); empty when the
            percentage selects nobody or is invalid
    """
    name_commits_dict = obtainCommittersandCount(repocommitfile)
    num_percent = num_of_percent(repocommitfile, percent)

    # A count of 0 must select nobody. Slicing with [-0:] would select the
    # entire list instead, so handle it before indexing.
    if num_percent <= 0 or not name_commits_dict:
        return []
    num_percent = min(num_percent, len(name_commits_dict))

    # Smallest commit count that still belongs to the top ``num_percent``.
    threshold = sorted(name_commits_dict.values())[-num_percent]
    return [name for name, count in name_commits_dict.items()
            if count >= threshold]
135+
136+
"""
137+
Finds the # of prolific committers affiliated with the repo's organization
138+
(for example, how many of tensorflow's committers are affiliated with Google?)
139+
140+
input: the repo's committer count filename (string representing .csv file), the repo's name (string), percent (float or int, representing top % of prolificcommitters)
141+
output: the committers who are affiliated with the repo's organization (list)
142+
"""
143+
def findNumEmployees(companyfile, repo, percent):
    """Find the prolific committers affiliated with the repo's organization.

    (for example, how many of tensorflow's committers are affiliated with
    Google?) Prints a one-line summary and returns the matching names.

    input: companyfile (string) - the repo's committer count .csv filename
           repo (string) - the repo's name; must be a key of the table below
           percent (float or int) - top % of committers to consider
    output: the committers who are affiliated with the repo's organization (list)
    raises: KeyError when ``repo`` is not in the association table
    """
    # repo -> [search term, display name]. The search term drops the first
    # letter so it matches regardless of how that letter is capitalised.
    association = {
        "tensorflow": ["oogle", "Google"],
        "CNTK": ["icrosoft", "Microsoft"],
        "deeplearning4j": ["kymind", "Skymind.io"],
        "Theano": ["Montr", "Univ. of Montreal"],
        "caffe": ["erkeley", "Berkeley Vision and Learning Center"],
        "cinder": ["penstack", "Openstack"],
        "cloudstack": ["pache", "Apache Foundation"],
        "glance": ["penstack", "Openstack"],
        "horizon": ["penstack", "Openstack"],
        "keystone": ["penstack", "Openstack"],
        "neutron": ["penstack", "Openstack"],
        "nova": ["penstack", "Openstack"],
        "swift": ["penstack", "Openstack"],
    }
    company_searchterm, company = association[repo]

    def mentions_company(entry):
        """Tell whether one work-history entry mentions the search term."""
        # An entry may be a plain string or a [company, dates] pair. Testing
        # `term in pair` directly would demand an exact element match, so
        # look inside each field for the substring instead.
        fields = entry if isinstance(entry, (list, tuple)) else [entry]
        for field in fields:
            try:
                if company_searchterm in field:
                    return True
            except TypeError:
                # Field is not something that can contain text (e.g. None).
                continue
        return False

    committers_list = frequentcommitters(companyfile, percent)
    employeeList = []
    for name in committers_list:
        # findHistory is handed UTF-8 bytes, as it always has been.
        personalHistory = findHistory(name.encode("utf-8"))
        if not personalHistory:
            continue
        # any() stops at the first hit, so nobody is counted twice.
        if any(mentions_company(entry) for entry in personalHistory):
            employeeList.append(name)

    if committers_list:
        affiliated_over_overall = (len(employeeList) * 100.0) / len(committers_list)
        print("Out of top {} percent of {}'s committers, at least {} percent of them are affiliated with {}".format(percent, repo, affiliated_over_overall, company))
    else:
        print("More Commit Data needed for {}'s commits".format(repo))

    return employeeList
189+
190+
"""
191+
Optional Helper Function
192+
Scrapes the committer's linkedin profile and stores it into the file, companyaffiliation.json
193+
194+
input: committer's name (string), url of committer's linkedin profile (string)
195+
output: committer's name and work history (a tuple containing a string and a list)
196+
"""
197+
def getLinkedInInfo(name, url, affiliation_file='companyaffiliation.json'):
    """Scrape a committer's LinkedIn profile and store the work history.

    Optional helper function. The history is saved in ``affiliation_file``
    under the committer's name with all spaces removed, which is the key
    findHistory looks up. The file is created if it does not exist yet.

    input: name (byte string or unicode text) - the committer's name
           url (string) - url of the committer's linkedin profile
           affiliation_file (string, optional) - JSON file to update;
           defaults to companyaffiliation.json in the current directory
    output: committer's name and work history (a tuple containing the name
            as given and a list of (organization, date range) tuples)
    """
    # The module's `from datetime import datetime, time` binds `time` to the
    # datetime.time class, which has no sleep(); import the real module here
    # under a different name.
    import time as time_module

    driver = webdriver.Firefox(capabilities=firefox_capabilities)
    try:
        driver.get(url)
        # Fixed pause before reading the page (presumably to let it finish
        # loading - TODO confirm).
        time_module.sleep(2)
        orgs_worklife = driver.find_elements_by_class_name("item-subtitle")
        dateranges = driver.find_elements_by_class_name("date-range")
        orgsAndCompanies = []
        # zip stops at the shorter list, pairing each organization with the
        # date range found at the same position.
        for org, daterange in zip(orgs_worklife, dateranges):
            one_daterange = daterange.text.encode('ascii', 'ignore').decode('ascii')
            one_org = org.text.encode('ascii', 'ignore').decode('ascii')
            orgsAndCompanies.append((one_org, one_daterange))
    finally:
        # Always close the browser, even when scraping fails part-way.
        driver.quit()

    name_text = name
    if isinstance(name_text, bytes):
        try:
            name_text = name_text.decode('utf-8')
        except UnicodeDecodeError as error:
            print("we can't decode {!r} {}".format(name, error))
            return name, orgsAndCompanies
    # Same key findHistory builds: the name without spaces, no trailing newline.
    name_key = name_text.replace(" ", "")

    try:
        if os.path.exists(affiliation_file):
            with open(affiliation_file, 'r') as data_file:
                affiliations = json.load(data_file)
        else:
            # First run: start a fresh table rather than dropping the data.
            affiliations = {}
        affiliations[name_key] = orgsAndCompanies
        with open(affiliation_file, 'w') as data_file:
            json.dump(affiliations, data_file)
    except IOError as error:
        print("could not update {}: {}".format(affiliation_file, error))

    return name, orgsAndCompanies
241+
242+
if __name__ == '__main__':
    # Print, for every machine-learning repo, the share of its committers
    # affiliated with the repo's home organization. The repo -> csv table is
    # the module-level ml_committercount_dict; it is not redefined here.
    percent = 100
    for repo, csv_file in ml_committercount_dict.items():
        findNumEmployees(csv_file, repo, percent)
        print("\n")

0 commit comments

Comments
 (0)