1+ # This Python file uses the following encoding: utf-8
2+ #-*- coding: utf-8 -*-
3+ import os , sys
4+ from selenium .webdriver .common .desired_capabilities import DesiredCapabilities
# Work on a private copy: DesiredCapabilities.FIREFOX is a class-level dict,
# so assigning keys on it directly would change it for every other user of
# that class in the same process.
firefox_capabilities = DesiredCapabilities.FIREFOX.copy()
firefox_capabilities['marionette'] = True
# Path of the Firefox executable the driver should launch.
firefox_capabilities['binary'] = '/usr/bin/firefox'
8+ from selenium import webdriver
9+ from selenium .common .exceptions import NoSuchElementException
10+ from selenium .webdriver .common .keys import Keys
11+ from datetime import datetime , time
12+ from pattern .web import *
13+ from pattern .web import URL , extension , download
14+ from sets import Set
15+ import json
16+ import re
17+ import csv
18+ import unicodedata
19+
# Repository name -> file name of that repository's committer-count CSV,
# for the cloud-stack projects.
stack_committercount_dict = {
    "cinder": "cinder-openstack-dict.csv",
    "glance": "glance-openstack-dict.csv",
    "horizon": "horizon-openstack-dict.csv",
    "keystone": "keystone-openstack-dict.csv",
    "neutron": "neutron-openstack-dict.csv",
    "nova": "nova-openstack-dict.csv",
    "swift": "swift-openstack-dict.csv",
    "cloudstack": "cloudstack-apache-dict.csv",
}

# Repository name -> file name of that repository's committer-count CSV,
# for the machine-learning projects.
ml_committercount_dict = {
    "Theano": "Theano-Theano-dict.csv",
    "CNTK": "CNTK-Microsoft-dict.csv",
    "caffe": "caffe-BVLC-dict.csv",
    "deeplearning4j": "deeplearning4j-deeplearning4j-dict.csv",
    "tensorflow": "tensorflow-tensorflow-dict.csv",
}
22+
23+
def obtainCommittersandCount(repocommitfile,
                             csv_root="/home/anne/github-research/committer_csvs"):
    """
    Opens the csv file specified and loads the result into a dictionary

    The tree under csv_root is walked recursively and every file whose name
    equals repocommitfile is read. Each row holds a committer name in the
    first column and an integer commit count in the second. If a file
    contains a malformed row, the problem is printed and the rest of that
    file is skipped; rows read before the bad one are kept.

    input: repocommitfile (string, bare name of the csv file),
           csv_root (string, directory tree to search; defaults to the
           location this script was written against)
    output: a python dictionary with names as keys and commit counts as
            values (empty if no matching file is found)
    """
    name_commits_dict = {}
    for root, _, files in os.walk(csv_root):
        if repocommitfile not in files:
            continue
        fullpath = os.path.join(root, repocommitfile)
        try:
            with open(fullpath, "rt") as f_obj:
                for row in csv.reader(f_obj):
                    if not row:
                        # csv.reader yields [] for blank lines.
                        continue
                    name = row[0]
                    # Byte strings (what csv yields on Python 2) are decoded
                    # so keys are always text.
                    if isinstance(name, bytes):
                        name = name.decode('utf-8')
                    name_commits_dict[name] = int(row[1])
        except (ValueError, TypeError, IndexError) as err:
            # Report the raised exception itself, not the exception class.
            print("{} has this error: {}".format(fullpath, err))
    return name_commits_dict
49+
def num_of_percent(repocommitfile, percent):
    """
    Calculates how many committers make up the given percentage of a repo's
    committers (rounded down)

    input: a filename (string), a percentage (int or float between 0 and 100,
           like 10 or 10.0 for 10%)
    output: that percentage of the # of committers (int); 0 if the percentage
            is not a number or is outside 0-100
    """
    # Validate before touching the filesystem, and check the type before
    # comparing so a non-numeric argument is rejected rather than compared.
    is_number = isinstance(percent, (int, float)) and not isinstance(percent, bool)
    if not is_number or percent < 0 or percent > 100:
        print("\n invalid percentage. Try again\n ")
        return 0
    name_commits_dict = obtainCommittersandCount(repocommitfile)
    # Multiply first, divide last: len * (percent * .01) can land just below
    # a whole number (100 * (29 * .01) == 28.999...) and truncate one short.
    return int(len(name_commits_dict) * percent / 100.0)
62+
def findNumCommits(name, repocommitfile):
    """
    Gets the number of commits a person has for this particular project

    input: name of committer (byte string or unicode string), the repo's
           committer count filename (string representing .csv file)
    output: number of commits by committer; 0 if the committer is not listed
    """
    name_commits_dict = obtainCommittersandCount(repocommitfile)
    # The dictionary keys are decoded text, so a byte-string name has to be
    # decoded before the lookup; a text name is used as it is.
    if isinstance(name, bytes):
        try:
            name = name.decode("utf-8")
        except UnicodeDecodeError:
            return 0
    return name_commits_dict.get(name, 0)
78+
def findHistory(name):
    """
    Finds the available company affiliation of the committer

    The lookup key is the committer's name with every space removed
    ("Jane Doe" -> "JaneDoe"), looked up in companyaffiliation.json in the
    current working directory.

    input: committer's name (byte string or unicode string)
    output: organizations this person worked at (list), or None if the name
            has no entry, cannot be decoded, or the json file cannot be read
    """
    if isinstance(name, bytes):
        try:
            name = name.decode('utf-8')
        except UnicodeDecodeError as err:
            print("we can't decode {!r} {}".format(name, err))
            return None

    # Joining all the parts handles names of any length; a fixed set of
    # 1-4 part cases would map longer names to the empty key.
    name_key = u"".join(name.split(u" "))

    try:
        with open('companyaffiliation.json', 'r') as data_file:
            data = json.load(data_file)
    except IOError as err:
        print("companyaffiliation.json doesn't exist yet {}".format(err))
        return None
    return data.get(name_key)
115+
def frequentcommitters(repocommitfile, percent):
    """
    Gets a list of the most prolific committers in the repo

    Committers whose commit count ties with the last qualifying count are
    all included, so the result can be longer than the requested share.

    input: repocommitfile (string representing name of csv file input),
           percent (int or float, the top percent of committers we want)
    output: a list of the most prolific committers in top 'percent' percent
            (empty if that share rounds down to zero committers)
    """
    num_percent = num_of_percent(repocommitfile, percent)
    if num_percent <= 0:
        # Slicing with [-0:] would select the whole list, i.e. every
        # committer, so the empty case has to be handled up front.
        return []
    name_commits_dict = obtainCommittersandCount(repocommitfile)
    top_counts = sorted(name_commits_dict.values(), reverse=True)[:num_percent]
    if not top_counts:
        return []
    # Smallest commit count that still belongs to the requested top share.
    threshold = top_counts[-1]
    return [name for name, count in name_commits_dict.items()
            if count >= threshold]
135+
# repo name -> [phrase to search for in a committer's history, display name].
# The search phrases drop the first letter so that both capitalised and
# lower-case spellings contain them.
_REPO_AFFILIATION = {
    "tensorflow": ["oogle", "Google"],
    "CNTK": ["icrosoft", "Microsoft"],
    "deeplearning4j": ["kymind", "Skymind.io"],
    "Theano": ["Montr", "Univ. of Montreal"],
    "caffe": ["erkeley", "Berkeley Vision and Learning Center"],
    "cinder": ["penstack", "Openstack"],
    "cloudstack": ["pache", "Apache Foundation"],
    "glance": ["penstack", "Openstack"],
    "horizon": ["penstack", "Openstack"],
    "keystone": ["penstack", "Openstack"],
    "neutron": ["penstack", "Openstack"],
    "nova": ["penstack", "Openstack"],
    "swift": ["penstack", "Openstack"],
}

# Text types on both Python 2 (str, unicode) and Python 3 (str).
_TEXT_TYPES = (str, type(u""))


def _history_entry_mentions(entry, term):
    """Return True if term is a substring of any text field of a history entry."""
    fields = entry if isinstance(entry, (list, tuple)) else [entry]
    return any(term in field for field in fields
               if isinstance(field, _TEXT_TYPES))


def findNumEmployees(companyfile, repo, percent):
    """
    Finds the prolific committers affiliated with the repo's organization
    (for example, how many of tensorflow's committers are affiliated with Google?)
    and prints what share of the prolific committers they make up

    input: the repo's committer count filename (string representing .csv file),
           the repo's name (string, must be one of the known repos),
           percent (float or int, representing top % of prolific committers)
    output: the committers who are affiliated with the repo's organization (list)
    """
    company_searchterm, company = _REPO_AFFILIATION[repo]
    committers_list = frequentcommitters(companyfile, percent)

    employeeList = []
    for name in committers_list:
        # pulls up the personal work history of this person
        personalHistory = findHistory(name.encode("utf-8"))
        if not personalHistory:
            continue
        # Each entry is a [company, dates] list, so the phrase has to be
        # searched for inside the fields; `phrase in entry` would only match
        # a field that equals the phrase exactly. any() stops at the first
        # hit, so nobody is counted twice.
        if any(_history_entry_mentions(entry, company_searchterm)
               for entry in personalHistory):
            employeeList.append(name)

    if committers_list:
        affiliated_over_overall = (len(employeeList) * 100.0) / len(committers_list)
        print("Out of top {} percent of {}'s committers, at least {} percent of them are affiliated with {}".format(percent, repo, affiliated_over_overall, company))
    else:
        print("More Commit Data needed for {}'s commits".format(repo))

    return employeeList
189+
def _ascii_only(text):
    """Return text with every non-ASCII character dropped."""
    return text.encode('ascii', 'ignore').decode('ascii')


def getLinkedInInfo(name, url):
    """
    Optional Helper Function
    Scrapes the committer's linkedin profile and stores it into the file,
    companyaffiliation.json (the file must already exist; if it cannot be
    opened a message is printed and nothing is stored)

    input: committer's name (string), url of committer's linkedin profile (string)
    output: committer's name and work history (a tuple containing a string and
            a list of (organization, date range) tuples)
    """
    # The module-level name `time` is datetime.time (from
    # `from datetime import datetime, time`), which has no sleep(); import the
    # real time module under a distinct local name.
    import time as time_module

    driver = webdriver.Firefox(capabilities=firefox_capabilities)
    try:
        driver.get(url)
        # Fixed pause after loading, presumably so the page can finish rendering.
        time_module.sleep(2)
        orgs_worklife = driver.find_elements_by_class_name("item-subtitle")
        dateranges = driver.find_elements_by_class_name("date-range")
        # zip stops at the shorter list, so a date range without a matching
        # organization is ignored.
        orgsAndCompanies = [
            (_ascii_only(org.text), _ascii_only(daterange.text))
            for org, daterange in zip(orgs_worklife, dateranges)
        ]
    finally:
        # Always close the browser, even if scraping raised.
        driver.quit()

    try:
        text_name = name.decode("utf-8") if isinstance(name, bytes) else name
        # Key is the name with spaces removed, as text so it compares equal
        # to the keys json.load returns.
        # NOTE(review): the "\n " suffix is kept unchanged from the existing
        # code, but findHistory looks names up without it -- confirm which
        # key format companyaffiliation.json is meant to use.
        name_key = u"".join(text_name.split(u" ")) + u"\n "

        with open('companyaffiliation.json', 'r') as data_file:
            affiliations = json.load(data_file)
        affiliations[name_key] = orgsAndCompanies

        with open('companyaffiliation.json', 'w') as data_file:
            json.dump(affiliations, data_file)
    except IOError as err:
        print("companyaffiliation.json doesn't exist yet {}".format(err))
    except UnicodeError as err:
        # {!r} keeps this message printable whatever the name contains.
        print("we can't decode {!r} {}".format(name, err))
    return name, orgsAndCompanies
241+
if __name__ == '__main__':
    # Report, for each machine-learning repo, how many of its committers are
    # affiliated with the organization behind it. The repo -> csv tables are
    # the module-level ones; they are not re-declared here.
    percent = 100
    for repo, csv_file in ml_committercount_dict.items():
        findNumEmployees(csv_file, repo, percent)
        print("\n ")
0 commit comments