|
1 | 1 |
|
2 | 2 | # coding: utf-8 |
3 | 3 |
|
| 4 | +# In[1]: |
| 5 | + |
| 6 | +get_ipython().magic('matplotlib inline') |
| 7 | + |
| 8 | + |
4 | 9 | # # User Testing gitnet |
5 | | -# |
6 | | -# #### *June 2016, using version 0.0.8 of gitnet on testpypi* |
7 | | -# |
| 10 | +# |
| 11 | +# #### *Written in June 2016, using version 0.0.8 of gitnet on testpypi* |
| 12 | +# |
8 | 13 |
|
9 | | -# |
10 | | -# ## *Introduction* |
11 | | -# |
| 14 | +# |
| 15 | +# ## *Introduction* |
| 16 | +# |
12 | 17 |
|
13 | | -# To follow this exercise successfully, you need to have: |
| 18 | +# ## To follow this exercise successfully, you need to have: |
14 | 19 | # - Python 3 (Anacondas 3.5 is the best bet) |
15 | 20 | # - Git (you can update git by running in the terminal: pip install git --upgrade) |
16 | | -# - The current version of gitnet is 0.0.8. |
| 21 | +# - The current version of git is 2.9. |
17 | 22 | # - NetworkX (you can install by running in the terminal: pip install networkx) |
18 | 23 | # - Matplotlib (you can install by running in the terminal: pip install matplotlib) |
19 | 24 | # - Pygraphviz (not necessarily required, only for the default layout, which happens to be the best one we could find) |
20 | | -# |
| 25 | +# |
21 | 26 | # **Note:** Unfortunately, Pygraphviz can potentially be difficult to install on Windows. If pip is not able to find vcvarsall.bat, then avoid editing the environment variables and use this website: http://www.lfd.uci.edu/~gohlke/pythonlibs/ to download the binary for Python 3.4. Unfortunately, although Pygraphviz will install, there still may be errors with the graph output. |
22 | | -# |
| 27 | +# |
23 | 28 | # Installing gitnet with pip will automatically install bash if you do not already have it installed |
24 | 29 | # To install gitnet, open a terminal window and type: |
25 | | -# |
| 30 | +# |
26 | 31 | # `pip install -i https://testpypi.python.org/pypi gitnet` |
27 | 32 |
|
| 33 | +# In[ ]: |
| 34 | + |
28 | 35 | # For all sections of this exercise, you will need to use the following libraries: |
29 | 36 |
|
30 | 37 | import os |
31 | | -# import pygraph # Needed for defaults used by quickplot, if you can't install, use layout='spring'. |
| 38 | +# import pygraphviz # Needed for defaults used by quickplot, if you can't install, use layout='spring'. |
32 | 39 | import gitnet as gn |
33 | 40 | import networkx as nx |
34 | 41 | import matplotlib.pyplot as plt |
|
37 | 44 | # ## *1. Write-Good Repo* |
38 | 45 |
|
39 | 46 | # For this exercise, we are going to use the project: https://github.com/btford/write-good |
40 | | -# |
| 47 | +# |
41 | 48 | # In a new terminal window, type: |
42 | | -# |
| 49 | +# |
43 | 50 | # `git clone https://github.com/btford/write-good.git` |
44 | | -# |
| 51 | +# |
45 | 52 | # OR open the page in a browser and download the zip folder. |
46 | 53 |
|
| 54 | +# In[ ]: |
| 55 | + |
47 | 56 | # Set the current working directory, so that all files created will be stored there. |
48 | 57 | # The best bet is to create a folder named 'temp' on your desktop. |
49 | 58 | os.chdir('path') |
50 | 59 |
|
| 60 | + |
| 61 | +# In[ ]: |
| 62 | + |
51 | 63 | # Insert the path to the write-good folder on your machine. |
52 | 64 | mylogs = gn.get_log('path') |
53 | 65 | # You can generate a network using any two tags that exist in the log. For a list of tags, just call .attributes() on your log object. |
54 | 66 | graph = mylogs.generate_network('author', 'files') |
55 | 67 | # Quickplot is a preset function that can be used to quickly visualize a network. |
56 | 68 | graph.quickplot('write_good_net.pdf', layout = 'spring') |
57 | 69 |
|
| 70 | + |
| 71 | +# In[ ]: |
| 72 | + |
58 | 73 | # You can get a list of all of the values of any tag in the log object. |
59 | 74 | # First, let's take a look at all of the possible tags. |
60 | 75 | print(mylogs.attributes()) |
|
65 | 80 | # ## *2. NetworkX* |
66 | 81 |
|
67 | 82 | # For this exercise, we are going to use this project: https://github.com/networkx/networkx |
68 | | -# |
| 83 | +# |
69 | 84 | # In a new terminal window, type: |
70 | | -# |
| 85 | +# |
71 | 86 | # `git clone https://github.com/networkx/networkx.git` |
72 | | -# |
| 87 | +# |
73 | 88 | # OR open the page in a browser and download the zip folder. |
74 | 89 |
|
| 90 | +# In[ ]: |
| 91 | + |
75 | 92 | # First, we are going to create another log object. |
76 | 93 | networkx_log = gn.get_log('path') |
77 | 94 |
|
| 95 | + |
| 96 | +# In[ ]: |
| 97 | + |
78 | 98 | # Now you can export the log as a TSV file. |
79 | 99 | networkx_log.tsv(fname = 'networkx_data.tsv') |
80 | 100 |
|
81 | 101 |
|
82 | 102 | # Take a minute to open this file and look at the contents. |
83 | | -# |
| 103 | +# |
84 | 104 | # Notice that there are similar author names that use the same email address. |
85 | | -# |
86 | | -# **Hint:** since version 0.0.8, we have simplified the process of identifying duplicate authors. |
87 | | -# Use `author_email_list` along with `detect_dup_emails` to find potentially duplicate authors. See the cheat sheet for more details. |
| 105 | +# |
| 106 | +# **Hint:** since version 0.0.8, we have simplified the process of identifying duplicate authors. Use `author_email_list` along with `detect_dup_emails` to find potentially duplicate authors. See the cheat sheet for more details. |
| 107 | + |
| 108 | +# In[ ]: |
88 | 109 |
|
89 | 110 | # Gitnet cannot automatically predict when a single author uses two different names to commit to a repo. |
90 | 111 | # For this reason, you may need to replace one of their aliases with the other. |
91 | 112 | replaced_netx = networkx_log.replace_val('author', 'aric', 'Aric Hagburg') |
92 | 113 | # To make sure that this worked, just create a new TSV and look at the contents. |
93 | 114 | replaced_netx.tsv(fname = 'replaced_data.tsv') |
94 | 115 |
|
| 116 | + |
| 117 | +# In[ ]: |
| 118 | + |
95 | 119 | # You can also create an edgelist from any two tags. |
96 | 120 | # Check the possible tags. |
97 | 121 | print(replaced_netx.attributes()) |
|
104 | 128 | # ## *3. Tensorflow* |
105 | 129 |
|
106 | 130 | # For this exercise, we are going to use this project: https://github.com/tensorflow/tensorflow |
107 | | -# |
| 131 | +# |
108 | 132 | # In a new terminal window, type: |
109 | | -# |
| 133 | +# |
110 | 134 | # `git clone https://github.com/tensorflow/tensorflow.git` |
111 | | -# |
| 135 | +# |
112 | 136 | # OR open the page in a browser and download the zip folder. |
113 | 137 |
|
| 138 | +# In[ ]: |
| 139 | + |
114 | 140 | # Let's start by creating a log object and a graph object, just as in the first exercise. |
115 | 141 | logs_tensor = gn.get_log('path') |
116 | 142 | graph_tensor = logs_tensor.generate_network('author', 'files') |
117 | 143 |
|
| 144 | + |
118 | 145 | # For now, hold off on plotting or exporting, and try out some of the advanced methods |
119 | | -# |
| 146 | +# |
120 | 147 | # Below are some usage examples for filter and ignore |
121 | 148 |
|
| 149 | +# In[ ]: |
| 150 | + |
122 | 151 | # Filter seems to have an error in IPYNB format. |
123 | 152 |
|
| 153 | + |
124 | 154 | # Filter records based on the email domain. |
125 | 155 | filtered_email = logs_tensor.filter('email', 'has', '@gmail.com') |
126 | 156 | # Filter records based on the author name. |
127 | 157 | filtered_author = logs_tensor.filter('author', 'equals', 'Martin Wicke') |
128 | 158 | # Filter records based on commits that have occured after a certain date. |
129 | 159 | filtered_date = logs_tensor.filter('date', 'since', 'Fri Jun 10 15:41:25 2016 -0400') |
130 | 160 |
|
131 | | -# One of the limitations of filter is that because of the date-string format used by git, you need to type a pattern that at least partially matches the appearance of date-strings in the actually commits. |
132 | | -# |
| 161 | + |
| 162 | +# One of the limitations of filter is that because of the date-string format used by git, you need to type a pattern that at least partially matches the appearance of date-strings in the actual commits. |
| 163 | +# |
133 | 164 | # However, it is still possible to use expressions such as `Fri June 10 *`, so there is still some room for flexible filtering. |
134 | 165 |
|
| 166 | +# In[ ]: |
| 167 | + |
135 | 168 | # Save one of these to a TSV file to check that it worked. |
136 | 169 | filtered_author.tsv(fname = 'tensorflow_martin.tsv') |
137 | 170 |
|
| 171 | + |
| 172 | +# In[ ]: |
| 173 | + |
138 | 174 | # You can also ignore files and file edits that match any specified pattern. |
139 | 175 | # Ignore python files: |
140 | 176 | ignore_python = logs_tensor.ignore('.py') |
141 | 177 | # Ignore files with the _ prefix: |
142 | 178 | ignore_prefix = logs_tensor.ignore('_*') |
143 | 179 |
|
144 | 180 |
|
145 | | -# Keep in mind that both `filter` and `ignore` can have a significant impact on the network graph. |
146 | | -# |
147 | | -# It is best to use them sparingly, and only when it is certainly useful to remove certain information. |
148 | | -# In many cases, it makes more sense to simply export the full graph and all its data (as a graphml file, for example) and then prune the data in R. |
| 181 | +# Keep in mind that both `filter` and `ignore` can have a significant impact on the network graph. |
| 182 | +# |
| 183 | +# It is best to use them sparingly, and only when it is certainly useful to remove certain information. In many cases, it makes more sense to simply export the full graph and all its data (as a graphml file, for example) and then prune the data in R. |
| 184 | + |
| 185 | +# In[ ]: |
149 | 186 |
|
150 | 187 | # Save one of these to a TSV file to check that it worked. |
151 | 188 | ignore_python.tsv(fname = 'nopy_data.tsv') |
152 | 189 |
|
| 190 | + |
| 191 | +# In[ ]: |
| 192 | + |
153 | 193 | # Try generating a network using one of these modified log objects, and compare it to previous results. |
154 | 194 | modified_graph = ignore_python.generate_network('author', 'files') |
155 | 195 | modified_graph.quickplot('modified_graph.pdf', layout = 'spring') # this runs very slow. |
156 | 196 |
|
157 | 197 |
|
158 | | -# One note about the quickploy function is that it typically uses the `neato` layout from `matplotlib`. |
159 | | -# |
| 198 | +# One note about the quickplot function is that it typically uses the `neato` layout from `pygraphviz`. |
| 199 | +# |
160 | 200 | # Here we are using the `spring` layout from `NetworkX`, but if you did get pygraphviz installed, then you can simply leave |
161 | 201 | # out the layout argument. It defaults to `neato`. |
162 | 202 |
|
| 203 | +# In[ ]: |
| 204 | + |
163 | 205 | # Try calling describe on both a log object and a graph object. |
164 | 206 | # Is there any other information you would like to see in the describe output? |
165 | 207 | ignore_python.describe() |
166 | 208 | modified_graph.describe() |
167 | 209 |
|
168 | 210 |
|
169 | | -# The last advanced method we have to show you is collapse graph. This quickly creates a one-mode network, using *mode1* of the |
| 211 | +# The last advanced method we have to show you is collapse graph. This quickly creates a one-mode network, using *mode1* of the |
170 | 212 | # original graph object. |
171 | 213 |
|
| 214 | +# In[ ]: |
| 215 | + |
172 | 216 | # Try calling one of the advanced graph methods, such as *collapse_edges* |
173 | | -basic_graph = logs_tensor.generate_network('author', 'files') |
| 217 | +basic_graph = logs_tensor.generate_network('author', 'files', colours="simple") |
174 | 218 | # Sum_weights = True is an optional argument that creates a weighted multigraph. |
175 | 219 | collapsed_graph = basic_graph.collapse_edges(sum_weights = True) |
176 | | -collapsed_graph.quickplot(fname = "ok_net.pdf") |
| 220 | +collapsed_graph.quickplot("ok_net.pdf", layout="spring") |
177 | 221 |
|
178 | 222 |
|
179 | | -# Optional: try reading an output file into R. |
180 | | -# |
| 223 | +# Optional: try reading a file into R. |
| 224 | +# |
181 | 225 | # Use the edge list created earlier, or create a new *tnet file* or *graphml file* and try reading it into R. |
182 | 226 |
|
| 227 | +# In[ ]: |
| 228 | + |
183 | 229 | # The graphml file will be saved at the directed path, while the tnet file will be saved in the current directory. |
184 | 230 | basic_graph.write_tnet('filename') |
185 | | -basic_graph.write_graphml('path/to/file') |
| 231 | +basic_graph.write_graphml('filename') |
| 232 | + |
| 233 | + |
| 234 | +# If you prefer, you can use the write_edges() function to export a weighted edgelist which can be read into R. |
| 235 | +# These edgelists also contain datetime entries, as a fourth column, which can be used to order nodes and create dynamic networks. |
| 236 | + |
| 237 | +# In[ ]: |
| 238 | + |
| 239 | +basic_graph.write_edges('filename.txt', weighted=True) |
| 240 | + |
| 241 | + |
| 242 | +# As you may have noticed, there is a `colours` argument in the `generate_network()` function. It is used at the time of network creation to specify if the user wants to create colour tags for the nodes. These colours are based on the type of node, and by extension on the contents of the "file" node type. |
| 243 | + |
| 244 | +# In[ ]: |
| 245 | + |
186 | 246 |
|
187 | 247 |
|
188 | | -# If you prefer, you can use two columns of the TSV file as the 'source' and 'target' of a networkx graph object in R. |
|
0 commit comments