
Commit f949ee0

Refactor the paths used and change download notebook
Signed-off-by: m09 <[email protected]>
1 parent fcbf2d3 commit f949ee0

File tree

7 files changed, +199 −360 lines changed


Makefile

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ gitbase: bblfshd
 		--link devfest_bblfshd:devfest_bblfshd \
 		--env BBLFSH_ENDPOINT=devfest_bblfshd:9432 \
 		--env MAX_MEMORY=1024 \
-		--volume $(PWD)/repos:/opt/repos \
+		--volume $(PWD)/repos/git-data:/opt/repos \
 		srcd/gitbase:v0.24.0-rc2
 
 jupyter-image:
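
For context, a minimal sketch of the host directory layout this mount change assumes (the paths come from this commit; the snippet itself is illustrative and not part of the repository). The download notebook below clones repositories into a `git-data` subfolder of `repos`, and gitbase now mounts exactly that subfolder at `/opt/repos`; inside the Jupyter container the same data is presumably reachable as `/devfest/repos/git-data`, though that mount is not shown in this diff.

```python
from pathlib import Path

# Illustrative only: the host-side folder that the updated --volume flag mounts
# into the gitbase container at /opt/repos. The download notebook writes its
# clones and repos.json into the matching git-data folder.
host_git_data = Path("repos") / "git-data"
host_git_data.mkdir(parents=True, exist_ok=True)
print(sorted(p.name for p in host_git_data.iterdir()))  # one entry per cloned repo
```
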

README.md

Lines changed: 9 additions & 8 deletions
@@ -5,10 +5,11 @@ Workshop given at [DevFest Nantes 2019](https://devfest.gdgnantes.com/sessions/u
 Slides: on [gDrive](https://docs.google.com/presentation/d/1vF0JMagmXXzn-h-OaJu6CsDt78oSQSg58YFJsBUaHxk/edit#slide=id.g4f0d75b8b4_0_0)
 
 OSS tools covered:
-- [gitbase](https://docs.sourced.tech/gitbase)
-- [bblfsh](https://doc.bblf.sh)
-- [BigARTM](http://bigartm.org)
-- [OpenNMT](http://opennmt.net)
+
+- [gitbase](https://docs.sourced.tech/gitbase)
+- [bblfsh](https://doc.bblf.sh)
+- [BigARTM](http://bigartm.org)
+- [OpenNMT](http://opennmt.net)
 
 <details>
 <summary>Abstract</summary>
@@ -32,8 +33,10 @@ OSS tools covered:
 
 </details>
 
+Slides: on [gDrive](https://docs.google.com/presentation/d/1vF0JMagmXXzn-h-OaJu6CsDt78oSQSg58YFJsBUaHxk/edit#slide=id.g4f0d75b8b4_0_0)
 
 ## Prerequisites
+
 - Docker
 
 ## Dependencies
@@ -72,7 +75,7 @@ docker run \
     --link devfest_bblfshd:devfest_bblfshd \
     --env BBLFSH_ENDPOINT=devfest_bblfshd:9432 \
     --env MAX_MEMORY=1024 \
-    --volume $(pwd)/repos:/opt/repos \
+    --volume $(pwd)/repos/git-data:/opt/repos \
     srcd/gitbase:v0.24.0-rc2
 ```
 
@@ -104,8 +107,8 @@ To only launch the 3 required containers
 ```shell
 make
 ```
-</details>
 
+</details>
 
 ## Workflow
 
@@ -121,10 +124,8 @@ TBD
 
 [Notebook 2: project and developer similarities](#link to local jupyther)
 
-
 ### 3. Function Name Suggestion
 
 TBD
 
 [Notebook 2: function name suggestion](#link to local jupyther)
-
notebooks/.gitignore

Lines changed: 0 additions & 1 deletion
@@ -1,3 +1,2 @@
 .ipynb_checkpoints
 __pycache__
-full

notebooks/Download repositories.ipynb

Lines changed: 100 additions & 47 deletions
@@ -20,79 +20,123 @@
     "We iterate as long as the API gives us a pointer to another response page. We filter forks to focus on original repositories."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from os import makedirs\n",
+    "from os.path import join as path_join\n",
+    "\n",
+    "\n",
+    "git_data_dir = path_join(\"/devfest\", \"repos\", \"git-data\")\n",
+    "makedirs(git_data_dir, exist_ok=True)\n",
+    "repos_json = path_join(git_data_dir, \"repos.json\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "from json import dump as json_dump\n",
+    "from operator import itemgetter\n",
     "from re import compile as re_compile\n",
-    "from typing import Optional\n",
+    "from typing import Any, Dict, List, Optional\n",
     "\n",
     "import requests\n",
+    "from tqdm import tqdm_notebook as tqdm\n",
     "\n",
     "\n",
     "# Generate a personal access token here: https://github.com/settings/tokens\n",
     "TOKEN = # See comment above, please generate a token and put it here\n",
+    "MAX_SIZE = 50 * 1024\n",
+    "REPOS_NUMBER = 50\n",
     "\n",
     "\n",
     "next_pattern = re_compile('<(https://api.github.com/user/[^/]+/repos\\\?[^>]*page=\\\d+[^>]*)>; rel=\"next\"')\n",
+    "last_pattern = re_compile('<https://api.github.com/user/[^/]+/repos\\\?[^>]*page=(\\\d+)[^>]*>; rel=\"last\"')\n",
     "\n",
     "\n",
     "def parse_next(link_header: str) -> Optional[str]:\n",
     "    match = next_pattern.search(link_header)\n",
     "    return match.group(1) if match is not None else None\n",
     "\n",
     "\n",
-    "def list_repositories(user: str):\n",
-    "    headers = dict(Authorization=\"token {token}\".format(token=TOKEN))\n",
-    "    url = \"https://api.github.com/users/{user}/repos\".format(user=user)\n",
-    "    while url is not None:\n",
-    "        request = requests.get(url, headers=headers)\n",
-    "        request.raise_for_status()\n",
-    "        for repo in request.json():\n",
-    "            if not repo[\"fork\"]:\n",
-    "                yield repo[\"name\"], repo[\"clone_url\"], repo[\"size\"], repo[\"stargazers_count\"]\n",
-    "        url = parse_next(request.headers[\"Link\"])\n",
-    "\n",
-    "\n",
-    "with open('output/repos.json', 'w') as fh:\n",
-    "    json_dump(list(list_repositories(\"apache\")), fh)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Filtering for repos we want to analyze\n",
+    "def parse_last(link_header: str) -> Optional[int]:\n",
+    "    match = last_pattern.search(link_header)\n",
+    "    return int(match.group(1)) if match is not None else None\n",
     "\n",
-    "We'll keep the most popular repos by stars that are under a given size threshold."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from json import load as json_load\n",
-    "from operator import itemgetter\n",
-    "from pprint import pprint\n",
     "\n",
+    "def filter_repos(repos: List[Dict[str, Any]], max_size: int, limit: int):\n",
+    "    return [repo\n",
+    "            for repo in sorted(repos,\n",
+    "                               key=itemgetter(\"stars\"),\n",
+    "                               reverse=True)\n",
+    "            if repo[\"size\"] <= max_size][:limit]\n",
     "\n",
-    "MAX_SIZE = 50 * 1024\n",
     "\n",
+    "def list_repositories(user: str, token: str, max_size: int, limit: int):\n",
+    "    repos_list_headers = dict(Authorization=\"token %s\" % token)\n",
+    "    repos_url = \"https://api.github.com/users/%s/repos\" % user\n",
     "\n",
-    "filtered = []\n",
-    "with open('output/repos.json', 'r') as fh:\n",
-    "    repos = json_load(fh)\n",
-    "    filtered = [(name, clone_url)\n",
-    "                for name, clone_url, size, _ in sorted(repos, key=itemgetter(3), reverse=True)\n",
-    "                if size <= MAX_SIZE]\n",
+    "    request_total = requests.get(repos_url,\n",
+    "                                 headers=repos_list_headers)\n",
+    "    total_pages = parse_last(request_total.headers[\"Link\"])\n",
+    "    assert total_pages is not None\n",
     "\n",
+    "    def get_page_url(page: int):\n",
+    "        return \"%s?page=%d\" % (repos_url, page)\n",
     "\n",
-    "pprint(filtered)"
+    "    print(\"Retrieving repos list for user %s\" % user)\n",
+    "    repos = []\n",
+    "    for page in tqdm(range(total_pages)):\n",
+    "        request = requests.get(get_page_url(page),\n",
+    "                               headers=repos_list_headers)\n",
+    "        request.raise_for_status()\n",
+    "        for repo in request.json():\n",
+    "            if repo[\"fork\"]:\n",
+    "                continue\n",
+    "            repos.append(dict(\n",
+    "                name=repo[\"name\"],\n",
+    "                branch=repo[\"default_branch\"],\n",
+    "                clone_url=repo[\"clone_url\"],\n",
+    "                size=repo[\"size\"],\n",
+    "                stars=repo[\"stargazers_count\"]\n",
+    "            ))\n",
+    "\n",
+    "    print(\"Filtering to keep the top %d repositories\" % limit)\n",
+    "    filtered_repos = filter_repos(repos, max_size, limit)\n",
+    "\n",
+    "    def get_repo_sha_url(user: str, repo: str, branch: str):\n",
+    "        return \"https://api.github.com/repos/%s/%s/commits/%s\" % (\n",
+    "            user,\n",
+    "            repo,\n",
+    "            branch\n",
+    "        )\n",
+    "\n",
+    "    print(\"Getting SHA1 for each repository\")\n",
+    "    repo_sha_headers = dict(\n",
+    "        Authorization=\"token %s\" % token,\n",
+    "        Accept = \"application/vnd.github.VERSION.sha\"\n",
+    "    )\n",
+    "    for repo in tqdm(filtered_repos):\n",
+    "        request_sha = requests.get(\n",
+    "            get_repo_sha_url(user, repo[\"name\"], repo[\"branch\"]),\n",
+    "            headers=repo_sha_headers)\n",
+    "        if request_sha.status_code == 409:\n",
+    "            # Repo is empty\n",
+    "            continue\n",
+    "        else:\n",
+    "            request_sha.raise_for_status()\n",
+    "        repo[\"sha\"] = request_sha.text\n",
+    "    return filtered_repos\n",
+    "\n",
+    "\n",
+    "with open(repos_json, \"w\", encoding=\"utf8\") as fh:\n",
+    "    json_dump(list_repositories(\"apache\", TOKEN, MAX_SIZE, REPOS_NUMBER), fh)"
    ]
   },
   {
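
The markdown cell kept as context at the top of this hunk ("We iterate as long as the API gives us a pointer to another response page. We filter forks to focus on original repositories.") describes the pagination scheme that `parse_next` and the new `parse_last` implement against GitHub's `Link` response header. A minimal, self-contained illustration of what they extract is sketched below; the header value and the user id in it are hand-written examples, not recorded API responses.

```python
from re import compile as re_compile
from typing import Optional

# Same patterns as in the notebook cell above.
next_pattern = re_compile('<(https://api.github.com/user/[^/]+/repos\\?[^>]*page=\\d+[^>]*)>; rel="next"')
last_pattern = re_compile('<https://api.github.com/user/[^/]+/repos\\?[^>]*page=(\\d+)[^>]*>; rel="last"')


def parse_next(link_header: str) -> Optional[str]:
    match = next_pattern.search(link_header)
    return match.group(1) if match is not None else None


def parse_last(link_header: str) -> Optional[int]:
    match = last_pattern.search(link_header)
    return int(match.group(1)) if match is not None else None


# Hand-written example of a paginated Link header (user id 47359 is made up):
link = ('<https://api.github.com/user/47359/repos?page=2>; rel="next", '
        '<https://api.github.com/user/47359/repos?page=6>; rel="last"')
print(parse_next(link))  # https://api.github.com/user/47359/repos?page=2
print(parse_last(link))  # 6
```

`parse_next` is enough to walk page by page until the header stops advertising a next page, while `parse_last` lets the rewritten cell know the total page count up front so it can drive a progress bar.
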
@@ -101,19 +145,28 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "from json import load as json_load\n",
     "from multiprocessing.pool import ThreadPool\n",
     "\n",
     "\n",
     "PARALLEL_DOWNLOADS = 10\n",
-    "REPOS_NUMBER = 50\n",
     "\n",
     "\n",
-    "def clone_repo(clone_url: str):\n",
-    "    !cd /devfest/repos && git clone -q {clone_url}\n",
+    "def clone_repo(name: str, clone_url: str, sha):\n",
+    "    !cd {git_data_dir} \\\n",
+    "        && git clone -q {clone_url} {name} \\\n",
+    "        && cd {name} \\\n",
+    "        && git checkout -q {sha}\n",
     "\n",
     "\n",
-    "with ThreadPool(PARALLEL_DOWNLOADS) as pool:\n",
-    "    pool.map(clone_repo, [clone_url for _, clone_url in filtered[:REPOS_NUMBER]])"
+    "with ThreadPool(PARALLEL_DOWNLOADS) as pool, \\\n",
+    "        open(repos_json, encoding=\"utf8\") as fh:\n",
+    "    repos = json_load(fh)\n",
+    "    pool.starmap(clone_repo,\n",
+    "                 [(repo[\"name\"],\n",
+    "                   repo[\"clone_url\"],\n",
+    "                   repo[\"sha\"])\n",
+    "                  for repo in repos])"
    ]
   }
  ],
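
The new `clone_repo` cell relies on IPython's `!` shell escape with `{...}` interpolation to clone each repository and pin it to the SHA recorded in `repos.json`. Outside a notebook, a rough equivalent of that clone-then-checkout step could look like the sketch below; the function name, path, and record fields mirror the notebook, but the `subprocess`-based body is an assumption for illustration, not code from this commit.

```python
import subprocess
from os.path import join as path_join

# Same directory the first added notebook cell creates inside the Jupyter container.
git_data_dir = path_join("/devfest", "repos", "git-data")


def clone_repo(name: str, clone_url: str, sha: str) -> None:
    """Clone one repository and check out the commit recorded in repos.json."""
    target = path_join(git_data_dir, name)
    subprocess.run(["git", "clone", "-q", clone_url, target], check=True)
    subprocess.run(["git", "checkout", "-q", sha], cwd=target, check=True)


# Used the same way as the notebook cell, e.g.:
# with ThreadPool(PARALLEL_DOWNLOADS) as pool:
#     pool.starmap(clone_repo,
#                  [(r["name"], r["clone_url"], r["sha"]) for r in repos])
```

Checking out a fixed SHA per repository keeps the downloaded corpus stable across runs, which is the point of recording the default-branch commit in `repos.json` in the previous cell.
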
