Skip to content

Commit 3eac0ca

Browse files
committed
Improve download notebook (and fix a bug)
Signed-off-by: m09 <[email protected]>
1 parent 71ee1d0 commit 3eac0ca

File tree

1 file changed

+59
-23
lines changed

1 file changed

+59
-23
lines changed

notebooks/Download repositories.ipynb

Lines changed: 59 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,38 @@
2626
"metadata": {},
2727
"outputs": [],
2828
"source": [
29+
"from logging import getLogger\n",
2930
"from os import makedirs\n",
3031
"from os.path import join as path_join\n",
3132
"\n",
33+
"from coloredlogs import install as coloredlogs_install\n",
34+
"\n",
35+
"\n",
36+
"coloredlogs_install()\n",
37+
"logger = getLogger(\"downloader\")\n",
38+
"\n",
3239
"\n",
3340
"git_data_dir = path_join(\"/devfest\", \"repos\", \"git-data\")\n",
3441
"makedirs(git_data_dir, exist_ok=True)\n",
3542
"repos_json = path_join(git_data_dir, \"repos.json\")"
3643
]
3744
},
45+
{
46+
"cell_type": "markdown",
47+
"metadata": {},
48+
"source": [
49+
"To use the GitHub API, we need a token. Please create one in your [GitHub account settings](https://github.com/settings/tokens) (the basic permissions are fine), and fill it in here:"
50+
]
51+
},
52+
{
53+
"cell_type": "code",
54+
"execution_count": null,
55+
"metadata": {},
56+
"outputs": [],
57+
"source": [
58+
"TOKEN = "
59+
]
60+
},
3861
{
3962
"cell_type": "code",
4063
"execution_count": null,
@@ -50,12 +73,6 @@
5073
"from tqdm import tqdm_notebook as tqdm\n",
5174
"\n",
5275
"\n",
53-
"# Generate a personal access token here: https://github.com/settings/tokens\n",
54-
"TOKEN = # See comment above, please generate a token and put it here\n",
55-
"MAX_SIZE = 50 * 1024\n",
56-
"REPOS_NUMBER = 50\n",
57-
"\n",
58-
"\n",
5976
"next_pattern = re_compile('<(https://api.github.com/user/[^/]+/repos\\?[^>]*page=\\d+[^>]*)>; rel=\"next\"')\n",
6077
"last_pattern = re_compile('<https://api.github.com/user/[^/]+/repos\\?[^>]*page=(\\d+)[^>]*>; rel=\"last\"')\n",
6178
"\n",
@@ -70,15 +87,11 @@
7087
" return int(match.group(1)) if match is not None else None\n",
7188
"\n",
7289
"\n",
73-
"def filter_repos(repos: List[Dict[str, Any]], max_size: int, limit: int):\n",
74-
" return [repo\n",
75-
" for repo in sorted(repos,\n",
76-
" key=itemgetter(\"stars\"),\n",
77-
" reverse=True)\n",
78-
" if repo[\"size\"] <= max_size][:limit]\n",
79-
"\n",
80-
"\n",
81-
"def list_repositories(user: str, token: str, max_size: int, limit: int):\n",
90+
"def list_repositories(user: str,\n",
91+
" token: str,\n",
92+
" max_size_mb: int,\n",
93+
" repos_number: int\n",
94+
" ) -> List[Dict[str, Any]]:\n",
8295
" repos_list_headers = dict(Authorization=\"token %s\" % token)\n",
8396
" repos_url = \"https://api.github.com/users/%s/repos\" % user\n",
8497
"\n",
@@ -90,9 +103,9 @@
90103
" def get_page_url(page: int):\n",
91104
" return \"%s?page=%d\" % (repos_url, page)\n",
92105
"\n",
93-
" print(\"Retrieving repos list for user %s\" % user)\n",
106+
" logger.info(\"Retrieving repos list for user %s\" % user)\n",
94107
" repos = []\n",
95-
" for page in tqdm(range(total_pages)):\n",
108+
" for page in tqdm(range(1, total_pages + 1)):\n",
96109
" request = requests.get(get_page_url(page),\n",
97110
" headers=repos_list_headers)\n",
98111
" request.raise_for_status()\n",
@@ -107,8 +120,22 @@
107120
" stars=repo[\"stargazers_count\"]\n",
108121
" ))\n",
109122
"\n",
110-
" print(\"Filtering to keep the top %d repositories\" % limit)\n",
111-
" filtered_repos = filter_repos(repos, max_size, limit)\n",
123+
" if max_size_mb is not None:\n",
124+
" logger.info(\n",
125+
" \"Filtering to keep only repositories under %.2f MB\",\n",
126+
" max_size_mb\n",
127+
" )\n",
128+
" repos = [repo for repo in repos\n",
129+
" if repo[\"size\"] <= max_size_mb * 1024]\n",
130+
"\n",
131+
" if repos_number is not None:\n",
132+
" logger.info(\n",
133+
" \"Filtering to keep only the %d most popular repositories\",\n",
134+
" repos_number\n",
135+
" )\n",
136+
" repos = [repo for repo in sorted(repos,\n",
137+
" key=itemgetter(\"stars\"),\n",
138+
" reverse=True)][:repos_number]\n",
112139
"\n",
113140
" def get_repo_sha_url(user: str, repo: str, branch: str):\n",
114141
" return \"https://api.github.com/repos/%s/%s/commits/%s\" % (\n",
@@ -117,12 +144,12 @@
117144
" branch\n",
118145
" )\n",
119146
"\n",
120-
" print(\"Getting SHA1 for each repository\")\n",
147+
" logger.info(\"Getting SHA1 for each repository\")\n",
121148
" repo_sha_headers = dict(\n",
122149
" Authorization=\"token %s\" % token,\n",
123150
" Accept = \"application/vnd.github.VERSION.sha\"\n",
124151
" )\n",
125-
" for repo in tqdm(filtered_repos):\n",
152+
" for repo in tqdm(repos):\n",
126153
" request_sha = requests.get(\n",
127154
" get_repo_sha_url(user, repo[\"name\"], repo[\"branch\"]),\n",
128155
" headers=repo_sha_headers)\n",
@@ -132,11 +159,20 @@
132159
" else:\n",
133160
" request_sha.raise_for_status()\n",
134161
" repo[\"sha\"] = request_sha.text\n",
135-
" return filtered_repos\n",
162+
" return repos\n",
136163
"\n",
137164
"\n",
138165
"with open(repos_json, \"w\", encoding=\"utf8\") as fh:\n",
139-
" json_dump(list_repositories(\"apache\", TOKEN, MAX_SIZE, REPOS_NUMBER), fh)"
166+
" json_dump(\n",
167+
" list_repositories(\n",
168+
" user=\"apache\",\n",
169+
" # Generate a personal access token here\n",
170+
" # https://github.com/settings/tokens\n",
171+
" token=TOKEN,\n",
172+
" max_size_mb=50,\n",
173+
" repos_number=50),\n",
174+
" fh\n",
175+
" )"
140176
]
141177
},
142178
{

0 commit comments

Comments
 (0)