|
20 | 20 | "We iterate as long as the API gives us a pointer to another response page. We filter forks to focus on original repositories." |
21 | 21 | ] |
22 | 22 | }, |
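For reference, the `Link` response header that drives this pagination packs the page URLs into one comma-separated string of `<url>; rel="..."` pairs. A minimal, standalone sketch of pulling the last page number out of one (the header value below is invented for illustration, not a real API response):

    from re import compile as re_compile

    # Illustrative Link header, shaped like the ones the GitHub API returns
    link_header = (
        '<https://api.github.com/user/47359/repos?page=2>; rel="next", '
        '<https://api.github.com/user/47359/repos?page=9>; rel="last"'
    )

    match = re_compile(r'page=(\d+)[^>]*>; rel="last"').search(link_header)
    print(int(match.group(1)))  # -> 9

The `next_pattern` and `last_pattern` defined in the cells below follow the same idea, with the full URL anchored in the expression.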
| 23 | + { |
| 24 | + "cell_type": "code", |
| 25 | + "execution_count": null, |
| 26 | + "metadata": {}, |
| 27 | + "outputs": [], |
| 28 | + "source": [ |
| 29 | + "from os import makedirs\n", |
| 30 | + "from os.path import join as path_join\n", |
| 31 | + "\n", |
| 32 | + "\n", |
| 33 | + "git_data_dir = path_join(\"/devfest\", \"repos\", \"git-data\")\n", |
| 34 | + "makedirs(git_data_dir, exist_ok=True)\n", |
| 35 | + "repos_json = path_join(git_data_dir, \"repos.json\")" |
| 36 | + ] |
| 37 | + }, |
23 | 38 | { |
24 | 39 | "cell_type": "code", |
25 | 40 | "execution_count": null, |
26 | 41 | "metadata": {}, |
27 | 42 | "outputs": [], |
28 | 43 | "source": [ |
29 | 44 | "from json import dump as json_dump\n", |
| 45 | + "from operator import itemgetter\n", |
30 | 46 | "from re import compile as re_compile\n", |
31 | | - "from typing import Optional\n", |
| 47 | + "from typing import Any, Dict, List, Optional\n", |
32 | 48 | "\n", |
33 | 49 | "import requests\n", |
| 50 | + "from tqdm import tqdm_notebook as tqdm\n", |
34 | 51 | "\n", |
35 | 52 | "\n", |
36 | 53 | "# Generate a personal access token here: https://github.com/settings/tokens\n", |
37 | 54 | "TOKEN = # See comment above, please generate a token and put it here\n", |
| 55 | + "MAX_SIZE = 50 * 1024\n", |
| 56 | + "REPOS_NUMBER = 50\n", |
38 | 57 | "\n", |
39 | 58 | "\n", |
40 | 59 | "next_pattern = re_compile('<(https://api.github.com/user/[^/]+/repos\\?[^>]*page=\\d+[^>]*)>; rel=\"next\"')\n", |
| 60 | + "last_pattern = re_compile('<https://api.github.com/user/[^/]+/repos\\?[^>]*page=(\\d+)[^>]*>; rel=\"last\"')\n", |
41 | 61 | "\n", |
42 | 62 | "\n", |
43 | 63 | "def parse_next(link_header: str) -> Optional[str]:\n", |
44 | 64 | " match = next_pattern.search(link_header)\n", |
45 | 65 | " return match.group(1) if match is not None else None\n", |
46 | 66 | "\n", |
47 | 67 | "\n", |
48 | | - "def list_repositories(user: str):\n", |
49 | | - " headers = dict(Authorization=\"token {token}\".format(token=TOKEN))\n", |
50 | | - " url = \"https://api.github.com/users/{user}/repos\".format(user=user)\n", |
51 | | - " while url is not None:\n", |
52 | | - " request = requests.get(url, headers=headers)\n", |
53 | | - " request.raise_for_status()\n", |
54 | | - " for repo in request.json():\n", |
55 | | - " if not repo[\"fork\"]:\n", |
56 | | - " yield repo[\"name\"], repo[\"clone_url\"], repo[\"size\"], repo[\"stargazers_count\"]\n", |
57 | | - " url = parse_next(request.headers[\"Link\"])\n", |
58 | | - "\n", |
59 | | - "\n", |
60 | | - "with open('output/repos.json', 'w') as fh:\n", |
61 | | - " json_dump(list(list_repositories(\"apache\")), fh)" |
62 | | - ] |
63 | | - }, |
64 | | - { |
65 | | - "cell_type": "markdown", |
66 | | - "metadata": {}, |
67 | | - "source": [ |
68 | | - "## Filtering for repos we want to analyze\n", |
| 68 | + "def parse_last(link_header: str) -> Optional[int]:\n", |
| 69 | + " match = last_pattern.search(link_header)\n", |
| 70 | + " return int(match.group(1)) if match is not None else None\n", |
69 | 71 | "\n", |
70 | | - "We'll keep the most popular repos by stars that are under a given size threshold." |
71 | | - ] |
72 | | - }, |
73 | | - { |
74 | | - "cell_type": "code", |
75 | | - "execution_count": null, |
76 | | - "metadata": {}, |
77 | | - "outputs": [], |
78 | | - "source": [ |
79 | | - "from json import load as json_load\n", |
80 | | - "from operator import itemgetter\n", |
81 | | - "from pprint import pprint\n", |
82 | 72 | "\n", |
| 73 | + "def filter_repos(repos: List[Dict[str, Any]], max_size: int, limit: int):\n", |
| 74 | + " return [repo\n", |
| 75 | + " for repo in sorted(repos,\n", |
| 76 | + " key=itemgetter(\"stars\"),\n", |
| 77 | + " reverse=True)\n", |
| 78 | + " if repo[\"size\"] <= max_size][:limit]\n", |
83 | 79 | "\n", |
84 | | - "MAX_SIZE = 50 * 1024\n", |
85 | 80 | "\n", |
| 81 | + "def list_repositories(user: str, token: str, max_size: int, limit: int):\n", |
| 82 | + " repos_list_headers = dict(Authorization=\"token %s\" % token)\n", |
| 83 | + " repos_url = \"https://api.github.com/users/%s/repos\" % user\n", |
86 | 84 | "\n", |
87 | | - "filtered = []\n", |
88 | | - "with open('output/repos.json', 'r') as fh:\n", |
89 | | - " repos = json_load(fh)\n", |
90 | | - " filtered = [(name, clone_url)\n", |
91 | | - " for name, clone_url, size, _ in sorted(repos, key=itemgetter(3), reverse=True)\n", |
92 | | - " if size <= MAX_SIZE]\n", |
| 85 | + " request_total = requests.get(repos_url,\n", |
| 86 | + " headers=repos_list_headers)\n", |
| 87 | + " total_pages = parse_last(request_total.headers[\"Link\"])\n", |
| 88 | + " assert total_pages is not None\n", |
93 | 89 | "\n", |
| 90 | + " def get_page_url(page: int):\n", |
| 91 | + " return \"%s?page=%d\" % (repos_url, page)\n", |
94 | 92 | "\n", |
95 | | - "pprint(filtered)" |
| 93 | + " print(\"Retrieving repos list for user %s\" % user)\n", |
| 94 | + " repos = []\n", |
| 95 | + " for page in tqdm(range(total_pages)):\n", |
| 96 | + " request = requests.get(get_page_url(page),\n", |
| 97 | + " headers=repos_list_headers)\n", |
| 98 | + " request.raise_for_status()\n", |
| 99 | + " for repo in request.json():\n", |
| 100 | + " if repo[\"fork\"]:\n", |
| 101 | + " continue\n", |
| 102 | + " repos.append(dict(\n", |
| 103 | + " name=repo[\"name\"],\n", |
| 104 | + " branch=repo[\"default_branch\"],\n", |
| 105 | + " clone_url=repo[\"clone_url\"],\n", |
| 106 | + " size=repo[\"size\"],\n", |
| 107 | + " stars=repo[\"stargazers_count\"]\n", |
| 108 | + " ))\n", |
| 109 | + "\n", |
| 110 | + " print(\"Filtering to keep the top %d repositories\" % limit)\n", |
| 111 | + " filtered_repos = filter_repos(repos, max_size, limit)\n", |
| 112 | + "\n", |
| 113 | + " def get_repo_sha_url(user: str, repo: str, branch: str):\n", |
| 114 | + " return \"https://api.github.com/repos/%s/%s/commits/%s\" % (\n", |
| 115 | + " user,\n", |
| 116 | + " repo,\n", |
| 117 | + " branch\n", |
| 118 | + " )\n", |
| 119 | + "\n", |
| 120 | + " print(\"Getting SHA1 for each repository\")\n", |
| 121 | + " repo_sha_headers = dict(\n", |
| 122 | + " Authorization=\"token %s\" % token,\n", |
| 123 | + " Accept = \"application/vnd.github.VERSION.sha\"\n", |
| 124 | + " )\n", |
| 125 | + " for repo in tqdm(filtered_repos):\n", |
| 126 | + " request_sha = requests.get(\n", |
| 127 | + " get_repo_sha_url(user, repo[\"name\"], repo[\"branch\"]),\n", |
| 128 | + " headers=repo_sha_headers)\n", |
| 129 | + " if request_sha.status_code == 409:\n", |
| 130 | + " # Repo is empty\n", |
| 131 | + " continue\n", |
| 132 | + " else:\n", |
| 133 | + " request_sha.raise_for_status()\n", |
| 134 | + " repo[\"sha\"] = request_sha.text\n", |
| 135 | + " return filtered_repos\n", |
| 136 | + "\n", |
| 137 | + "\n", |
| 138 | + "with open(repos_json, \"w\", encoding=\"utf8\") as fh:\n", |
| 139 | + " json_dump(list_repositories(\"apache\", TOKEN, MAX_SIZE, REPOS_NUMBER), fh)" |
96 | 140 | ] |
97 | 141 | }, |
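Since `filter_repos` above is a pure function over plain dicts, it can be sanity-checked on toy data without spending any API quota. A quick check, assuming the cell above has been executed (the repo entries are invented):

    toy_repos = [
        dict(name="big", size=999999, stars=500),   # over the size cap
        dict(name="popular", size=1024, stars=300),
        dict(name="niche", size=2048, stars=10),
    ]
    # Sorted by stars, size-capped, truncated to the limit
    print([repo["name"] for repo in filter_repos(toy_repos, MAX_SIZE, 2)])
    # -> ['popular', 'niche']

Also worth noting: the `application/vnd.github.VERSION.sha` media type used in the same cell makes the commits endpoint return the bare SHA-1 as plain text, which is why the code can store `request_sha.text` directly instead of parsing a JSON document.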
98 | 142 | { |
99 | 143 | "cell_type": "code", |
100 | 144 | "execution_count": null, |
101 | 145 | "metadata": {}, |
102 | 146 | "outputs": [], |
103 | 147 | "source": [ |
| 148 | + "from json import load as json_load\n", |
104 | 149 | "from multiprocessing.pool import ThreadPool\n", |
105 | 150 | "\n", |
106 | 151 | "\n", |
107 | 152 | "PARALLEL_DOWNLOADS = 10\n", |
108 | | - "REPOS_NUMBER = 50\n", |
109 | 153 | "\n", |
110 | 154 | "\n", |
111 | | - "def clone_repo(clone_url: str):\n", |
112 | | - " !cd /devfest/repos && git clone -q {clone_url}\n", |
| 155 | + "def clone_repo(name: str, clone_url: str, sha):\n", |
| 156 | + " !cd {git_data_dir} \\\n", |
| 157 | + " && git clone -q {clone_url} {name} \\\n", |
| 158 | + " && cd {name} \\\n", |
| 159 | + " && git checkout -q {sha}\n", |
113 | 160 | "\n", |
114 | 161 | "\n", |
115 | | - "with ThreadPool(PARALLEL_DOWNLOADS) as pool:\n", |
116 | | - " pool.map(clone_repo, [clone_url for _, clone_url in filtered[:REPOS_NUMBER]])" |
| 162 | + "with ThreadPool(PARALLEL_DOWNLOADS) as pool, \\\n", |
| 163 | + " open(repos_json, encoding=\"utf8\") as fh:\n", |
| 164 | + " repos = json_load(fh)\n", |
| 165 | + " pool.starmap(clone_repo,\n", |
| 166 | + " [(repo[\"name\"],\n", |
| 167 | + " repo[\"clone_url\"],\n", |
| 168 | + " repo[\"sha\"])\n", |
| 169 | + " for repo in repos])" |
117 | 170 | ] |
118 | 171 | } |
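The `!` line in `clone_repo` relies on IPython's shell magic and `{...}` interpolation, so it only works inside a notebook. As a plain-Python alternative, the same clone-then-pin step could be written with `subprocess` (a sketch reusing `git_data_dir` and `path_join` from the cells above):

    from subprocess import run

    def clone_repo_plain(name: str, clone_url: str, sha: str) -> None:
        repo_dir = path_join(git_data_dir, name)
        # check=True raises CalledProcessError if git fails,
        # mirroring raise_for_status() on the HTTP side
        run(["git", "clone", "-q", clone_url, repo_dir], check=True)
        run(["git", "checkout", "-q", sha], cwd=repo_dir, check=True)

Checking out the recorded SHA pins every clone to the exact commit that was listed, so a later analysis sees the same trees even if the upstream default branches move on.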
119 | 172 | ], |
|