|
26 | 26 | "metadata": {}, |
27 | 27 | "outputs": [], |
28 | 28 | "source": [ |
| 29 | + "from logging import getLogger\n", |
29 | 30 | "from os import makedirs\n", |
30 | 31 | "from os.path import join as path_join\n", |
31 | 32 | "\n", |
| 33 | + "from coloredlogs import install as coloredlogs_install\n", |
| 34 | + "\n", |
| 35 | + "\n", |
| 36 | + "coloredlogs_install()\n", |
| 37 | + "logger = getLogger(\"downloader\")\n", |
| 38 | + "\n", |
32 | 39 | "\n", |
33 | 40 | "git_data_dir = path_join(\"/devfest\", \"repos\", \"git-data\")\n", |
34 | 41 | "makedirs(git_data_dir, exist_ok=True)\n", |
35 | 42 | "repos_json = path_join(git_data_dir, \"repos.json\")" |
36 | 43 | ] |
37 | 44 | }, |
| 45 | + { |
| 46 | + "cell_type": "markdown", |
| 47 | + "metadata": {}, |
| 48 | + "source": [ |
| 49 | + "To use the GitHub API, we need a token. Please create one in your [GitHub account settings](https://github.com/settings/tokens) (the basic permissions are fine), and fill it in here:" |
| 50 | + ] |
| 51 | + }, |
| 52 | + { |
| 53 | + "cell_type": "code", |
| 54 | + "execution_count": null, |
| 55 | + "metadata": {}, |
| 56 | + "outputs": [], |
| 57 | + "source": [ |
| 58 | + "TOKEN = " |
| 59 | + ] |
| 60 | + }, |
38 | 61 | { |
39 | 62 | "cell_type": "code", |
40 | 63 | "execution_count": null, |
|
50 | 73 | "from tqdm import tqdm_notebook as tqdm\n", |
51 | 74 | "\n", |
52 | 75 | "\n", |
53 | | - "# Generate a personal access token here: https://github.com/settings/tokens\n", |
54 | | - "TOKEN = # See comment above, please generate a token and put it here\n", |
55 | | - "MAX_SIZE = 50 * 1024\n", |
56 | | - "REPOS_NUMBER = 50\n", |
57 | | - "\n", |
58 | | - "\n", |
59 | 76 | "next_pattern = re_compile('<(https://api.github.com/user/[^/]+/repos\\?[^>]*page=\\d+[^>]*)>; rel=\"next\"')\n", |
60 | 77 | "last_pattern = re_compile('<https://api.github.com/user/[^/]+/repos\\?[^>]*page=(\\d+)[^>]*>; rel=\"last\"')\n", |
61 | 78 | "\n", |
|
70 | 87 | " return int(match.group(1)) if match is not None else None\n", |
71 | 88 | "\n", |
72 | 89 | "\n", |
73 | | - "def filter_repos(repos: List[Dict[str, Any]], max_size: int, limit: int):\n", |
74 | | - " return [repo\n", |
75 | | - " for repo in sorted(repos,\n", |
76 | | - " key=itemgetter(\"stars\"),\n", |
77 | | - " reverse=True)\n", |
78 | | - " if repo[\"size\"] <= max_size][:limit]\n", |
79 | | - "\n", |
80 | | - "\n", |
81 | | - "def list_repositories(user: str, token: str, max_size: int, limit: int):\n", |
| 90 | + "def list_repositories(user: str,\n", |
| 91 | + " token: str,\n", |
| 92 | + " max_size_mb: int,\n", |
| 93 | + " repos_number: int\n", |
| 94 | + " ) -> List[Dict[str, Any]]:\n", |
82 | 95 | " repos_list_headers = dict(Authorization=\"token %s\" % token)\n", |
83 | 96 | " repos_url = \"https://api.github.com/users/%s/repos\" % user\n", |
84 | 97 | "\n", |
|
90 | 103 | " def get_page_url(page: int):\n", |
91 | 104 | " return \"%s?page=%d\" % (repos_url, page)\n", |
92 | 105 | "\n", |
93 | | - " print(\"Retrieving repos list for user %s\" % user)\n", |
| 106 | + " logger.info(\"Retrieving repos list for user %s\" % user)\n", |
94 | 107 | " repos = []\n", |
95 | | - " for page in tqdm(range(total_pages)):\n", |
| 108 | + " for page in tqdm(range(1, total_pages + 1)):\n", |
96 | 109 | " request = requests.get(get_page_url(page),\n", |
97 | 110 | " headers=repos_list_headers)\n", |
98 | 111 | " request.raise_for_status()\n", |
|
107 | 120 | " stars=repo[\"stargazers_count\"]\n", |
108 | 121 | " ))\n", |
109 | 122 | "\n", |
110 | | - " print(\"Filtering to keep the top %d repositories\" % limit)\n", |
111 | | - " filtered_repos = filter_repos(repos, max_size, limit)\n", |
| 123 | + " if max_size_mb is not None:\n", |
| 124 | + " logger.info(\n", |
| 125 | + " \"Filtering to keep only repositories under %.2f MB\",\n", |
| 126 | + " max_size_mb\n", |
| 127 | + " )\n", |
| 128 | + " repos = [repo for repo in repos\n", |
| 129 | + " if repo[\"size\"] <= max_size_mb * 1024]\n", |
| 130 | + "\n", |
| 131 | + " if repos_number is not None:\n", |
| 132 | + " logger.info(\n", |
| 133 | + " \"Filtering to keep only the %d most popular repositories\",\n", |
| 134 | + " repos_number\n", |
| 135 | + " )\n", |
| 136 | + " repos = [repo for repo in sorted(repos,\n", |
| 137 | + " key=itemgetter(\"stars\"),\n", |
| 138 | + " reverse=True)][:repos_number]\n", |
112 | 139 | "\n", |
113 | 140 | " def get_repo_sha_url(user: str, repo: str, branch: str):\n", |
114 | 141 | " return \"https://api.github.com/repos/%s/%s/commits/%s\" % (\n", |
|
117 | 144 | " branch\n", |
118 | 145 | " )\n", |
119 | 146 | "\n", |
120 | | - " print(\"Getting SHA1 for each repository\")\n", |
| 147 | + " logger.info(\"Getting SHA1 for each repository\")\n", |
121 | 148 | " repo_sha_headers = dict(\n", |
122 | 149 | " Authorization=\"token %s\" % token,\n", |
123 | 150 | " Accept = \"application/vnd.github.VERSION.sha\"\n", |
124 | 151 | " )\n", |
125 | | - " for repo in tqdm(filtered_repos):\n", |
| 152 | + " for repo in tqdm(repos):\n", |
126 | 153 | " request_sha = requests.get(\n", |
127 | 154 | " get_repo_sha_url(user, repo[\"name\"], repo[\"branch\"]),\n", |
128 | 155 | " headers=repo_sha_headers)\n", |
|
132 | 159 | " else:\n", |
133 | 160 | " request_sha.raise_for_status()\n", |
134 | 161 | " repo[\"sha\"] = request_sha.text\n", |
135 | | - " return filtered_repos\n", |
| 162 | + " return repos\n", |
136 | 163 | "\n", |
137 | 164 | "\n", |
138 | 165 | "with open(repos_json, \"w\", encoding=\"utf8\") as fh:\n", |
139 | | - " json_dump(list_repositories(\"apache\", TOKEN, MAX_SIZE, REPOS_NUMBER), fh)" |
| 166 | + " json_dump(\n", |
| 167 | + " list_repositories(\n", |
| 168 | + " user=\"apache\",\n", |
| 169 | + " # Generate a personal access token here\n", |
| 170 | + " # https://github.com/settings/tokens\n", |
| 171 | + " token=TOKEN,\n", |
| 172 | + " max_size_mb=50,\n", |
| 173 | + " repos_number=50),\n", |
| 174 | + " fh\n", |
| 175 | + " )" |
140 | 176 | ] |
141 | 177 | }, |
142 | 178 | { |
|
0 commit comments