# ------------------------------------------------------------------------
# A class to download and organize information from GitHub.

-import json, logging, math, sys
+import json, logging, math, sys, time
import requests

class GitHubIssues:

-    def __init__(self, items=[], max_results=10000, token=None):
+    def __init__(self, items=None, token=None):
        self._token = token
-        self._json = {'items': items}
-        self._per_page = 100
-        self._max_results = max_results
+        # Default to None rather than [] so that instances never share
+        # one mutable default list (which extend() would then corrupt).
+        self._items = items if items is not None else []
+        self._delay_per_request = 5
+        self._max_requests = 100

    def repo(self, org, repo):
        return GitHubIssues(self.issues(lambda item: item['repository_url'].endswith(f'/repos/{org}/{repo}')),
-                            max_results=self._max_results, token=self._token)
+                            token=self._token)

    def prs(self):
-        return GitHubIssues(self.issues(lambda item: item['pull_request'] is True),
-                            max_results=self._max_results, token=self._token)
+        # Search results mark pull requests by the presence of a
+        # 'pull_request' key, not a boolean value, so test for the key.
+        return GitHubIssues(self.issues(lambda item: 'pull_request' in item),
+                            token=self._token)

    def issues(self, predicate=lambda x: True):
-        return list(filter(predicate, self._json['items']))
+        return list(filter(predicate, self._items))

    def load(self, filepath):
        with open(filepath) as f:
            result = json.loads(f.read())
-            self._merge(result)
+            self._items.extend(result)

    def save(self, filepath):
        with open(filepath, 'w') as f:
-            return json.dump(self._json, f, sort_keys=True, indent=4)
+            json.dump(self._items, f, sort_keys=True, indent=4)
+
+    @staticmethod
+    def _search_url(query):
+        return f"https://api.github.com/search/issues?q={query}+is:open&sort=created&order=asc&per_page=100"

    def download(self, query):
        """
        Downloads issues from GitHub according to the given query.
        """
-
-        url = f"https://api.github.com/search/issues?q={query}+is:open&sort=created&order=asc&per_page={self._per_page}"
-        url = self._download_page(url)
-
-        max_pages = math.ceil(self._max_results / self._per_page)
-        for i in range(1, max_pages):
+        url = GitHubIssues._search_url(query)
+        for _ in range(self._max_requests):
+            url = self._download_page(url, query)
            if not url: break
-            url = self._download_page(url)
+            time.sleep(self._delay_per_request)

-    def _download_page(self, url):
-        headers = {}
-        if self._token: headers['Authorization'] = self._token
+    def _download_page(self, url, query):
+        headers = {'User-Agent': 'status.scijava.org'}
+        if self._token: headers['Authorization'] = "token " + self._token

        logging.debug(f'Downloading {url}')
-        response = requests.get(url, headers)
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()
        result = response.json()
+        self._items.extend(result['items'])

-        self._merge(result)
-
-        return response.links['next']['url'] if 'next' in response.links else None
-
-    def _merge(self, content):
-        for key, value in content.items():
-            if key in self._json and type(self._json[key]) == list:
-                # Append values to the list.
-                self._json[key].extend(value)
-            else:
-                # Overwrite value in the dict.
-                self._json[key] = value
+        next_url = response.links['next']['url'] if 'next' in response.links else None
+        if not next_url and result['total_count'] > 1000 and len(result['items']) > 0:
+            # We hit the 1000-issue limit. Continue the search just beyond the last issue we got.
+            next_url = GitHubIssues._search_url(f"{query}+created:>{result['items'][-1]['created_at']}")
+        return next_url

if __name__ == '__main__':
    if len(sys.argv) < 2:
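
The subtle piece of this change is the continuation logic at the end of _download_page: the GitHub Search API serves at most 1,000 results per query, so once the 'next' link runs out, the code restarts the search restricted to issues created after the last one received. A sketch of the URLs this produces, with a made-up timestamp purely for illustration:

    GitHubIssues._search_url('org:scijava')
    # https://api.github.com/search/issues?q=org:scijava+is:open&sort=created&order=asc&per_page=100

    GitHubIssues._search_url('org:scijava+created:>2019-06-01T12:34:56Z')
    # https://api.github.com/search/issues?q=org:scijava+created:>2019-06-01T12:34:56Z+is:open&sort=created&order=asc&per_page=100

Since results are sorted by created date ascending, each restart resumes where the previous window ended; issues sharing the exact created_at timestamp of the last item could in principle be skipped by the strict '>' comparison.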
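For anyone wanting to exercise the revised class, a minimal hypothetical driver might look like this; the query string, token placeholder, repository, and file name are illustrative assumptions, not part of this commit:

    gh = GitHubIssues(token='<personal-access-token>')  # token is optional
    gh.download('org:scijava')  # paginated search, sleeping between requests
    gh.save('issues.json')      # persist the accumulated raw items

    common = gh.repo('scijava', 'scijava-common')  # narrow to one repository
    print(len(common.prs().issues()), 'open pull requests')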