Commit af62066
github.py: handle more than 1000 results
GitHub returns a maximum of 1000 items per search query. But since we request issues in creation date order, we can use a created:> constraint to continue the search where the results left off, until we stop hitting the 1000-item limit. To avoid going off the rails, we set the maximum number of requests to 100 by default (i.e. 10000 issues total). We also pass a User-Agent, and wait 5 seconds between each request, to encourage GitHub not to stop us cold with 403 Forbidden errors.
1 parent: f0680bc
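To illustrate the continuation trick described above, here is a minimal sketch, assuming a hypothetical search query (org:scijava) and a hypothetical created_at timestamp; only the URL format and the created:> constraint come from this commit:

def search_url(query):
    # Same URL format as GitHubIssues._search_url in the diff below.
    return (f"https://api.github.com/search/issues"
            f"?q={query}+is:open&sort=created&order=asc&per_page=100")

query = "org:scijava"  # hypothetical example query, not from this commit
first_page = search_url(query)

# Results are sorted oldest-first, so once the 1000-item cap is reached, the
# search can resume just beyond the newest issue seen so far via created:>.
last_created_at = "2021-03-05T12:34:56Z"  # hypothetical timestamp of the last item returned
next_search = search_url(f"{query}+created:>{last_created_at}")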

1 file changed: github.py (29 additions, 33 deletions)
@@ -8,70 +8,66 @@
 # ------------------------------------------------------------------------
 # A class to download and organize information from GitHub.

-import json, logging, math, sys
+import json, logging, math, sys, time
 import requests

 class GitHubIssues:

-    def __init__(self, items=[], max_results=10000, token=None):
+    def __init__(self, items=[], token=None):
         self._token = token
-        self._json = {'items': items}
-        self._per_page = 100
-        self._max_results = max_results
+        self._items = items
+        self._delay_per_request = 5
+        self._max_requests = 100

     def repo(self, org, repo):
         return GitHubIssues(self.issues(lambda item: item['repository_url'].endswith(f'/repos/{org}/{repo}')),
-                            max_results=self._max_results, token=self._token)
+                            token=self._token)

     def prs(self):
         return GitHubIssues(self.issues(lambda item: item['pull_request'] is True),
-                            max_results=self._max_results, token=self._token)
+                            token=self._token)

     def issues(self, predicate=lambda x: True):
-        return list(filter(predicate, self._json['items']))
+        return list(filter(predicate, self._items))

     def load(self, filepath):
         with open(filepath) as f:
             result = json.loads(f.read())
-            self._merge(result)
+            self._items.extend(result)

     def save(self, filepath):
         with open(filepath, 'w') as f:
-            return json.dump(self._json, f, sort_keys=True, indent=4)
+            return json.dump(self._items, f, sort_keys=True, indent=4)
+
+    @staticmethod
+    def _search_url(query):
+        return f"https://api.github.com/search/issues?q={query}+is:open&sort=created&order=asc&per_page=100"

     def download(self, query):
         """
         Downloads issues from GitHub according to the given query.
         """
-
-        url = f"https://api.github.com/search/issues?q={query}+is:open&sort=created&order=asc&per_page={self._per_page}"
-        url = self._download_page(url)
-
-        max_pages = math.ceil(self._max_results / self._per_page)
-        for i in range(1, max_pages):
+        url = GitHubIssues._search_url(query)
+        for _ in range(self._max_requests):
+            url = self._download_page(url, query)
             if not url: break
-            url = self._download_page(url)
+            time.sleep(self._delay_per_request)

-    def _download_page(self, url):
-        headers = {}
-        if self._token: headers['Authorization'] = self._token
+    def _download_page(self, url, query):
+        headers = {'User-Agent': 'status.scijava.org'}
+        if self._token: headers['Authorization'] = "token " + self._token

         logging.debug(f'Downloading {url}')
-        response = requests.get(url, headers)
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()
         result = response.json()
+        self._items.extend(result['items'])

-        self._merge(result)
-
-        return response.links['next']['url'] if 'next' in response.links else None
-
-    def _merge(self, content):
-        for key, value in content.items():
-            if key in self._json and type(self._json[key]) == list:
-                # Append values to the list.
-                self._json[key].extend(value)
-            else:
-                # Overwrite value in the dict.
-                self._json[key] = value
+        next_url = response.links['next']['url'] if 'next' in response.links else None
+        if not next_url and result['total_count'] > 1000 and len(result['items']) > 0:
+            # We hit the 1000-issue limit. Continue the search just beyond the last issue we got.
+            next_url = GitHubIssues._search_url(f"{query}+created:>{result['items'][-1]['created_at']}")
+        return next_url

 if __name__ == '__main__':
     if len(sys.argv) < 2:
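The __main__ block is truncated above, so the following is only a rough usage sketch of the updated class; the query string, token handling, file name, and repository below are assumptions, not taken from this commit:

from github import GitHubIssues  # the module shown in this diff

gh = GitHubIssues(items=[], token=None)  # a personal access token raises the API rate limit
gh.download("org:scijava")               # hypothetical query; waits 5 seconds between requests
gh.save("issues.json")                   # hypothetical file name; caches the downloaded items

# Reload the cache later and slice it, e.g. to a single repository:
cached = GitHubIssues(items=[])
cached.load("issues.json")
repo_issues = cached.repo("scijava", "scijava-common").issues()  # hypothetical org/repo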
