-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdataloading.py
More file actions
58 lines (45 loc) · 2.38 KB
/
dataloading.py
File metadata and controls
58 lines (45 loc) · 2.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import logging
import traceback
from datetime import datetime, timedelta
import elasticsearch
from elasticsearch import Elasticsearch
from perceval.backends.core.github import GitHub, CATEGORY_PULL_REQUEST
from config import GITHUB_API_TOKEN, ELASTICSEARCH_HOST
def create_index(elastic_search: Elasticsearch, pull_request_index: str) -> bool:
    """Create the Elasticsearch index that will receive the pull requests.

    After creating the index, its ``index.mapping.total_fields.limit``
    setting is raised to 5000 (presumably because the pull request
    documents carry many fields -- confirm against the indexed data).

    Args:
        elastic_search: Client used to talk to the cluster.
        pull_request_index: Name of the index to create.

    Returns:
        True when the index was created and configured. False when the
        index already exists or when any request to the cluster failed;
        failures are logged with their traceback.
    """
    try:
        # Check explicitly so that "already exists" is only reported when it
        # is true, rather than for every error (connection refused, auth...).
        if elastic_search.indices.exists(index=pull_request_index):
            print("Index already exists, remove before relaunching the script")
            return False
        elastic_search.indices.create(index=pull_request_index)
        elastic_search.indices.put_settings(index=pull_request_index, body={
            "index.mapping.total_fields.limit": 5000
        })
    except Exception:
        # Top-level boundary for a script: report, log the traceback and let
        # the caller decide what to do with the False result.
        print("Could not create index %s" % pull_request_index)
        logging.exception("Creation of index %s failed", pull_request_index)
        return False
    print("Index %s created" % pull_request_index)
    return True
def index_pull_request(elastic_search: Elasticsearch, pull_request_data: dict, pull_request_index: str) -> bool:
    """Store one pull request document in Elasticsearch.

    The document id is the pull request's ``number`` converted to a string,
    so indexing the same pull request again targets the same document id.

    Args:
        elastic_search: Client used to talk to the cluster.
        pull_request_data: Pull request payload; must contain a ``number`` key.
        pull_request_index: Name of the index to write to.

    Returns:
        True when the document was indexed, False when the request failed
        (the failure is printed and logged with its traceback).

    Raises:
        KeyError: If ``pull_request_data`` has no ``number`` key.
    """
    pull_request_id: str = str(pull_request_data['number'])
    try:
        elastic_search.index(index=pull_request_index, id=pull_request_id, document=pull_request_data)
    except Exception:
        # Best effort: one bad document must not abort the whole load, but
        # the caller is told about it through the return value.
        print("Could not store pull request %s " % pull_request_id)
        logging.exception("Indexing pull request %s failed", pull_request_id)
        return False
    return True
def get_and_store(owner: str, repository: str, factor: int = 0, new_index: bool = True):
    """Fetch one 365-day window of pull requests from GitHub and index them.

    The window is measured backwards from a fixed reference date
    (2021-11-01): ``factor=0`` covers the 365 days ending at the reference
    date, ``factor=1`` the 365 days before that, and so on. Documents go to
    the index named ``<owner>-<repository>`` in lower case.

    Args:
        owner: Owner of the GitHub repository.
        repository: Name of the GitHub repository.
        factor: Number of 365-day windows to step back from the reference date.
        new_index: When true, create the index first and stop without
            loading anything if it could not be created.
    """
    pull_request_index: str = owner.lower() + "-" + repository.lower()
    days_in_year: int = 365
    reference: datetime = datetime(year=2021, month=11, day=1)
    from_date: datetime = reference - timedelta(days=(factor + 1) * days_in_year)
    to_date: datetime = reference - timedelta(days=factor * days_in_year)

    pull_requests: GitHub = GitHub(owner=owner, repository=repository,
                                   api_token=GITHUB_API_TOKEN, sleep_for_rate=True)
    elastic_search: Elasticsearch = elasticsearch.Elasticsearch(ELASTICSEARCH_HOST)
    try:
        if new_index and not create_index(elastic_search, pull_request_index):
            return

        stored: int = 0
        failed: int = 0
        print("Loading pull request data from %s/%s. From %s to %s" % (owner, repository, str(from_date), str(to_date)))
        for pull_request_data in pull_requests.fetch(category=CATEGORY_PULL_REQUEST, from_date=from_date,
                                                     to_date=to_date):
            # Flush so the progress dots show up while the load is running
            # instead of all at once when the buffer is finally written.
            print('.', end='', flush=True)
            outcome = index_pull_request(elastic_search, pull_request_data['data'], pull_request_index)
            # index_pull_request handles its own errors. Only an explicit
            # False is counted as a failure; a None result (no status
            # reported) is still counted as stored.
            if outcome is False:
                failed += 1
            else:
                stored += 1
        print("\n%d pull requests stored" % stored)
        if failed:
            print("%d pull requests could not be stored" % failed)
    finally:
        # Release the client's connections on every exit path, including the
        # early return above and errors raised while fetching.
        elastic_search.close()