Skip to content

Commit 11899e4

Browse files
committed
added parsing files
1 parent f69be7c commit 11899e4

File tree

2 files changed

+475
-0
lines changed

2 files changed

+475
-0
lines changed
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
import gzip
2+
from pathlib import Path
3+
import pandas as pd
4+
import numpy as np
5+
import pickle
6+
from tqdm.auto import tqdm
7+
import polars as pl
8+
9+
import util
10+
11+
def process_event(line):
    """Pull the event-level fields out of one decoded event record.

    Args:
        line: Mapping for a single event. It must provide ``type``,
            ``actor.login``, ``public``, ``created_at`` and ``repo.name``
            (nested as ``line['actor']['login']`` etc.); ``org`` is optional.

    Returns:
        A flat dict keyed by dotted field names. ``org.login`` is ``None``
        when the record carries no ``org`` entry.

    Raises:
        KeyError: If one of the required fields is missing.
    """
    summary = {
        'type': line['type'],
        'actor.login': line['actor']['login'],
        'public': line['public'],
        'created_at': line['created_at'],
        'repo.name': line['repo']['name'],
    }

    # The organisation is only looked up when the record has an 'org' key.
    if 'org' in line:
        summary['org.login'] = line['org']['login']
    else:
        summary['org.login'] = None

    return summary
21+
22+
def process_issue(line):
    """Extract the action and issue fields from an event's payload.

    Args:
        line: Mapping for a single event. Its ``payload`` is expected to
            hold ``action`` and an ``issue`` mapping.

    Returns:
        ``None`` when the record has no ``payload`` key or the payload is
        ``None``/empty; otherwise a dict with ``action``, ``title``,
        ``text`` (the issue body), ``issue_id``, ``issue_number``,
        ``issue_created_at`` and ``issue_user_login``.

    Raises:
        KeyError: If a non-empty payload lacks ``action`` or any of the
            issue fields read below.
    """
    if 'payload' not in line or not line['payload']:
        return None

    payload = line['payload']
    issue = payload['issue']
    return {
        'action': payload['action'],
        'title': issue['title'],
        'text': issue['body'],
        'issue_id': issue['id'],
        'issue_number': issue['number'],
        # Kept so that an "opened" event can be re-created later if needed.
        'issue_created_at': issue['created_at'],
        'issue_user_login': issue['user']['login'],
    }
36+
37+
def process_line(line):
    """Flatten one raw event into a single record with a grouping key.

    Combines the output of ``process_event`` and ``process_issue``, then
    adds comment and pull-request information taken from the payload.

    Args:
        line: Mapping for a single event.

    Returns:
        ``None`` when ``process_issue`` yields nothing for this record;
        otherwise the merged dict. For comment events ``text`` is replaced
        by the comment body and ``comment_id`` is added. ``group_id`` is the
        pull-request guid when the issue has a pull-request URL, else the
        stringified issue id.
    """
    record = process_event(line)

    issue_fields = process_issue(line)
    if issue_fields is None:
        return None
    record.update(issue_fields)

    payload = line['payload']
    if 'comment' in payload:
        comment = payload['comment']
        record['comment_id'] = comment['id']
        # Overwrites the issue body stored under 'text' by process_issue.
        record['text'] = comment['body']

    # Events attached to a pull request are grouped by the pull request,
    # everything else by the issue id.
    issue = payload['issue']
    if 'pull_request' in issue and 'url' in issue['pull_request']:
        # Assumes the URL ends in <owner>/<repo>/<something>/<number>;
        # only segments -4, -3 and -1 are used.
        url_parts = issue['pull_request']['url'].split('/')
        owner, repo, number = url_parts[-4], url_parts[-3], url_parts[-1]
        guid = f"{owner}/{repo}/pull/{number}"
        record['pull_request.guid'] = guid
        record['group_id'] = guid
    else:
        record['pull_request.guid'] = None
        record['group_id'] = str(record['issue_id'])

    return record
63+
64+
65+
def parse_comment(r):
    """Convert one comment row into a history event dict.

    Args:
        r: Row-like mapping with ``action``, ``created_at``,
            ``actor.login``, ``comment_id`` and (unless deleted) ``text``.

    Returns:
        A dict with ``type`` set to ``'comment'`` plus ``action``,
        ``datetime`` (stringified ``created_at``), ``author`` and an integer
        ``comment_id``. The ``comment`` body is included for every action
        except ``'deleted'``.
    """
    # Expected actions: created, edited or deleted.
    action = r['action']
    event = {
        'type': 'comment',
        'action': action,
        'datetime': str(r['created_at']),
        'author': r['actor.login'],
        'comment_id': int(r['comment_id']),
    }
    if action == 'deleted':
        return event

    event['comment'] = r['text']
    return event
77+
78+
def parse_issue(r, force_open=False):
    """Convert one row into an issue history event dict.

    Args:
        r: Row-like mapping. With ``force_open`` it must provide
            ``issue_created_at``, ``issue_user_login``, ``title`` and
            ``text``; otherwise ``action``, ``created_at``, ``actor.login``
            and, for title-bearing actions, ``title`` and ``text``.
        force_open: When true, ignore the row's own action and synthesize an
            ``'opened'`` event from the issue's creation metadata.

    Returns:
        A dict with ``type`` set to ``'issue'`` plus ``action``,
        ``datetime`` and ``author``. ``title`` and ``description`` are
        present for synthesized events and for the ``opened``, ``edited``
        and ``reopened`` actions; other actions (e.g. ``closed``) carry no
        extra fields.
    """
    if force_open:
        # Per the original author: issues created before 2015, or affected
        # by a gharchive outage, have no "opened" event, so one is emulated.
        return {
            'type': 'issue',
            'action': 'opened',
            'datetime': str(r['issue_created_at']),
            'author': r['issue_user_login'],
            'title': r['title'],
            'description': r['text'],
        }

    action = r['action']
    event = {
        'type': 'issue',
        'action': action,
        'datetime': str(r['created_at']),
        'author': r['actor.login'],
    }
    # Each of these actions can carry a changed title or description.
    if action in ('opened', 'edited', 'reopened'):
        event['title'] = r['title']
        event['description'] = r['text']
    return event
110+
111+
def parse_issue_history(gr):
    """Build the full event history for one group of rows.

    Args:
        gr: An indexable pair whose second element is a pandas DataFrame
            holding the rows of one group (e.g. an item yielded by a
            ``groupby``). The rows are processed in their existing order.

    Returns:
        A dict with ``repo``, ``org``, ``issue_id``, ``issue_number`` and
        ``pull_request.guid`` taken from the first row, plus ``events``: the
        list of event dicts produced by ``parse_issue``/``parse_comment``.

    Raises:
        RuntimeError: If a row's ``type`` is neither ``'IssueCommentEvent'``
            nor ``'IssuesEvent'``.
    """
    rows = gr[1]
    first = rows.iloc[0]

    # From the original notes: every PR is an issue but not every issue is a
    # PR; PR review comments would need the PR API (not handled here). A
    # non-null guid is only recorded as a marker on the issue.
    guid = first['pull_request.guid']
    history = {
        'repo': first['repo.name'],
        'org': first['org.login'],
        'issue_id': int(first['issue_id']),
        'issue_number': int(first['issue_number']),
        'pull_request.guid': guid if not pd.isna(guid) else None,
        'events': [],
    }
    events = history['events']

    for position, (_, row) in enumerate(rows.iterrows()):
        row_type = row['type']

        # When the history does not start with the issue being opened,
        # prepend a synthesized "opened" event built from the first row.
        if position == 0 and (row_type != 'IssuesEvent' or row['action'] != 'opened'):
            events.append(parse_issue(row, force_open=True))

        # TODO: edited events will later need to be aggregated with their
        # open or reopen events; the original author saw no edited events in
        # the data set and suggested checking the original parser.
        if row_type == 'IssueCommentEvent':
            events.append(parse_comment(row))
        elif row_type == 'IssuesEvent':
            events.append(parse_issue(row))
        else:
            raise RuntimeError(f"unexpeceted event type: {row['type']}")

    return history
146+
147+
def _format_pull_request_guid(guid):
    """Render a ``<owner>/<repo>/pull/<number>`` guid as a header line."""
    parts = guid.split('/')
    if len(parts) == 4 and parts[2] == 'pull':
        owner, repo, _, number = parts
        return f"PULL REQUEST [USER: {owner} REPO: {repo} NUMBER: {number}]\n"
    # Unknown shape: show the raw value rather than guessing at its parts.
    return f"PULL REQUEST [{guid}]\n"


def print_issue_history(data, truncate=False):
    """Render a parsed issue history as human-readable text.

    Despite its name, this function returns the text; it does not print.

    Args:
        data: Dict with ``repo``, ``org``, ``issue_number`` and ``events``.
            Pull-request information is read from either a ``pull_request``
            dict (``user_login``, ``repo``, ``number``) or a
            ``pull_request.guid`` string of the form
            ``<owner>/<repo>/pull/<number>``. Both keys are optional and may
            be ``None``.
        truncate: When true, comment bodies and issue descriptions are cut
            to their first 100 characters and followed by ``...``.

    Returns:
        The rendered history as a single string.
    """
    out = [f"REPO: {data['repo']}\n"]
    if data['org'] is not None:
        out.append(f"ORG: {data['org']}\n")
    # The label's spelling is kept verbatim so existing output is unchanged.
    out.append(f"ISSUE NUNBER: {int(data['issue_number'])}\n")

    # Histories built in this module store the pull request under the
    # string key 'pull_request.guid'; the dict form is still accepted so
    # older callers keep working.
    pull_request = data.get('pull_request')
    if pull_request is not None:
        out.append(
            f"PULL REQUEST [USER: {pull_request['user_login']} "
            f"REPO: {pull_request['repo']} "
            f"NUMBER: {pull_request['number']}]\n"
        )
    else:
        guid = data.get('pull_request.guid')
        if guid is not None:
            out.append(_format_pull_request_guid(guid))
    out.append('\n\n')

    for event in data['events']:
        kind = event['type']
        action = event['action']

        if kind == 'comment':
            out.append(
                f"COMMENT {int(event['comment_id'])} {action.upper()} "
                f"[{event['datetime']}] BY {event['author']}\n"
            )
            # Deleted comments carry no body.
            if action != 'deleted':
                if truncate:
                    out.append(event['comment'][:100] + '...\n')
                else:
                    out.append(event['comment'] + '\n')
            out.append('\n')
        elif kind == 'issue':
            out.append(
                f"ISSUE {action.upper()} [{event['datetime']}] BY {event['author']}\n"
            )
            # Only these actions carry a title and description.
            if action in ('edited', 'opened', 'reopened'):
                out.append(f"TITLE: {event['title']}\n")
                if truncate:
                    out.append(f"DESCRIPTION: {event['description'][:100]}...\n")
                else:
                    out.append(f"DESCRIPTION: {event['description']}\n")
            out.append('\n')
        # Events of any other type are skipped.

    return ''.join(out)

0 commit comments

Comments
 (0)