Skip to content

Commit 797b5c2

Browse files
authored
Merge pull request #5 from bigcode-project/issues_pr_processing
creation of Issues and PRs datasets and PRs rendering
2 parents 5b96384 + 7e47d5b commit 797b5c2

18 files changed

+4762
-0
lines changed
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import time
2+
from datetime import timedelta
3+
from urllib import request
4+
from tqdm.auto import tqdm
5+
6+
from cfg import gharchives_path, edate, sdate
7+
8+
def run():
    """Download the hourly GH Archive dumps for every day from sdate to edate.

    For each day in the inclusive range and each hour 0..23, the file
    ``<day>-<hour>.json.gz`` is fetched from https://data.gharchive.org/ and
    stored under ``gharchives_path``. Files that already exist are skipped, so
    the function can be re-run to resume an interrupted crawl.

    Failures are best-effort: any exception for a single file is printed and
    the loop moves on to the next file. Nothing is returned.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0"
    }
    prefix_url = "https://data.gharchive.org/"
    # Upper bound (seconds) on a blocking socket operation so that one stalled
    # connection cannot hang the whole crawl.
    timeout_s = 60

    delta = edate - sdate  # as timedelta

    gharchives_path.mkdir(parents=True, exist_ok=True)

    for i in tqdm(range(delta.days + 1)):
        day = sdate + timedelta(days=i)
        for j in range(24):
            # NOTE(review): relies on str(day) rendering as YYYY-MM-DD, i.e.
            # sdate being a datetime.date rather than a datetime - confirm in cfg.
            file_name = f"{day}-{j}.json.gz"
            save_path = gharchives_path / file_name
            if save_path.is_file():
                continue

            # Built outside the try so the error report below can always use it.
            url = prefix_url + file_name
            # Download into a sibling temp file and rename on success; a
            # truncated file must never be mistaken for a finished download
            # by the is_file() check above on the next run.
            tmp_path = save_path.with_name(file_name + ".part")

            # The response is closed explicitly via the context manager so the
            # socket is released right away. This is presumably related to the
            # intermittent failure seen previously:
            # <urlopen error [Errno 99] Cannot assign requested address>
            try:
                req = request.Request(url=url, headers=headers)
                with request.urlopen(req, timeout=timeout_s) as response:
                    output = response.read()

                with open(tmp_path, "wb") as f:
                    f.write(output)
                tmp_path.replace(save_path)
            except Exception as e:
                # Drop any partial temp file before reporting the failure.
                tmp_path.unlink(missing_ok=True)
                print("Can not download the following url..")
                print(url)
                print("-----------------------------------------")
                print(e)
                print("-----------------------------------------\n")

            # Throttle: pause one second after every attempted download.
            time.sleep(1)
43+
44+
if __name__ == "__main__":
45+
run()
46+
Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "562a9756-a22e-4de2-8d30-af2ab09fbea3",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"%load_ext autoreload\n",
11+
"%autoreload 2"
12+
]
13+
},
14+
{
15+
"cell_type": "code",
16+
"execution_count": 1,
17+
"id": "3a361285-4384-45a3-9fec-7d3ac2e82118",
18+
"metadata": {},
19+
"outputs": [],
20+
"source": [
21+
"import shutil\n",
22+
"import json\n",
23+
"import ray\n",
"from tqdm.auto import tqdm\n",
24+
"from pathlib import Path\n",
25+
"\n",
26+
"import util\n",
27+
"import ray_util\n",
28+
"import cfg\n",
29+
"import regroup\n",
30+
"import pr_issue_parser"
31+
]
32+
},
33+
{
34+
"cell_type": "markdown",
35+
"id": "6e9774fe-e292-4140-a586-527b456cdd43",
36+
"metadata": {},
37+
"source": [
38+
"## Ray cluster management"
39+
]
40+
},
41+
{
42+
"cell_type": "code",
43+
"execution_count": null,
44+
"id": "76c6ea8a-365c-45d6-9db2-38d64efb65a3",
45+
"metadata": {},
46+
"outputs": [],
47+
"source": [
48+
"import ray_server\n",
49+
"server = ray_server.get_ray_server()"
50+
]
51+
},
52+
{
53+
"cell_type": "code",
54+
"execution_count": null,
55+
"id": "bfb9685e-9f83-4c1e-8dc9-2d8e9f5f21c3",
56+
"metadata": {},
57+
"outputs": [],
58+
"source": [
59+
"server.scale_cluster(60)"
60+
]
61+
},
62+
{
63+
"cell_type": "markdown",
64+
"id": "10fa7369-995e-4bda-b927-f5354c1c0d3f",
65+
"metadata": {},
66+
"source": [
67+
"## Extract pr and issue events information"
68+
]
69+
},
70+
{
71+
"cell_type": "code",
72+
"execution_count": null,
73+
"id": "afab6f21-a43e-455e-a3cc-4262ea4ed91c",
74+
"metadata": {},
75+
"outputs": [],
76+
"source": [
77+
"files = util.glob_sorted(cfg.gharchives_path, '*.json.gz')\n",
78+
"files = regroup.split_items(files, 10, True)\n",
79+
"dst = cfg.parsed_issues_prs_path\n",
80+
"dst.mkdir(parents=True, exist_ok=True)\n",
81+
"\n",
82+
"parse_files = ray.remote(pr_issue_parser.parse_files)"
83+
]
84+
},
85+
{
86+
"cell_type": "code",
87+
"execution_count": null,
88+
"id": "e99b67ad-7f66-46ed-b675-6f1c81330b34",
89+
"metadata": {},
90+
"outputs": [],
91+
"source": [
92+
"parse_files._function(files[0], 0, dst)"
93+
]
94+
},
95+
{
96+
"cell_type": "code",
97+
"execution_count": null,
98+
"id": "354934d9-946f-4d32-aa6d-9a5b9d6ddcac",
99+
"metadata": {},
100+
"outputs": [],
101+
"source": [
102+
"res = []\n",
103+
"for i, gr, in enumerate(files):\n",
104+
" res.append(parse_files.remote(gr, i, dst))\n",
105+
"res = ray.get(res)"
106+
]
107+
},
108+
{
109+
"cell_type": "markdown",
110+
"id": "5cabd3f9-4bb0-438d-ada2-76c165299291",
111+
"metadata": {},
112+
"source": [
113+
"## Group events by issue or pr id"
114+
]
115+
},
116+
{
117+
"cell_type": "code",
118+
"execution_count": null,
119+
"id": "f97a5e7e-ea23-48c0-8c0d-8aa5750f63ef",
120+
"metadata": {},
121+
"outputs": [],
122+
"source": [
123+
"res = regroup.ray_shuffle.remote(\n",
124+
" cfg.parsed_issues_prs_path, cfg.issues_prs_grouped_path,\n",
125+
" 'group_id', 3\n",
126+
")\n",
127+
"res = ray.get(res)"
128+
]
129+
},
130+
{
131+
"cell_type": "code",
132+
"execution_count": null,
133+
"id": "8e30a84f-88a9-48ec-94b7-8ae79db84519",
134+
"metadata": {},
135+
"outputs": [],
136+
"source": [
137+
"## Extract information for Issues and PRs from grouped events"
138+
]
139+
},
140+
{
141+
"cell_type": "code",
142+
"execution_count": null,
143+
"id": "5e13eaec-69ca-4f8e-862f-bebbe279ed47",
144+
"metadata": {},
145+
"outputs": [],
146+
"source": [
147+
"files = util.glob_sorted(cfg.issues_prs_grouped_path, '*.parquet')\n",
148+
"dst_prs_grouped = cfg.prs_grouped_path\n",
149+
"dst_prs_grouped.mkdir(parents=True, exist_ok=True)"
150+
]
151+
},
152+
{
153+
"cell_type": "code",
154+
"execution_count": null,
155+
"id": "5942f7cd-b9fe-4f73-89d2-5b61c7435983",
156+
"metadata": {},
157+
"outputs": [],
158+
"source": [
159+
"parse_grouped_files = ray.remote(pr_issue_parser.parse_grouped_files)"
160+
]
161+
},
162+
{
163+
"cell_type": "code",
164+
"execution_count": null,
165+
"id": "106d7d7c-138d-4f1d-8602-949fb9d47126",
166+
"metadata": {},
167+
"outputs": [],
168+
"source": [
169+
"res = ray_util.ray_map(parse_grouped_files, files, prs_dest=dst_prs_grouped)"
170+
]
171+
},
172+
{
173+
"cell_type": "code",
174+
"execution_count": null,
175+
"id": "c51c72b7-6313-47ab-9776-1233745fd2fe",
176+
"metadata": {},
177+
"outputs": [],
178+
"source": [
179+
"res = ray.get(res)"
180+
]
181+
},
182+
{
183+
"cell_type": "markdown",
184+
"id": "ecad868b-d672-4267-b713-d020b6311130",
185+
"metadata": {},
186+
"source": [
187+
"## Ray cluster management"
188+
]
189+
},
190+
{
191+
"cell_type": "code",
192+
"execution_count": null,
193+
"id": "93a5a5c7-eafe-4dfa-8a4e-cefd5605c608",
194+
"metadata": {},
195+
"outputs": [],
196+
"source": [
197+
"ray.shutdown()\n",
198+
"server.scale_cluster(60)"
199+
]
200+
}
201+
],
202+
"metadata": {
203+
"kernelspec": {
204+
"display_name": "Python 3 (ipykernel)",
205+
"language": "python",
206+
"name": "python3"
207+
},
208+
"language_info": {
209+
"codemirror_mode": {
210+
"name": "ipython",
211+
"version": 3
212+
},
213+
"file_extension": ".py",
214+
"mimetype": "text/x-python",
215+
"name": "python",
216+
"nbconvert_exporter": "python",
217+
"pygments_lexer": "ipython3",
218+
"version": "3.11.6"
219+
}
220+
},
221+
"nbformat": 4,
222+
"nbformat_minor": 5
223+
}

0 commit comments

Comments
 (0)