Skip to content

Commit b3cb45d

Browse files
authored
Merge pull request #8 from bigcode-project/issues_pr_processing
fixes after refactoring
2 parents a846cbc + 8360d6c commit b3cb45d

File tree

7 files changed

+69
-75
lines changed

7 files changed

+69
-75
lines changed

pull_requests_and_issues/1_parse_issue_and_pr_events.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
},
1414
{
1515
"cell_type": "code",
16-
"execution_count": 1,
16+
"execution_count": null,
1717
"id": "3a361285-4384-45a3-9fec-7d3ac2e82118",
1818
"metadata": {},
1919
"outputs": [],

pull_requests_and_issues/2_process_commit_pairs.ipynb

Lines changed: 14 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 1,
5+
"execution_count": null,
66
"id": "918d5f68-2b90-4f0d-a90e-32baedf27f39",
77
"metadata": {},
88
"outputs": [],
@@ -13,7 +13,7 @@
1313
},
1414
{
1515
"cell_type": "code",
16-
"execution_count": 8,
16+
"execution_count": null,
1717
"id": "58449e85-14df-442d-adfb-380a878d310d",
1818
"metadata": {},
1919
"outputs": [],
@@ -42,48 +42,21 @@
4242
},
4343
{
4444
"cell_type": "code",
45-
"execution_count": 3,
45+
"execution_count": null,
4646
"id": "0327976e-b27b-47d4-9a04-68bcebe217df",
4747
"metadata": {},
48-
"outputs": [
49-
{
50-
"name": "stderr",
51-
"output_type": "stream",
52-
"text": [
53-
"2024-02-27 19:21:33,231\tINFO worker.py:1458 -- Connecting to existing Ray cluster at address: 10.210.154.158:8786...\n",
54-
"2024-02-27 19:21:33,262\tINFO worker.py:1633 -- Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttp://10.210.154.158:8000 \u001b[39m\u001b[22m\n"
55-
]
56-
},
57-
{
58-
"name": "stdout",
59-
"output_type": "stream",
60-
"text": [
61-
"https://453f3e01-e179-4204-a29b-3eb6bb94a853-8000.job.console.elementai.com\n"
62-
]
63-
}
64-
],
48+
"outputs": [],
6549
"source": [
6650
"import ray_server\n",
6751
"server = ray_server.get_ray_server()"
6852
]
6953
},
7054
{
7155
"cell_type": "code",
72-
"execution_count": 4,
56+
"execution_count": null,
7357
"id": "ce27b201-7e22-48fa-b646-80eebeb6cac0",
7458
"metadata": {},
75-
"outputs": [
76-
{
77-
"data": {
78-
"text/plain": [
79-
"'OK'"
80-
]
81-
},
82-
"execution_count": 4,
83-
"metadata": {},
84-
"output_type": "execute_result"
85-
}
86-
],
59+
"outputs": [],
8760
"source": [
8861
"server.scale_cluster(60)"
8962
]
@@ -98,7 +71,7 @@
9871
},
9972
{
10073
"cell_type": "code",
101-
"execution_count": 5,
74+
"execution_count": null,
10275
"id": "ea60c091-265f-441b-9293-5cc66d7c7c6f",
10376
"metadata": {},
10477
"outputs": [],
@@ -117,7 +90,7 @@
11790
},
11891
{
11992
"cell_type": "code",
120-
"execution_count": 6,
93+
"execution_count": null,
12194
"id": "430fa2fa-326a-481e-bdc9-a29482090908",
12295
"metadata": {},
12396
"outputs": [],
@@ -136,22 +109,7 @@
136109
"execution_count": null,
137110
"id": "5f9668d8-c01f-4c67-8131-d2b82a2a4d7e",
138111
"metadata": {},
139-
"outputs": [
140-
{
141-
"name": "stderr",
142-
"output_type": "stream",
143-
"text": [
144-
"2024-02-27 20:06:57,855\tWARNING worker.py:2058 -- The node with node id: eb6161373dd810eb1c6006acce4ba31c2b4cdb21c55e1d1fcdd4f6cb and address: 10.210.142.8 and node name: 10.210.142.8 has been marked dead because the detector has missed too many heartbeats from it. This can happen when a \t(1) raylet crashes unexpectedly (OOM, preempted node, etc.) \n",
145-
"\t(2) raylet has lagging heartbeats due to slow network or busy workload.\n",
146-
"2024-02-27 20:09:21,012\tWARNING worker.py:2058 -- The node with node id: 28835ff69e9f4134cdda660fa1096ee90823f4cd0de1a6e955788c6c and address: 10.210.174.138 and node name: 10.210.174.138 has been marked dead because the detector has missed too many heartbeats from it. This can happen when a \t(1) raylet crashes unexpectedly (OOM, preempted node, etc.) \n",
147-
"\t(2) raylet has lagging heartbeats due to slow network or busy workload.\n",
148-
"2024-02-27 20:16:55,405\tWARNING worker.py:2058 -- The node with node id: 78ddbe7cece04c64f1b371c5c91410545212c1f8192be1b401e4ed5d and address: 10.210.142.229 and node name: 10.210.142.229 has been marked dead because the detector has missed too many heartbeats from it. This can happen when a \t(1) raylet crashes unexpectedly (OOM, preempted node, etc.) \n",
149-
"\t(2) raylet has lagging heartbeats due to slow network or busy workload.\n",
150-
"2024-02-27 20:35:15,246\tWARNING worker.py:2058 -- The node with node id: b9c718c14c17e89b35bb1594f649731911f6b573ab1c0285aedb3569 and address: 10.210.51.136 and node name: 10.210.51.136 has been marked dead because the detector has missed too many heartbeats from it. This can happen when a \t(1) raylet crashes unexpectedly (OOM, preempted node, etc.) \n",
151-
"\t(2) raylet has lagging heartbeats due to slow network or busy workload.\n"
152-
]
153-
}
154-
],
112+
"outputs": [],
155113
"source": [
156114
"util.repo_names_licenses_convert_to_sqlite(\n",
157115
" cfg.repo_licenses_path / Path(cfg.repo_licenses_s3['path']).name,\n",
@@ -316,7 +274,7 @@
316274
"outputs": [],
317275
"source": [
318276
"params = process_commit_pairs.FilterParams()\n",
319-
"res = util.ray_map(\n",
277+
"res = ray_util.ray_map(\n",
320278
" process_commit_pairs.clean_files_bucket,\n",
321279
" files,\n",
322280
" dst=dst,\n",
@@ -344,7 +302,7 @@
344302
"source": [
345303
"res = regroup.ray_shuffle.remote(\n",
346304
" cfg.pr_commid_pairs_files_filtered_cleaned_path,\n",
347-
" cfg.pr_commid_pairs_files_filtered_cleaned_grouped_path',\n",
305+
" cfg.pr_commid_pairs_files_filtered_cleaned_grouped_path,\n",
348306
" 'pull_request.guid',\n",
349307
" 3\n",
350308
")"
@@ -354,7 +312,9 @@
354312
"cell_type": "code",
355313
"execution_count": null,
356314
"id": "2ab3999f-237a-4c9d-a08a-609f546954c9",
357-
"metadata": {},
315+
"metadata": {
316+
"scrolled": true
317+
},
358318
"outputs": [],
359319
"source": [
360320
"res = ray.get(res)"

pull_requests_and_issues/3_filter_and_render_pr.ipynb

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
"import yaml\n",
2222
"import numpy as np\n",
2323
"import pandas as pd\n",
24+
"import ray\n",
25+
"from pathlib import Path\n",
2426
"\n",
2527
"from render import RenderParams, get_line_diff_range\n",
2628
"import ray_util\n",
@@ -127,7 +129,7 @@
127129
"outputs": [],
128130
"source": [
129131
"rp = RenderParams()\n",
130-
"res = util.ray_map(\n",
132+
"res = ray_util.ray_map(\n",
131133
" process_pr_events.process_pr_bucket,\n",
132134
" files,\n",
133135
" dst=dst,\n",
@@ -175,11 +177,17 @@
175177
"def merge_pr_count_per_repo(data):\n",
176178
" file = data[0]\n",
177179
" df_pr_per_repo = data[1][['pull_request.guid', 'pr_count_per_repo']]\n",
178-
" df = pd.read_parquet(file)\n",
180+
" try:\n",
181+
" df = pd.read_parquet(file)\n",
182+
" except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):\n",
183+
" return data[0]\n",
179184
" if 'pr_count_per_repo' in df.columns:\n",
180185
" return 1\n",
181186
" df = df.merge(df_pr_per_repo, on = 'pull_request.guid', how='left')\n",
182-
" util.df_to_parquet_safe(df, file)\n",
187+
" try:\n",
188+
" util.df_to_parquet_safe(df, file)\n",
189+
" except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):\n",
190+
" return data[0]\n",
183191
" return 0"
184192
]
185193
},
@@ -191,7 +199,7 @@
191199
"outputs": [],
192200
"source": [
193201
"files = list(cfg.prs_grouped_filtered_path.glob('*.parquet'))\n",
194-
"res = util.ray_map(\n",
202+
"res = ray_util.ray_map(\n",
195203
" get_df_repo_pr_bucket,\n",
196204
" files\n",
197205
")\n",
@@ -218,13 +226,25 @@
218226
"metadata": {},
219227
"outputs": [],
220228
"source": [
221-
"res = util.ray_map(\n",
229+
"res = ray_util.ray_map(\n",
222230
" merge_pr_count_per_repo,\n",
223231
" src\n",
224232
")\n",
225233
"res = ray.get(res)"
226234
]
227235
},
236+
{
237+
"cell_type": "code",
238+
"execution_count": null,
239+
"id": "b2b101e6-f7f1-4bed-8b74-0eae318ff65a",
240+
"metadata": {},
241+
"outputs": [],
242+
"source": [
243+
"for el in res:\n",
244+
" if type(el) is Path:\n",
245+
" print('failed part: ', el)"
246+
]
247+
},
228248
{
229249
"cell_type": "markdown",
230250
"id": "00b94bc5-92f8-4984-be34-68de63796920",
@@ -260,11 +280,11 @@
260280
"outputs": [],
261281
"source": [
262282
"res = []\n",
263-
"for f in rest_files:\n",
283+
"for f in pr_files:\n",
264284
" res.append(render.get_renders_for_bucket.remote(\n",
265285
" f, commits_path,\n",
266286
" render_params,\n",
267-
" return_render=False,\n",
287+
" return_render=True,\n",
268288
" return_lang_distr=False,\n",
269289
" return_data=False,\n",
270290
" base_seed=42,\n",
@@ -303,6 +323,14 @@
303323
"ray.shutdown()\n",
304324
"server.scale_cluster(0)"
305325
]
326+
},
327+
{
328+
"cell_type": "code",
329+
"execution_count": null,
330+
"id": "556237c9-1988-48e1-8a09-2b4a42d22d0e",
331+
"metadata": {},
332+
"outputs": [],
333+
"source": []
306334
}
307335
],
308336
"metadata": {

pull_requests_and_issues/cfg.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,6 @@
3030
'ext': 'parquet'
3131
}
3232
pr_commit_pairs_files_path = root_path / 'pr_commit_pairs_files'
33-
pr_commid_pairs_files_filtered_path = root_path / 'pr_commid_pairs_files_filtered'
34-
pr_commid_pairs_files_filtered_cleaned_path = root_path / 'pr_commid_pairs_files_filtered_cleaned'
35-
pr_commid_pairs_files_filtered_cleaned_grouped_path = root_path / 'pr_commid_pairs_files_filtered_cleaned_grouped'
33+
pr_commid_pairs_files_filtered_path = root_path / 'pr_commit_pairs_files_filtered'
34+
pr_commid_pairs_files_filtered_cleaned_path = root_path / 'pr_commit_pairs_files_filtered_cleaned'
35+
pr_commid_pairs_files_filtered_cleaned_grouped_path = root_path / 'pr_commit_pairs_files_filtered_cleaned_grouped'

pull_requests_and_issues/process_commit_pairs.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
import ray
66
from functools import partial
77
import numpy as np
8+
import sqlite3
9+
from collections import defaultdict
10+
11+
from render import get_line_diff_range
812

913
def add_license_to_pr_remove_non_permissive(dfis, repo_licenses_sqlite_file):
1014

pull_requests_and_issues/regroup.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -71,13 +71,12 @@ def combine_bucket(path):
7171
dst_filename = path.parent / f'{path.name.split("_")[1]}.parquet'
7272
dst_filename_tmp = Path(str(dst_filename) + '.__tmp__')
7373

74-
#src = polars.scan_parquet(path / '*.parquet')
75-
#src.sink_parquet(dst_filename_tmp, compression='snappy')
76-
77-
#sometimes this crashes for some reason
78-
df = dd.read_parquet(str(path / 'part_*.parquet')).compute()
79-
#print(dst_filename_tmp)
80-
#print(dst_filename)
74+
files = list(path.glob('*.parquet'))
75+
data = []
76+
for f in files:
77+
data.append(pd.read_parquet(f))
78+
df = pd.concat(data)
79+
8180
df = df.reset_index(drop=True)
8281
# seems dask adds __index_level_0__ so remove column with the same name
8382
# before saving

pull_requests_and_issues/render.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -747,8 +747,8 @@ def render_pr(row, df_commit_pairs, render_params, return_render, language_black
747747
res += c_rendering
748748
res += render_pr_review_event(event, base_info, render_params, len(res))
749749
elif (
750-
event['type'] == 'issue' or
751-
event['type'] == 'comment'
750+
event['type'] == 'IssueEvent' or
751+
event['type'] == 'IssueCommentEvent'
752752
):
753753
# does not have head and base info so no diff render
754754
res += render_issue(event, render_params, len(res))
@@ -790,7 +790,10 @@ def get_renders_for_bucket(
790790
# if source is not path seed per bucket must be provided
791791

792792
else:
793-
df = pd.read_parquet(source)
793+
try:
794+
df = pd.read_parquet(source)
795+
except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):
796+
return source
794797

795798
assert not seed is None
796799
seed += base_seed

0 commit comments

Comments
 (0)