Merge pull request #8 from bigcode-project/issues_pr_processing

bigximik · web-flow · commit b3cb45d191a5 · 2024-03-01T09:00:34.000+02:00
fixes after refactoring
diff --git a/pull_requests_and_issues/1_parse_issue_and_pr_events.ipynb b/pull_requests_and_issues/1_parse_issue_and_pr_events.ipynb
@@ -13,7 +13,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "3a361285-4384-45a3-9fec-7d3ac2e82118",
    "metadata": {},
    "outputs": [],
diff --git a/pull_requests_and_issues/2_process_commit_pairs.ipynb b/pull_requests_and_issues/2_process_commit_pairs.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "918d5f68-2b90-4f0d-a90e-32baedf27f39",
    "metadata": {},
    "outputs": [],
@@ -13,7 +13,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "id": "58449e85-14df-442d-adfb-380a878d310d",
    "metadata": {},
    "outputs": [],
@@ -42,48 +42,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "0327976e-b27b-47d4-9a04-68bcebe217df",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024-02-27 19:21:33,231\tINFO worker.py:1458 -- Connecting to existing Ray cluster at address: 10.210.154.158:8786...\n",
-      "2024-02-27 19:21:33,262\tINFO worker.py:1633 -- Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttp://10.210.154.158:8000 \u001b[39m\u001b[22m\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "https://453f3e01-e179-4204-a29b-3eb6bb94a853-8000.job.console.elementai.com\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import ray_server\n",
     "server = ray_server.get_ray_server()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "id": "ce27b201-7e22-48fa-b646-80eebeb6cac0",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'OK'"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "server.scale_cluster(60)"
    ]
@@ -98,7 +71,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "id": "ea60c091-265f-441b-9293-5cc66d7c7c6f",
    "metadata": {},
    "outputs": [],
@@ -117,7 +90,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "id": "430fa2fa-326a-481e-bdc9-a29482090908",
    "metadata": {},
    "outputs": [],
@@ -136,22 +109,7 @@
    "execution_count": null,
    "id": "5f9668d8-c01f-4c67-8131-d2b82a2a4d7e",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024-02-27 20:06:57,855\tWARNING worker.py:2058 -- The node with node id: eb6161373dd810eb1c6006acce4ba31c2b4cdb21c55e1d1fcdd4f6cb and address: 10.210.142.8 and node name: 10.210.142.8 has been marked dead because the detector has missed too many heartbeats from it. This can happen when a \t(1) raylet crashes unexpectedly (OOM, preempted node, etc.) \n",
-      "\t(2) raylet has lagging heartbeats due to slow network or busy workload.\n",
-      "2024-02-27 20:09:21,012\tWARNING worker.py:2058 -- The node with node id: 28835ff69e9f4134cdda660fa1096ee90823f4cd0de1a6e955788c6c and address: 10.210.174.138 and node name: 10.210.174.138 has been marked dead because the detector has missed too many heartbeats from it. This can happen when a \t(1) raylet crashes unexpectedly (OOM, preempted node, etc.) \n",
-      "\t(2) raylet has lagging heartbeats due to slow network or busy workload.\n",
-      "2024-02-27 20:16:55,405\tWARNING worker.py:2058 -- The node with node id: 78ddbe7cece04c64f1b371c5c91410545212c1f8192be1b401e4ed5d and address: 10.210.142.229 and node name: 10.210.142.229 has been marked dead because the detector has missed too many heartbeats from it. This can happen when a \t(1) raylet crashes unexpectedly (OOM, preempted node, etc.) \n",
-      "\t(2) raylet has lagging heartbeats due to slow network or busy workload.\n",
-      "2024-02-27 20:35:15,246\tWARNING worker.py:2058 -- The node with node id: b9c718c14c17e89b35bb1594f649731911f6b573ab1c0285aedb3569 and address: 10.210.51.136 and node name: 10.210.51.136 has been marked dead because the detector has missed too many heartbeats from it. This can happen when a \t(1) raylet crashes unexpectedly (OOM, preempted node, etc.) \n",
-      "\t(2) raylet has lagging heartbeats due to slow network or busy workload.\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "util.repo_names_licenses_convert_to_sqlite(\n",
     "    cfg.repo_licenses_path / Path(cfg.repo_licenses_s3['path']).name,\n",
@@ -316,7 +274,7 @@
    "outputs": [],
    "source": [
     "params = process_commit_pairs.FilterParams()\n",
-    "res = util.ray_map(\n",
+    "res = ray_util.ray_map(\n",
     "    process_commit_pairs.clean_files_bucket,\n",
     "    files,\n",
     "    dst=dst,\n",
@@ -344,7 +302,7 @@
    "source": [
     "res = regroup.ray_shuffle.remote(\n",
     "    cfg.pr_commid_pairs_files_filtered_cleaned_path,\n",
-    "    cfg.pr_commid_pairs_files_filtered_cleaned_grouped_path',\n",
+    "    cfg.pr_commid_pairs_files_filtered_cleaned_grouped_path,\n",
     "    'pull_request.guid',\n",
     "    3\n",
     ")"
@@ -354,7 +312,9 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "2ab3999f-237a-4c9d-a08a-609f546954c9",
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
     "res = ray.get(res)"
diff --git a/pull_requests_and_issues/3_filter_and_render_pr.ipynb b/pull_requests_and_issues/3_filter_and_render_pr.ipynb
@@ -21,6 +21,8 @@
     "import yaml\n",
     "import numpy as np\n",
     "import pandas as pd\n",
+    "import ray\n",
+    "from pathlib import Path\n",
     "\n",
     "from render import RenderParams, get_line_diff_range\n",
     "import ray_util\n",
@@ -127,7 +129,7 @@
    "outputs": [],
    "source": [
     "rp = RenderParams()\n",
-    "res = util.ray_map(\n",
+    "res = ray_util.ray_map(\n",
     "    process_pr_events.process_pr_bucket,\n",
     "    files,\n",
     "    dst=dst,\n",
@@ -175,11 +177,17 @@
     "def merge_pr_count_per_repo(data):\n",
     "    file = data[0]\n",
     "    df_pr_per_repo = data[1][['pull_request.guid', 'pr_count_per_repo']]\n",
-    "    df = pd.read_parquet(file)\n",
+    "    try:\n",
+    "        df = pd.read_parquet(file)\n",
+    "    except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):\n",
+    "        return data[0]\n",
     "    if 'pr_count_per_repo' in df.columns:\n",
     "        return 1\n",
     "    df = df.merge(df_pr_per_repo, on = 'pull_request.guid', how='left')\n",
-    "    util.df_to_parquet_safe(df, file)\n",
+    "    try:\n",
+    "        util.df_to_parquet_safe(df, file)\n",
+    "    except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):\n",
+    "        return data[0]\n",
     "    return 0"
    ]
   },
@@ -191,7 +199,7 @@
    "outputs": [],
    "source": [
     "files = list(cfg.prs_grouped_filtered_path.glob('*.parquet'))\n",
-    "res = util.ray_map(\n",
+    "res = ray_util.ray_map(\n",
     "    get_df_repo_pr_bucket,\n",
     "    files\n",
     ")\n",
@@ -218,13 +226,25 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "res = util.ray_map(\n",
+    "res = ray_util.ray_map(\n",
     "    merge_pr_count_per_repo,\n",
     "    src\n",
     ")\n",
     "res = ray.get(res)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b2b101e6-f7f1-4bed-8b74-0eae318ff65a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for el in res:\n",
+    "    if type(el) is Path:\n",
+    "        print('failed part: ', el)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "00b94bc5-92f8-4984-be34-68de63796920",
@@ -260,11 +280,11 @@
    "outputs": [],
    "source": [
     "res = []\n",
-    "for f in rest_files:\n",
+    "for f in pr_files:\n",
     "    res.append(render.get_renders_for_bucket.remote(\n",
     "        f, commits_path,\n",
     "        render_params,\n",
-    "        return_render=False,\n",
+    "        return_render=True,\n",
     "        return_lang_distr=False,\n",
     "        return_data=False,\n",
     "        base_seed=42,\n",
@@ -303,6 +323,14 @@
     "ray.shutdown()\n",
     "server.scale_cluster(0)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "556237c9-1988-48e1-8a09-2b4a42d22d0e",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
diff --git a/pull_requests_and_issues/cfg.py b/pull_requests_and_issues/cfg.py
@@ -30,6 +30,6 @@
     'ext': 'parquet'
 }
 pr_commit_pairs_files_path = root_path / 'pr_commit_pairs_files'
-pr_commid_pairs_files_filtered_path = root_path / 'pr_commid_pairs_files_filtered'
-pr_commid_pairs_files_filtered_cleaned_path = root_path / 'pr_commid_pairs_files_filtered_cleaned'
-pr_commid_pairs_files_filtered_cleaned_grouped_path = root_path / 'pr_commid_pairs_files_filtered_cleaned_grouped'
+pr_commid_pairs_files_filtered_path = root_path / 'pr_commit_pairs_files_filtered'
+pr_commid_pairs_files_filtered_cleaned_path = root_path / 'pr_commit_pairs_files_filtered_cleaned'
+pr_commid_pairs_files_filtered_cleaned_grouped_path = root_path / 'pr_commit_pairs_files_filtered_cleaned_grouped'
diff --git a/pull_requests_and_issues/process_commit_pairs.py b/pull_requests_and_issues/process_commit_pairs.py
@@ -5,6 +5,10 @@
 import ray
 from functools import partial
 import numpy as np
+import sqlite3
+from collections import defaultdict
+
+from render import get_line_diff_range
 
 def add_license_to_pr_remove_non_permissive(dfis, repo_licenses_sqlite_file):
     
diff --git a/pull_requests_and_issues/regroup.py b/pull_requests_and_issues/regroup.py
@@ -71,13 +71,12 @@ def combine_bucket(path):
     dst_filename = path.parent / f'{path.name.split("_")[1]}.parquet'
     dst_filename_tmp = Path(str(dst_filename) + '.__tmp__')
 
-    #src = polars.scan_parquet(path / '*.parquet')
-    #src.sink_parquet(dst_filename_tmp, compression='snappy')
-
-    #sometimes this crashes for some reason
-    df = dd.read_parquet(str(path / 'part_*.parquet')).compute()
-    #print(dst_filename_tmp)
-    #print(dst_filename)
+    files = list(path.glob('*.parquet'))
+    data = []
+    for f in files:
+        data.append(pd.read_parquet(f))
+    df = pd.concat(data)
+    
     df = df.reset_index(drop=True)
     # seems dask adds __index_level_0__ so remove column with the same name 
     # before saving
diff --git a/pull_requests_and_issues/render.py b/pull_requests_and_issues/render.py
@@ -747,8 +747,8 @@ def render_pr(row, df_commit_pairs, render_params, return_render, language_black
                     res += c_rendering
                     res += render_pr_review_event(event, base_info, render_params, len(res))
             elif (
-                event['type'] == 'issue' or
-                event['type'] == 'comment'
+                event['type'] == 'IssueEvent' or
+                event['type'] == 'IssueCommentEvent'
             ):
                 # does not have head and base info so no diff render
                 res += render_issue(event, render_params, len(res))
@@ -790,7 +790,10 @@ def get_renders_for_bucket(
         # if source is not path seed per bucket must be provided
         
     else:
-        df = pd.read_parquet(source)
+        try:
+            df = pd.read_parquet(source)
+        except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):
+            return source
 
     assert not seed is None
     seed += base_seed

Original file line number	Diff line number	Diff line change
`@@ -13,7 +13,7 @@`
`13`	`13`	`},`
`14`	`14`	`{`
`15`	`15`	`"cell_type": "code",`
`16`		`- "execution_count": 1,`
	`16`	`+ "execution_count": null,`
`17`	`17`	`"id": "3a361285-4384-45a3-9fec-7d3ac2e82118",`
`18`	`18`	`"metadata": {},`
`19`	`19`	`"outputs": [],`