Merge leetcode contest dataset with leet10k (#2559)

ehartford · web-flow · commit fefdcbf7596c · 2023-05-01T09:57:09.000+01:00
diff --git a/data/datasets/oa_leet10k/oa_leet10k.ipynb b/data/datasets/oa_leet10k/oa_leet10k.ipynb
@@ -42,8 +42,11 @@
     "import random\n",
     "from IPython.display import display\n",
     "from datasets import Dataset\n",
+    "import requests\n",
     "\n",
     "data_source = \"https://www.kaggle.com/datasets/erichartford/leetcode-solutions\"\n",
+    "lc_contests_data_source = \"https://github.com/Nan-Do/LeetCodeContestsDataset/raw/main/submissions.json\"\n",
+    "\n",
     "output_dir = \"data\"\n",
     "os.makedirs(output_dir, exist_ok=True)"
    ]
@@ -54,7 +57,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "kaggle.api.dataset_download_files(\"erichartford/leetcode-solutions\", \"data\", unzip=True)"
+    "kaggle.api.dataset_download_files(\"erichartford/leetcode-solutions\", \"data\", unzip=True)\n",
+    "# Time out instead of hanging, and fail on an HTTP error rather than saving an error page as JSON\n",
+    "r = requests.get(lc_contests_data_source, allow_redirects=True, timeout=60)\n",
+    "r.raise_for_status()\n",
+    "with open(\"data/lc_contests.json\", \"wb\") as f:\n",
+    "    f.write(r.content)"
    ]
   },
   {
@@ -64,6 +72,7 @@
    "outputs": [],
    "source": [
     "leetcode_solutions = pd.read_json(\"data/leetcode-solutions.jsonl\", lines=True)\n",
+    "leetcode_contests = pd.read_json(\"data/lc_contests.json\")\n",
     "\n",
     "# Create dataframe with columns INSTRUCTION, RESPONSE, SOURCE\n",
     "# The INSTRUCTION a random choice from ONE_STEP_TEMPLATES with the language and content filled in\n",
@@ -83,7 +92,21 @@
     "                    \"SOURCE\": data_source,\n",
     "                }\n",
     "            )\n",
+    "\n",
+    "# One INSTRUCTION/RESPONSE/SOURCE record per contest row; instruction and input are joined by a newline\n",
+    "oa_leetcode_contests = [\n",
+    "    {\n",
+    "        \"INSTRUCTION\": submission.instruction + \"\\n\" + submission.input,\n",
+    "        \"RESPONSE\": submission.output,\n",
+    "        \"SOURCE\": \"https://github.com/Nan-Do/LeetCodeContestsDataset\",\n",
+    "    }\n",
+    "    for submission in leetcode_contests.itertuples(index=False)\n",
+    "]\n",
+    "\n",
     "oa_leet10k = pd.DataFrame(oa_leet10k)\n",
+    "oa_leetcode_contests = pd.DataFrame(oa_leetcode_contests)\n",
+    "\n",
+    "print(f\"oa_leet10k: {oa_leet10k.shape[0]}, oa_leetcode_contests: {oa_leetcode_contests.shape[0]}\")\n",
     "\n",
     "# Print the first 5 rows of the dataframe with full width and newline characters correctly displayed in the RESPONSE column\n",
     "with pd.option_context(\"display.max_colwidth\", 80):\n",
@@ -94,7 +117,13 @@
     "                \"text-align\": \"left\",\n",
     "                \"white-space\": \"pre-wrap\",\n",
     "            }\n",
-    "        )\n",
+    "        ),\n",
+    "        oa_leetcode_contests.head(5).style.set_properties(\n",
+    "            **{\n",
+    "                \"text-align\": \"left\",\n",
+    "                \"white-space\": \"pre-wrap\",\n",
+    "            }\n",
+    "        ),\n",
     "    )"
    ]
   },
@@ -106,9 +135,12 @@
    "source": [
     "# Upload dataset to HF\n",
     "oa_leet10k.to_parquet(\"oa_leet10k.parquet\", row_group_size=100, engine=\"pyarrow\")\n",
-    "ds = Dataset.from_parquet(\"oa_leet10k.parquet\")\n",
+    "ds_leet10k = Dataset.from_parquet(\"oa_leet10k.parquet\")\n",
+    "oa_leetcode_contests.to_parquet(\"oa_leetcode_contests.parquet\", row_group_size=100, engine=\"pyarrow\")\n",
+    "ds_leetcode_contests = Dataset.from_parquet(\"oa_leetcode_contests.parquet\")\n",
     "# Uncomment to push dataset to HF\n",
-    "# ds.push_to_hub(\"ehartford/oa_leet10k\")"
+    "# ds_leet10k.push_to_hub(\"ehartford/oa_leet10k\")\n",
+    "# ds_leetcode_contests.push_to_hub(\"ehartford/oa_leet10k\", split=\"contests\")  # own split: a second push to the default split would replace leet10k"
    ]
   }
  ],