Skip to content

Commit fefdcbf

Browse files
authored
Merge leetcode contest dataset with leet10k (#2559)
1 parent 2d2fd6b commit fefdcbf

File tree

1 file changed

+36
-4
lines changed

1 file changed

+36
-4
lines changed

data/datasets/oa_leet10k/oa_leet10k.ipynb

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,11 @@
4242
"import random\n",
4343
"from IPython.display import display\n",
4444
"from datasets import Dataset\n",
45+
"import requests\n",
4546
"\n",
4647
"data_source = \"https://www.kaggle.com/datasets/erichartford/leetcode-solutions\"\n",
48+
"lc_contests_data_source = \"https://github.com/Nan-Do/LeetCodeContestsDataset/raw/main/submissions.json\"\n",
49+
"\n",
4750
"output_dir = \"data\"\n",
4851
"os.makedirs(output_dir, exist_ok=True)"
4952
]
@@ -54,7 +57,12 @@
5457
"metadata": {},
5558
"outputs": [],
5659
"source": [
57-
"kaggle.api.dataset_download_files(\"erichartford/leetcode-solutions\", \"data\", unzip=True)"
60+
"kaggle.api.dataset_download_files(\"erichartford/leetcode-solutions\", \"data\", unzip=True)\n",
61+
"r = requests.get(lc_contests_data_source, allow_redirects=True)\n",
62+
"with open(\"data/lc_contests.json\", \"wb\") as f:\n",
63+
" for chunk in r.iter_content(chunk_size=1024):\n",
64+
" if chunk:\n",
65+
" f.write(chunk)"
5866
]
5967
},
6068
{
@@ -64,6 +72,7 @@
6472
"outputs": [],
6573
"source": [
6674
"leetcode_solutions = pd.read_json(\"data/leetcode-solutions.jsonl\", lines=True)\n",
75+
"leetcode_contests = pd.read_json(\"data/lc_contests.json\")\n",
6776
"\n",
6877
"# Create dataframe with columns INSTRUCTION, RESPONSE, SOURCE\n",
6978
"# The INSTRUCTION a random choice from ONE_STEP_TEMPLATES with the language and content filled in\n",
@@ -83,7 +92,21 @@
8392
" \"SOURCE\": data_source,\n",
8493
" }\n",
8594
" )\n",
95+
"\n",
96+
"oa_leetcode_contests = []\n",
97+
"for index, row in leetcode_contests.iterrows():\n",
98+
" oa_leetcode_contests.append(\n",
99+
" {\n",
100+
" \"INSTRUCTION\": row[\"instruction\"] + \"\\n\" + row[\"input\"],\n",
101+
" \"RESPONSE\": row[\"output\"],\n",
102+
" \"SOURCE\": \"https://github.com/Nan-Do/LeetCodeContestsDataset\",\n",
103+
" }\n",
104+
" )\n",
105+
"\n",
86106
"oa_leet10k = pd.DataFrame(oa_leet10k)\n",
107+
"oa_leetcode_contests = pd.DataFrame(oa_leetcode_contests)\n",
108+
"\n",
109+
"print(f\"oa_leet10k: {oa_leet10k.shape[0]}, oa_leetcode_contests: {oa_leetcode_contests.shape[0]}\")\n",
87110
"\n",
88111
"# Print the first 5 rows of the dataframe with full width and newline characters correctly displayed in the RESPONSE column\n",
89112
"with pd.option_context(\"display.max_colwidth\", 80):\n",
@@ -94,7 +117,13 @@
94117
" \"text-align\": \"left\",\n",
95118
" \"white-space\": \"pre-wrap\",\n",
96119
" }\n",
97-
" )\n",
120+
" ),\n",
121+
" oa_leetcode_contests.head(5).style.set_properties(\n",
122+
" **{\n",
123+
" \"text-align\": \"left\",\n",
124+
" \"white-space\": \"pre-wrap\",\n",
125+
" }\n",
126+
" ),\n",
98127
" )"
99128
]
100129
},
@@ -106,9 +135,12 @@
106135
"source": [
107136
"# Upload dataset to HF\n",
108137
"oa_leet10k.to_parquet(\"oa_leet10k.parquet\", row_group_size=100, engine=\"pyarrow\")\n",
109-
"ds = Dataset.from_parquet(\"oa_leet10k.parquet\")\n",
138+
"ds_leet10k = Dataset.from_parquet(\"oa_leet10k.parquet\")\n",
139+
"oa_leetcode_contests.to_parquet(\"oa_leetcode_contests.parquet\", row_group_size=100, engine=\"pyarrow\")\n",
140+
"ds_leetcode_contests = Dataset.from_parquet(\"oa_leetcode_contests.parquet\")\n",
110141
"# Uncomment to push dataset to HF\n",
111-
"# ds.push_to_hub(\"ehartford/oa_leet10k\")"
142+
"# ds_leet10k.push_to_hub(\"ehartford/oa_leet10k\")\n",
143+
"# ds_leetcode_contests.push_to_hub(\"ehartford/oa_leet10k\")"
112144
]
113145
}
114146
],

0 commit comments

Comments
 (0)