|
42 | 42 | "import random\n", |
43 | 43 | "from IPython.display import display\n", |
44 | 44 | "from datasets import Dataset\n", |
| 45 | + "import requests\n", |
45 | 46 | "\n", |
46 | 47 | "data_source = \"https://www.kaggle.com/datasets/erichartford/leetcode-solutions\"\n", |
| 48 | + "lc_contests_data_source = \"https://github.com/Nan-Do/LeetCodeContestsDataset/raw/main/submissions.json\"\n", |
| 49 | + "\n", |
47 | 50 | "output_dir = \"data\"\n", |
48 | 51 | "os.makedirs(output_dir, exist_ok=True)" |
49 | 52 | ] |
|
54 | 57 | "metadata": {}, |
55 | 58 | "outputs": [], |
56 | 59 | "source": [ |
57 | | - "kaggle.api.dataset_download_files(\"erichartford/leetcode-solutions\", \"data\", unzip=True)" |
| 60 | + "kaggle.api.dataset_download_files(\"erichartford/leetcode-solutions\", \"data\", unzip=True)\n", |
| 61 | + "r = requests.get(lc_contests_data_source, allow_redirects=True)\n", |
| 62 | + "with open(\"data/lc_contests.json\", \"wb\") as f:\n", |
| 63 | + " for chunk in r.iter_content(chunk_size=1024):\n", |
| 64 | + " if chunk:\n", |
| 65 | + " f.write(chunk)" |
58 | 66 | ] |
59 | 67 | }, |
60 | 68 | { |
|
64 | 72 | "outputs": [], |
65 | 73 | "source": [ |
66 | 74 | "leetcode_solutions = pd.read_json(\"data/leetcode-solutions.jsonl\", lines=True)\n", |
| 75 | + "leetcode_contests = pd.read_json(\"data/lc_contests.json\")\n", |
67 | 76 | "\n", |
68 | 77 | "# Create dataframe with columns INSTRUCTION, RESPONSE, SOURCE\n", |
69 | 78 | "# The INSTRUCTION is a random choice from ONE_STEP_TEMPLATES with the language and content filled in\n", |
|
83 | 92 | " \"SOURCE\": data_source,\n", |
84 | 93 | " }\n", |
85 | 94 | " )\n", |
| 95 | + "\n", |
| 96 | + "oa_leetcode_contests = []\n", |
| 97 | + "for index, row in leetcode_contests.iterrows():\n", |
| 98 | + " oa_leetcode_contests.append(\n", |
| 99 | + " {\n", |
| 100 | + " \"INSTRUCTION\": row[\"instruction\"] + \"\\n\" + row[\"input\"],\n", |
| 101 | + " \"RESPONSE\": row[\"output\"],\n", |
| 102 | + " \"SOURCE\": \"https://github.com/Nan-Do/LeetCodeContestsDataset\",\n", |
| 103 | + " }\n", |
| 104 | + " )\n", |
| 105 | + "\n", |
86 | 106 | "oa_leet10k = pd.DataFrame(oa_leet10k)\n", |
| 107 | + "oa_leetcode_contests = pd.DataFrame(oa_leetcode_contests)\n", |
| 108 | + "\n", |
| 109 | + "print(f\"oa_leet10k: {oa_leet10k.shape[0]}, oa_leetcode_contests: {oa_leetcode_contests.shape[0]}\")\n", |
87 | 110 | "\n", |
88 | 111 | "# Print the first 5 rows of each dataframe with full width and newline characters correctly displayed in the RESPONSE column\n", |
89 | 112 | "with pd.option_context(\"display.max_colwidth\", 80):\n", |
|
94 | 117 | " \"text-align\": \"left\",\n", |
95 | 118 | " \"white-space\": \"pre-wrap\",\n", |
96 | 119 | " }\n", |
97 | | - " )\n", |
| 120 | + " ),\n", |
| 121 | + " oa_leetcode_contests.head(5).style.set_properties(\n", |
| 122 | + " **{\n", |
| 123 | + " \"text-align\": \"left\",\n", |
| 124 | + " \"white-space\": \"pre-wrap\",\n", |
| 125 | + " }\n", |
| 126 | + " ),\n", |
98 | 127 | " )" |
99 | 128 | ] |
100 | 129 | }, |
|
106 | 135 | "source": [ |
107 | 136 | "# Upload dataset to HF\n", |
108 | 137 | "oa_leet10k.to_parquet(\"oa_leet10k.parquet\", row_group_size=100, engine=\"pyarrow\")\n", |
109 | | - "ds = Dataset.from_parquet(\"oa_leet10k.parquet\")\n", |
| 138 | + "ds_leet10k = Dataset.from_parquet(\"oa_leet10k.parquet\")\n", |
| 139 | + "oa_leetcode_contests.to_parquet(\"oa_leetcode_contests.parquet\", row_group_size=100, engine=\"pyarrow\")\n", |
| 140 | + "ds_leetcode_contests = Dataset.from_parquet(\"oa_leetcode_contests.parquet\")\n", |
110 | 141 | "# Uncomment to push dataset to HF\n", |
111 | | - "# ds.push_to_hub(\"ehartford/oa_leet10k\")" |
| 142 | + "# ds_leet10k.push_to_hub(\"ehartford/oa_leet10k\")\n", |
| 143 | + "# ds_leetcode_contests.push_to_hub(\"ehartford/oa_leetcode_contests\")" |
112 | 144 | ] |
113 | 145 | } |
114 | 146 | ], |
|
0 commit comments