|
61 | 61 | "\n", |
62 | 62 | "The dataset is available on [Kaggle](https://www.kaggle.com/chadgostopp/recsys-challenge-2015). You need to download it and copy to the `DATA_FOLDER` path. Note that we are only using the `yoochoose-clicks.dat` file.\n", |
63 | 63 | "\n", |
| 64 | + "Alternatively, you can generate a synthetic dataset with the same columns and dtypes as the `YOOCHOOSE` dataset. By default it covers the date range `2014/4/1` to `2014/4/5`, which can be changed with the `START_DATE` and `END_DATE` environment variables. If the environment variable `USE_SYNTHETIC` is set to `True`, the code below will execute the function `generate_synthetic_data` and the rest of the notebook will run on a synthetic dataset.\n", |
| 65 | + "\n", |
64 | 66 | "First, let's start by importing several libraries:" |
65 | 67 | ] |
66 | 68 | }, |
|
75 | 77 | "output_type": "stream", |
76 | 78 | "text": [ |
77 | 79 | "/usr/local/lib/python3.8/dist-packages/merlin/dtypes/mappings/tf.py:52: UserWarning: Tensorflow dtype mappings did not load successfully due to an error: No module named 'tensorflow'\n", |
78 | | - " warn(f\"Tensorflow dtype mappings did not load successfully due to an error: {exc.msg}\")\n", |
79 | | - "/usr/local/lib/python3.8/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", |
80 | | - " from .autonotebook import tqdm as notebook_tqdm\n" |
| 80 | + " warn(f\"Tensorflow dtype mappings did not load successfully due to an error: {exc.msg}\")\n" |
81 | 81 | ] |
82 | 82 | } |
83 | 83 | ], |
84 | 84 | "source": [ |
85 | 85 | "import os\n", |
86 | 86 | "import glob\n", |
87 | 87 | "import numpy as np\n", |
| 88 | + "import pandas as pd\n", |
88 | 89 | "import gc\n", |
| 90 | + "import calendar\n", |
| 91 | + "import datetime\n", |
89 | 92 | "\n", |
90 | 93 | "import cudf\n", |
91 | 94 | "import cupy\n", |
|
128 | 131 | "metadata": {}, |
129 | 132 | "outputs": [], |
130 | 133 | "source": [ |
131 | | - "DATA_FOLDER = \"/workspace/data/\"\n", |
| 134 | + "DATA_FOLDER = os.environ.get(\"DATA_FOLDER\", \"/workspace/data\")\n", |
132 | 135 | "FILENAME_PATTERN = 'yoochoose-clicks.dat'\n", |
133 | 136 | "DATA_PATH = os.path.join(DATA_FOLDER, FILENAME_PATTERN)\n", |
134 | 137 | "\n", |
135 | 138 | "OUTPUT_FOLDER = \"./yoochoose_transformed\"\n", |
136 | | - "OVERWRITE = False" |
| 139 | + "OVERWRITE = False\n", |
| 140 | + "\n", |
| 141 | + "USE_SYNTHETIC = str(os.environ.get(\"USE_SYNTHETIC\", \"False\")).strip().lower() in (\"1\", \"true\", \"yes\")  # env values are strings: parse explicitly so 'False' or '0' does not enable synthetic mode" |
137 | 142 | ] |
138 | 143 | }, |
139 | 144 | { |
|
144 | 149 | "## Load and clean raw data" |
145 | 150 | ] |
146 | 151 | }, |
| 152 | + { |
| 153 | + "cell_type": "markdown", |
| 154 | + "id": "3fba8546-668c-4743-960e-ea2aef99ef24", |
| 155 | + "metadata": {}, |
| 156 | + "source": [ |
| 157 | + "The cell below defines the `generate_synthetic_data` helper, which is used when you work with synthetic data. If you are using the real dataset, you can skip it and continue with the next cell." |
| 158 | + ] |
| 159 | + }, |
147 | 160 | { |
148 | 161 | "cell_type": "code", |
149 | 162 | "execution_count": 5, |
| 163 | + "id": "07d14289-c783-45f0-86e8-e5c1001bfd76", |
| 164 | + "metadata": {}, |
| 165 | + "outputs": [], |
| 166 | + "source": [ |
| 167 | + "def generate_synthetic_data(\n", |
| 168 | + "    start_date: datetime.date, end_date: datetime.date, rows_per_day: int = 10000\n", |
| 169 | + ") -> pd.DataFrame:\n", |
| 170 | + "    \"\"\"Generate a synthetic click log with session_id, item_id, category and timestamp columns.\n", |
| 171 | + "\n", |
| 172 | + "    Produces rows_per_day rows for each whole day between start_date and end_date. Every session\n", |
| 173 | + "    gets one random start inside [start_date, end_date); timestamps are integer epoch seconds.\n", |
| 174 | + "    \"\"\"\n", |
| 175 | + "    assert end_date > start_date, \"end_date must be later than start_date\"\n", |
| 176 | + "\n", |
| 177 | + "    number_of_days = (end_date - start_date).days\n", |
| 178 | + "    total_number_of_rows = number_of_days * rows_per_day\n", |
| 179 | + "\n", |
| 180 | + "    # Long-tailed item popularity: a few items receive most of the interactions.\n", |
| 181 | + "    long_tailed_item_distribution = np.clip(\n", |
| 182 | + "        np.random.lognormal(3.0, 1.0, total_number_of_rows).astype(np.int64), 1, 50000\n", |
| 183 | + "    )\n", |
| 184 | + "\n", |
| 185 | + "    # generate random item interaction features\n", |
| 186 | + "    df = pd.DataFrame(\n", |
| 187 | + "        {\n", |
| 188 | + "            \"session_id\": np.random.randint(70000, 80000, total_number_of_rows),\n", |
| 189 | + "            \"item_id\": long_tailed_item_distribution,\n", |
| 190 | + "        },\n", |
| 191 | + "    )\n", |
| 192 | + "\n", |
| 193 | + "    # generate category mapping for each item-id\n", |
| 194 | + "    item_bins = pd.cut(df[\"item_id\"], bins=334, labels=np.arange(1, 335))\n", |
| 195 | + "    df[\"category\"] = item_bins.astype(np.int64)\n", |
| 196 | + "\n", |
| 197 | + "    max_session_length = 60 * 60  # 1 hour\n", |
| 198 | + "    seconds_per_day = 86_400\n", |
| 199 | + "\n", |
| 200 | + "    # Draw one random start per session: a day offset from start_date plus a second of that day.\n", |
| 201 | + "    # Done with a vectorized lookup instead of groupby().apply(), whose index/column handling\n", |
| 202 | + "    # differs between pandas versions and made the trailing reset_index() fragile.\n", |
| 203 | + "    session_ids = df[\"session_id\"].unique()\n", |
| 204 | + "    session_starts = pd.Series(\n", |
| 205 | + "        calendar.timegm(start_date.timetuple())\n", |
| 206 | + "        + np.random.randint(0, number_of_days, len(session_ids)) * seconds_per_day\n", |
| 207 | + "        + np.random.randint(0, seconds_per_day, len(session_ids)),\n", |
| 208 | + "        index=session_ids,\n", |
| 209 | + "    )\n", |
| 210 | + "\n", |
| 211 | + "    # Each click is offset from its session start by a log-normal delay capped at one hour.\n", |
| 212 | + "    click_offsets = np.random.lognormal(3.0, 1.0, total_number_of_rows).astype(np.int64)\n", |
| 213 | + "    click_offsets = np.clip(click_offsets, 0, max_session_length)\n", |
| 214 | + "    df[\"timestamp\"] = df[\"session_id\"].map(session_starts) + click_offsets\n", |
| 215 | + "    return df" |
| 216 | + ] |
| 217 | + }, |
| 218 | + { |
| 219 | + "cell_type": "code", |
| 220 | + "execution_count": 6, |
150 | 221 | "id": "f35dff52", |
151 | 222 | "metadata": {}, |
152 | 223 | "outputs": [], |
153 | 224 | "source": [ |
154 | | - "interactions_df = cudf.read_csv(DATA_PATH, sep=',', \n", |
155 | | - " names=['session_id','timestamp', 'item_id', 'category'], \n", |
156 | | - " dtype=['int', 'datetime64[s]', 'int', 'int'])" |
| 225 | + "if USE_SYNTHETIC:\n", |
| 226 | + " START_DATE = os.environ.get(\"START_DATE\", \"2014/4/1\")\n", |
| 227 | + " END_DATE = os.environ.get(\"END_DATE\", \"2014/4/5\")\n", |
| 228 | + " interactions_df = generate_synthetic_data(datetime.datetime.strptime(START_DATE, '%Y/%m/%d'),\n", |
| 229 | + " datetime.datetime.strptime(END_DATE, '%Y/%m/%d'))\n", |
| 230 | + " interactions_df = cudf.from_pandas(interactions_df)\n", |
| 231 | + "else:\n", |
| 232 | + " interactions_df = cudf.read_csv(DATA_PATH, sep=',', \n", |
| 233 | + " names=['session_id','timestamp', 'item_id', 'category'], \n", |
| 234 | + " dtype=['int', 'datetime64[s]', 'int', 'int'])" |
157 | 235 | ] |
158 | 236 | }, |
159 | 237 | { |
|
166 | 244 | }, |
167 | 245 | { |
168 | 246 | "cell_type": "code", |
169 | | - "execution_count": 6, |
| 247 | + "execution_count": 7, |
170 | 248 | "id": "22c2df72", |
171 | 249 | "metadata": {}, |
172 | 250 | "outputs": [ |
|
181 | 259 | ], |
182 | 260 | "source": [ |
183 | 261 | "print(\"Count with in-session repeated interactions: {}\".format(len(interactions_df)))\n", |
| 262 | + "\n", |
184 | 263 | "# Sorts the dataframe by session and timestamp, to remove consecutive repetitions\n", |
185 | 264 | "interactions_df.timestamp = interactions_df.timestamp.astype(int)\n", |
186 | 265 | "interactions_df = interactions_df.sort_values(['session_id', 'timestamp'])\n", |
187 | 266 | "past_ids = interactions_df['item_id'].shift(1).fillna()\n", |
188 | 267 | "session_past_ids = interactions_df['session_id'].shift(1).fillna()\n", |
| 268 | + "\n", |
189 | 269 | "# Keeping only no consecutive repeated in session interactions\n", |
190 | 270 | "interactions_df = interactions_df[~((interactions_df['session_id'] == session_past_ids) & (interactions_df['item_id'] == past_ids))]\n", |
| 271 | + "\n", |
191 | 272 | "print(\"Count after removed in-session repeated interactions: {}\".format(len(interactions_df)))" |
192 | 273 | ] |
193 | 274 | }, |
|
201 | 282 | }, |
202 | 283 | { |
203 | 284 | "cell_type": "code", |
204 | | - "execution_count": 7, |
| 285 | + "execution_count": 8, |
205 | 286 | "id": "66a1bd13", |
206 | 287 | "metadata": {}, |
207 | 288 | "outputs": [ |
|
234 | 315 | }, |
235 | 316 | { |
236 | 317 | "cell_type": "code", |
237 | | - "execution_count": 8, |
| 318 | + "execution_count": 9, |
238 | 319 | "id": "a0f908a1", |
239 | 320 | "metadata": {}, |
240 | 321 | "outputs": [], |
241 | 322 | "source": [ |
| 323 | + "# Create the output folder (including any missing parents); no error if it already exists\n", |
| 324 | + "os.makedirs(DATA_FOLDER, exist_ok=True)\n", |
242 | 325 | "interactions_merged_df.to_parquet(os.path.join(DATA_FOLDER, 'interactions_merged_df.parquet'))" |
243 | 326 | ] |
244 | 327 | }, |
245 | 328 | { |
246 | 329 | "cell_type": "code", |
247 | | - "execution_count": 9, |
| 330 | + "execution_count": 10, |
248 | 331 | "id": "909f87c5-bff5-48c8-b714-cc556a4bc64d", |
249 | 332 | "metadata": { |
250 | 333 | "tags": [] |
|
265 | 348 | }, |
266 | 349 | { |
267 | 350 | "cell_type": "code", |
268 | | - "execution_count": 10, |
| 351 | + "execution_count": 11, |
269 | 352 | "id": "04a3b5b7", |
270 | 353 | "metadata": {}, |
271 | 354 | "outputs": [ |
272 | 355 | { |
273 | 356 | "data": { |
274 | 357 | "text/plain": [ |
275 | | - "0" |
| 358 | + "517" |
276 | 359 | ] |
277 | 360 | }, |
278 | | - "execution_count": 10, |
| 361 | + "execution_count": 11, |
279 | 362 | "metadata": {}, |
280 | 363 | "output_type": "execute_result" |
281 | 364 | } |
|
330 | 413 | }, |
331 | 414 | { |
332 | 415 | "cell_type": "code", |
333 | | - "execution_count": 11, |
| 416 | + "execution_count": 13, |
334 | 417 | "id": "86f80068", |
335 | 418 | "metadata": {}, |
336 | 419 | "outputs": [], |
|
425 | 508 | }, |
426 | 509 | { |
427 | 510 | "cell_type": "code", |
428 | | - "execution_count": 12, |
| 511 | + "execution_count": 14, |
429 | 512 | "id": "10b5c96c", |
430 | 513 | "metadata": {}, |
431 | 514 | "outputs": [], |
|
447 | 530 | "# Truncate sequence features to first interacted 20 items \n", |
448 | 531 | "SESSIONS_MAX_LENGTH = 20 \n", |
449 | 532 | "\n", |
450 | | - "\n", |
451 | 533 | "item_feat = groupby_features['item_id-list'] >> nvt.ops.TagAsItemID()\n", |
452 | 534 | "cont_feats = groupby_features['et_dayofweek_sin-list', 'product_recency_days_log_norm-list'] >> nvt.ops.AddMetadata(tags=[Tags.CONTINUOUS])\n", |
453 | 535 | "\n", |
|
491 | 573 | }, |
492 | 574 | { |
493 | 575 | "cell_type": "code", |
494 | | - "execution_count": 13, |
| 576 | + "execution_count": 15, |
495 | 577 | "id": "45803886", |
496 | 578 | "metadata": {}, |
497 | 579 | "outputs": [], |
|
513 | 595 | }, |
514 | 596 | { |
515 | 597 | "cell_type": "code", |
516 | | - "execution_count": 14, |
| 598 | + "execution_count": 16, |
517 | 599 | "id": "4c10efb5-89c5-4458-a634-475eb459a47c", |
518 | 600 | "metadata": { |
519 | 601 | "tags": [] |
|
600 | 682 | " <tr>\n", |
601 | 683 | " <th>2</th>\n", |
602 | 684 | " <td>item_id-list</td>\n", |
603 | | - " <td>(Tags.CATEGORICAL, Tags.ITEM, Tags.ID, Tags.LIST)</td>\n", |
| 685 | + " <td>(Tags.CATEGORICAL, Tags.ID, Tags.LIST, Tags.ITEM)</td>\n", |
604 | 686 | " <td>DType(name='int64', element_type=<ElementType....</td>\n", |
605 | 687 | " <td>True</td>\n", |
606 | 688 | " <td>True</td>\n", |
|
697 | 779 | "</div>" |
698 | 780 | ], |
699 | 781 | "text/plain": [ |
700 | | - "[{'name': 'session_id', 'tags': {<Tags.CATEGORICAL: 'categorical'>}, 'properties': {}, 'dtype': DType(name='int64', element_type=<ElementType.Int: 'int'>, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}, {'name': 'item_id-count', 'tags': {<Tags.CATEGORICAL: 'categorical'>}, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': './/categories/unique.item_id.parquet', 'domain': {'min': 0, 'max': 52741, 'name': 'item_id'}, 'embedding_sizes': {'cardinality': 52742, 'dimension': 512}}, 'dtype': DType(name='int32', element_type=<ElementType.Int: 'int'>, element_size=32, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}, {'name': 'item_id-list', 'tags': {<Tags.CATEGORICAL: 'categorical'>, <Tags.ITEM: 'item'>, <Tags.ID: 'id'>, <Tags.LIST: 'list'>}, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': './/categories/unique.item_id.parquet', 'domain': {'min': 0, 'max': 52741, 'name': 'item_id'}, 'embedding_sizes': {'cardinality': 52742, 'dimension': 512}, 'value_count': {'min': 0, 'max': 20}}, 'dtype': DType(name='int64', element_type=<ElementType.Int: 'int'>, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None), Dimension(min=0, max=20)))), 'is_list': True, 'is_ragged': True}, {'name': 'et_dayofweek_sin-list', 'tags': {<Tags.CONTINUOUS: 'continuous'>, <Tags.LIST: 'list'>}, 'properties': {'value_count': {'min': 0, 'max': 20}}, 'dtype': DType(name='float64', element_type=<ElementType.Float: 'float'>, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None), Dimension(min=0, max=20)))), 'is_list': True, 'is_ragged': True}, {'name': 'product_recency_days_log_norm-list', 'tags': {<Tags.CONTINUOUS: 'continuous'>, <Tags.LIST: 'list'>}, 'properties': {'value_count': {'min': 0, 
'max': 20}}, 'dtype': DType(name='float32', element_type=<ElementType.Float: 'float'>, element_size=32, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None), Dimension(min=0, max=20)))), 'is_list': True, 'is_ragged': True}, {'name': 'category-list', 'tags': {<Tags.CATEGORICAL: 'categorical'>, <Tags.LIST: 'list'>}, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': './/categories/unique.category.parquet', 'domain': {'min': 0, 'max': 336, 'name': 'category'}, 'embedding_sizes': {'cardinality': 337, 'dimension': 42}, 'value_count': {'min': 0, 'max': 20}}, 'dtype': DType(name='int64', element_type=<ElementType.Int: 'int'>, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None), Dimension(min=0, max=20)))), 'is_list': True, 'is_ragged': True}, {'name': 'day_index', 'tags': {<Tags.CATEGORICAL: 'categorical'>}, 'properties': {}, 'dtype': DType(name='int64', element_type=<ElementType.Int: 'int'>, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}]" |
| 782 | + "[{'name': 'session_id', 'tags': {<Tags.CATEGORICAL: 'categorical'>}, 'properties': {}, 'dtype': DType(name='int64', element_type=<ElementType.Int: 'int'>, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}, {'name': 'item_id-count', 'tags': {<Tags.CATEGORICAL: 'categorical'>}, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': './/categories/unique.item_id.parquet', 'domain': {'min': 0, 'max': 52741, 'name': 'item_id'}, 'embedding_sizes': {'cardinality': 52742, 'dimension': 512}}, 'dtype': DType(name='int32', element_type=<ElementType.Int: 'int'>, element_size=32, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}, {'name': 'item_id-list', 'tags': {<Tags.CATEGORICAL: 'categorical'>, <Tags.ID: 'id'>, <Tags.LIST: 'list'>, <Tags.ITEM: 'item'>}, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': './/categories/unique.item_id.parquet', 'domain': {'min': 0, 'max': 52741, 'name': 'item_id'}, 'embedding_sizes': {'cardinality': 52742, 'dimension': 512}, 'value_count': {'min': 0, 'max': 20}}, 'dtype': DType(name='int64', element_type=<ElementType.Int: 'int'>, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None), Dimension(min=0, max=20)))), 'is_list': True, 'is_ragged': True}, {'name': 'et_dayofweek_sin-list', 'tags': {<Tags.CONTINUOUS: 'continuous'>, <Tags.LIST: 'list'>}, 'properties': {'value_count': {'min': 0, 'max': 20}}, 'dtype': DType(name='float64', element_type=<ElementType.Float: 'float'>, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None), Dimension(min=0, max=20)))), 'is_list': True, 'is_ragged': True}, {'name': 'product_recency_days_log_norm-list', 'tags': {<Tags.CONTINUOUS: 'continuous'>, <Tags.LIST: 'list'>}, 'properties': {'value_count': {'min': 0, 
'max': 20}}, 'dtype': DType(name='float32', element_type=<ElementType.Float: 'float'>, element_size=32, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None), Dimension(min=0, max=20)))), 'is_list': True, 'is_ragged': True}, {'name': 'category-list', 'tags': {<Tags.CATEGORICAL: 'categorical'>, <Tags.LIST: 'list'>}, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': './/categories/unique.category.parquet', 'domain': {'min': 0, 'max': 336, 'name': 'category'}, 'embedding_sizes': {'cardinality': 337, 'dimension': 42}, 'value_count': {'min': 0, 'max': 20}}, 'dtype': DType(name='int64', element_type=<ElementType.Int: 'int'>, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None), Dimension(min=0, max=20)))), 'is_list': True, 'is_ragged': True}, {'name': 'day_index', 'tags': {<Tags.CATEGORICAL: 'categorical'>}, 'properties': {}, 'dtype': DType(name='int64', element_type=<ElementType.Int: 'int'>, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}]" |
701 | 783 | ] |
702 | 784 | }, |
703 | | - "execution_count": 14, |
| 785 | + "execution_count": 16, |
704 | 786 | "metadata": {}, |
705 | 787 | "output_type": "execute_result" |
706 | 788 | } |
|
719 | 801 | }, |
720 | 802 | { |
721 | 803 | "cell_type": "code", |
722 | | - "execution_count": 15, |
| 804 | + "execution_count": 17, |
723 | 805 | "id": "2d035a88-2146-4b9a-96fd-dd42be86e2a1", |
724 | 806 | "metadata": {}, |
725 | 807 | "outputs": [], |
|
747 | 829 | }, |
748 | 830 | { |
749 | 831 | "cell_type": "code", |
750 | | - "execution_count": 16, |
| 832 | + "execution_count": 18, |
751 | 833 | "id": "2b4f5b73-459c-4356-87c8-9afb974cc77d", |
752 | 834 | "metadata": {}, |
753 | 835 | "outputs": [], |
754 | 836 | "source": [ |
755 | 837 | "# read in the processed train dataset\n", |
756 | 838 | "sessions_gdf = cudf.read_parquet(os.path.join(DATA_FOLDER, \"processed_nvt/part_0.parquet\"))\n", |
757 | | - "sessions_gdf = sessions_gdf[sessions_gdf.day_index>=178]" |
| 839 | + "if USE_SYNTHETIC:\n", |
| 840 | + " THRESHOLD_DAY_INDEX = int(os.environ.get(\"THRESHOLD_DAY_INDEX\", '1'))\n", |
| 841 | + " sessions_gdf = sessions_gdf[sessions_gdf.day_index>=THRESHOLD_DAY_INDEX]\n", |
| 842 | + "else:\n", |
| 843 | + " sessions_gdf = sessions_gdf[sessions_gdf.day_index>=178]" |
758 | 844 | ] |
759 | 845 | }, |
760 | 846 | { |
761 | 847 | "cell_type": "code", |
762 | | - "execution_count": 17, |
| 848 | + "execution_count": 19, |
763 | 849 | "id": "e18d9c63", |
764 | 850 | "metadata": {}, |
765 | 851 | "outputs": [ |
|
783 | 869 | "6606149 [-0.7818309228245777, -0.7818309228245777] \n", |
784 | 870 | "\n", |
785 | 871 | " product_recency_days_log_norm-list \\\n", |
786 | | - "6606147 [1.5241553, 1.5238751, 1.5239341, 1.5241631, 1... \n", |
787 | | - "6606148 [-0.5330064, 1.521494] \n", |
788 | | - "6606149 [1.5338266, 1.5355074] \n", |
| 872 | + "6606147 [1.5241561, 1.523876, 1.523935, 1.5241641, 1.5... \n", |
| 873 | + "6606148 [-0.533007, 1.521495] \n", |
| 874 | + "6606149 [1.5338274, 1.5355083] \n", |
789 | 875 | "\n", |
790 | 876 | " category-list day_index \n", |
791 | | - "6606147 [4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4] 178 \n", |
792 | | - "6606148 [3, 3] 178 \n", |
| 877 | + "6606147 [4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 4, 4] 178 \n", |
| 878 | + "6606148 [1, 3] 178 \n", |
793 | 879 | "6606149 [8, 8] 180 \n" |
794 | 880 | ] |
795 | 881 | } |
|
800 | 886 | }, |
801 | 887 | { |
802 | 888 | "cell_type": "code", |
803 | | - "execution_count": 18, |
| 889 | + "execution_count": 20, |
804 | 890 | "id": "5175aeaf", |
805 | 891 | "metadata": {}, |
806 | 892 | "outputs": [ |
807 | 893 | { |
808 | 894 | "name": "stderr", |
809 | 895 | "output_type": "stream", |
810 | 896 | "text": [ |
811 | | - "Creating time-based splits: 100%|██████████| 5/5 [00:02<00:00, 2.37it/s]\n" |
| 897 | + "Creating time-based splits: 100%|██████████| 5/5 [00:02<00:00, 2.24it/s]\n" |
812 | 898 | ] |
813 | 899 | } |
814 | 900 | ], |
|
823 | 909 | }, |
824 | 910 | { |
825 | 911 | "cell_type": "code", |
826 | | - "execution_count": 19, |
| 912 | + "execution_count": 21, |
827 | 913 | "id": "3bd1bad9", |
828 | 914 | "metadata": {}, |
829 | 915 | "outputs": [ |
830 | 916 | { |
831 | 917 | "data": { |
832 | 918 | "text/plain": [ |
833 | | - "583" |
| 919 | + "748" |
834 | 920 | ] |
835 | 921 | }, |
836 | | - "execution_count": 19, |
| 922 | + "execution_count": 21, |
837 | 923 | "metadata": {}, |
838 | 924 | "output_type": "execute_result" |
839 | 925 | } |
|
0 commit comments