Skip to content

Commit fe92eb2

Browse files
committed
Tutorials: Added init tutorial notebooks
1 parent 1b16c2f commit fe92eb2

11 files changed

+54934
-0
lines changed

tutorial_notebooks/annotations.ipynb

Lines changed: 1912 additions & 0 deletions
Large diffs are not rendered by default.

tutorial_notebooks/benchmarks.ipynb

Lines changed: 1499 additions & 0 deletions
Large diffs are not rendered by default.

tutorial_notebooks/handling_missing_data.ipynb

Lines changed: 3766 additions & 0 deletions
Large diffs are not rendered by default.

tutorial_notebooks/series_based_choosing_data.ipynb

Lines changed: 1150 additions & 0 deletions
Large diffs are not rendered by default.

tutorial_notebooks/series_based_loading_data.ipynb

Lines changed: 14116 additions & 0 deletions
Large diffs are not rendered by default.

tutorial_notebooks/series_based_using_scalers.ipynb

Lines changed: 1404 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 394 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,394 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Example of usage for simple forecasting"
8+
]
9+
},
10+
{
11+
"cell_type": "markdown",
12+
"metadata": {},
13+
"source": [
14+
"### Import"
15+
]
16+
},
17+
{
18+
"cell_type": "code",
19+
"execution_count": 1,
20+
"metadata": {},
21+
"outputs": [],
22+
"source": [
23+
"import tqdm\n",
24+
"import torch\n",
25+
"import torch.nn as nn\n",
26+
"import torch.optim as optim\n",
27+
"import numpy as np\n",
28+
"from sklearn.metrics import mean_squared_error\n",
29+
"\n",
30+
"from cesnet_tszoo.utils.enums import FillerType, ScalerType\n",
31+
"from cesnet_tszoo.benchmarks import load_benchmark"
32+
]
33+
},
34+
{
35+
"cell_type": "markdown",
36+
"metadata": {},
37+
"source": [
38+
"### Preparing dataset"
39+
]
40+
},
41+
{
42+
"cell_type": "code",
43+
"execution_count": 2,
44+
"metadata": {},
45+
"outputs": [
46+
{
47+
"name": "stdout",
48+
"output_type": "stream",
49+
"text": [
50+
"File size: 0.01GB\n",
51+
"Remaining: 0.01GB\n"
52+
]
53+
},
54+
{
55+
"name": "stderr",
56+
"output_type": "stream",
57+
"text": [
58+
"100%|██████████| 9.59M/9.59M [00:00<00:00, 25.2MB/s]\n",
59+
"100%|██████████| 283/283 [00:03<00:00, 71.15it/s]"
60+
]
61+
},
62+
{
63+
"name": "stdout",
64+
"output_type": "stream",
65+
"text": [
66+
"\n",
67+
"Config Details\n",
68+
" Used for database: CESNET-TimeSeries24\n",
69+
" Aggregation: AgreggationType.AGG_1_DAY\n",
70+
" Source: SourceType.INSTITUTIONS\n",
71+
"\n",
72+
" Time series\n",
73+
" Time series IDS: [ 30 222 276 48 243 ... 112 19 15 101 117], Length=283\n",
74+
" Test time series IDS: None\n",
75+
" Time periods\n",
76+
" Train time periods: range(0, 168)\n",
77+
" Val time periods: range(161, 196)\n",
78+
" Test time periods: range(189, 280)\n",
79+
" All time periods: range(0, 280)\n",
80+
" Features\n",
81+
" Taken features: ['n_bytes']\n",
82+
" Default values: [nan]\n",
83+
" Time series ID included: False\n",
84+
" Time included: False\n",
85+
" Sliding window\n",
86+
" Sliding window size: 7\n",
87+
" Sliding window prediction size: 1\n",
88+
" Sliding window step size: 1\n",
89+
" Set shared size: 7\n",
90+
" Fillers\n",
91+
" Filler type: None\n",
92+
" Scalers\n",
93+
" Scaler type: None\n",
94+
" Batch sizes\n",
95+
" Train batch size: 32\n",
96+
" Val batch size: 64\n",
97+
" Test batch size: 128\n",
98+
" All batch size: 128\n",
99+
" Default workers\n",
100+
" Init worker count: 4\n",
101+
" Train worker count: 4\n",
102+
" Val worker count: 3\n",
103+
" Test worker count: 2\n",
104+
" All worker count: 4\n",
105+
" Other\n",
106+
" Nan threshold: 1.0\n",
107+
" Random state: None\n",
108+
" \n"
109+
]
110+
},
111+
{
112+
"name": "stderr",
113+
"output_type": "stream",
114+
"text": [
115+
"\n"
116+
]
117+
}
118+
],
119+
"source": [
120+
"benchmark = load_benchmark(identifier=\"0d523e69c328\", data_root=\"/some_directory/\")\n",
121+
"dataset = benchmark.get_initialized_dataset()"
122+
]
123+
},
124+
{
125+
"cell_type": "markdown",
126+
"metadata": {},
127+
"source": [
128+
"### Changing used config values"
129+
]
130+
},
131+
{
132+
"cell_type": "code",
133+
"execution_count": 3,
134+
"metadata": {},
135+
"outputs": [
136+
{
137+
"name": "stderr",
138+
"output_type": "stream",
139+
"text": [
140+
"100%|██████████| 283/283 [00:03<00:00, 72.20it/s]\n",
141+
"100%|██████████| 283/283 [00:04<00:00, 70.63it/s]\n",
142+
"100%|██████████| 283/283 [00:03<00:00, 70.90it/s]\n",
143+
"100%|██████████| 283/283 [00:03<00:00, 72.05it/s]"
144+
]
145+
},
146+
{
147+
"name": "stdout",
148+
"output_type": "stream",
149+
"text": [
150+
"\n",
151+
"Config Details\n",
152+
" Used for database: CESNET-TimeSeries24\n",
153+
" Aggregation: AgreggationType.AGG_1_DAY\n",
154+
" Source: SourceType.INSTITUTIONS\n",
155+
"\n",
156+
" Time series\n",
157+
" Time series IDS: [ 30 222 276 48 243 ... 112 19 15 101 117], Length=283\n",
158+
" Test time series IDS: None\n",
159+
" Time periods\n",
160+
" Train time periods: range(0, 168)\n",
161+
" Val time periods: range(144, 196)\n",
162+
" Test time periods: range(172, 280)\n",
163+
" All time periods: range(0, 280)\n",
164+
" Features\n",
165+
" Taken features: ['n_bytes']\n",
166+
" Default values: [0.]\n",
167+
" Time series ID included: False\n",
168+
" Time included: False\n",
169+
" Sliding window\n",
170+
" Sliding window size: 24\n",
171+
" Sliding window prediction size: 1\n",
172+
" Sliding window step size: 1\n",
173+
" Set shared size: 24\n",
174+
" Fillers\n",
175+
" Filler type: mean_filler\n",
176+
" Scalers\n",
177+
" Scaler type: min_max_scaler\n",
178+
" Is scaler per Time series: True\n",
179+
" Are scalers premade: False\n",
180+
" Are premade scalers partial_fitted: False\n",
181+
" Batch sizes\n",
182+
" Train batch size: 32\n",
183+
" Val batch size: 64\n",
184+
" Test batch size: 128\n",
185+
" All batch size: 128\n",
186+
" Default workers\n",
187+
" Init worker count: 4\n",
188+
" Train worker count: 4\n",
189+
" Val worker count: 3\n",
190+
" Test worker count: 2\n",
191+
" All worker count: 4\n",
192+
" Other\n",
193+
" Nan threshold: 1.0\n",
194+
" Random state: None\n",
195+
" \n"
196+
]
197+
},
198+
{
199+
"name": "stderr",
200+
"output_type": "stream",
201+
"text": [
202+
"\n"
203+
]
204+
}
205+
],
206+
"source": [
207+
"# (optional) Set default value for missing data \n",
208+
"dataset.set_default_values(0)\n",
209+
"\n",
210+
"# (optional) Set filler for filling missing data \n",
211+
"dataset.apply_filler(FillerType.MEAN_FILLER)\n",
212+
"\n",
213+
"# (optional) Set scaler for data\n",
214+
"dataset.apply_scaler(ScalerType.MIN_MAX_SCALER)\n",
215+
"\n",
216+
"# (optional) Change sliding window setting\n",
217+
"dataset.set_sliding_window(sliding_window_size=24, sliding_window_prediction_size=1, sliding_window_step=1, set_shared_size=24)\n",
218+
"\n",
219+
"# (optional) Change batch sizes\n",
220+
"dataset.set_batch_sizes()\n",
221+
"\n",
222+
"# Display final config\n",
223+
"dataset.display_config()"
224+
]
225+
},
226+
{
227+
"cell_type": "markdown",
228+
"metadata": {},
229+
"source": [
230+
"### Using simple LSTM model"
231+
]
232+
},
233+
{
234+
"cell_type": "markdown",
235+
"metadata": {},
236+
"source": [
237+
"#### Creating class for model"
238+
]
239+
},
240+
{
241+
"cell_type": "code",
242+
"execution_count": 4,
243+
"metadata": {},
244+
"outputs": [],
245+
"source": [
246+
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
247+
"\n",
class SimpleLSTM(nn.Module):
    """Single-layer LSTM followed by a linear head, used as a simple forecaster.

    The model reads a window of past values and emits one prediction step
    computed from the last hidden state of the LSTM.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        output_size: Number of values predicted per step.
    """

    def __init__(self, input_size=1, hidden_size=8, output_size=1):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return a prediction of shape (batch, 1, output_size) for window batch `x`."""
        _, (h_n, _) = self.lstm(x)   # h_n: (1, batch, hidden)
        out = self.fc(h_n[-1])       # (batch, output_size)
        return out.unsqueeze(1)      # (batch, 1, output_size)

    @staticmethod
    def _to_float_tensor(values, device):
        """Convert a batch (array-like or tensor) to a float32 tensor on `device`."""
        # as_tensor accepts both arrays and tensors without the copy/warning that
        # torch.tensor() produces when it is handed an existing tensor.
        return torch.as_tensor(values, dtype=torch.float32).to(device)

    def fit(self, train_dataloader, val_dataloader, n_epochs, device):
        """Train with Adam (lr=0.01) on MSE loss and track validation loss.

        Every training batch is used in every epoch; validation is evaluated
        afterwards in a separate pass, so a short validation loader no longer
        cuts the training epoch short.

        Args:
            train_dataloader: Re-iterable yielding (inputs, targets) pairs.
            val_dataloader: Re-iterable yielding (inputs, targets) pairs.
            n_epochs: Number of passes over `train_dataloader`.
            device: Torch device the batches are moved to.

        Returns:
            dict with keys "train_loss" and "val_loss"; each is a list holding
            the mean batch loss of every epoch (NaN when a loader was empty).
        """
        criterion = nn.MSELoss()
        optimizer = optim.Adam(self.parameters(), lr=0.01)
        history = {"train_loss": [], "val_loss": []}

        self.train()
        for _ in range(n_epochs):
            train_losses = []
            for batch_x, batch_y in train_dataloader:
                batch_x = self._to_float_tensor(batch_x, device)
                batch_y = self._to_float_tensor(batch_y, device)

                optimizer.zero_grad()
                loss = criterion(self(batch_x), batch_y)
                loss.backward()
                optimizer.step()
                train_losses.append(loss.item())

            # Validation pass: no gradients, evaluation mode.
            val_losses = []
            self.eval()
            with torch.no_grad():
                for batch_x_val, batch_y_val in val_dataloader:
                    batch_x_val = self._to_float_tensor(batch_x_val, device)
                    batch_y_val = self._to_float_tensor(batch_y_val, device)
                    val_losses.append(criterion(self(batch_x_val), batch_y_val).item())
            self.train()  # back to training mode for the next epoch

            history["train_loss"].append(
                float(np.mean(train_losses)) if train_losses else float("nan")
            )
            history["val_loss"].append(
                float(np.mean(val_losses)) if val_losses else float("nan")
            )

        return history

    def predict(self, test_dataloader, device):
        """Run the model over `test_dataloader` without gradients.

        Args:
            test_dataloader: Iterable yielding (inputs, targets) pairs.
            device: Torch device the batches are moved to.

        Returns:
            Tuple (y_pred, y_true) of flattened 1-D numpy arrays; both are
            empty when the loader yields no batches.
        """
        self.eval()
        all_preds = []
        all_targets = []

        with torch.no_grad():
            for batch_x_test, batch_y_test in test_dataloader:
                batch_x_test = self._to_float_tensor(batch_x_test, device)
                batch_y_test = self._to_float_tensor(batch_y_test, device)

                output = self(batch_x_test)
                all_preds.append(output.cpu().numpy().flatten())
                all_targets.append(batch_y_test.cpu().numpy().flatten())

        if not all_preds:
            # np.concatenate raises on an empty list; return empty results instead.
            empty = np.empty(0, dtype=np.float32)
            return empty, empty.copy()

        y_pred = np.concatenate(all_preds)
        y_true = np.concatenate(all_targets)
        return y_pred, y_true
305+
]
306+
},
307+
{
308+
"cell_type": "markdown",
309+
"metadata": {},
310+
"source": [
311+
"#### Training model"
312+
]
313+
},
314+
{
315+
"cell_type": "code",
316+
"execution_count": 5,
317+
"metadata": {},
318+
"outputs": [
319+
{
320+
"name": "stderr",
321+
"output_type": "stream",
322+
"text": [
323+
"100%|██████████| 283/283 [02:08<00:00, 2.21it/s]\n"
324+
]
325+
}
326+
],
327+
"source": [
# Train one model per time series and score it on that series' test split.
results = []
for ts_id in tqdm.tqdm(dataset.get_data_about_set(about='train')['ts_ids']):
    model = SimpleLSTM().to(device)
    model.fit(
        dataset.get_train_dataloader(ts_id),
        dataset.get_val_dataloader(ts_id),
        n_epochs=5,
        device=device,
    )
    y_pred, y_true = model.predict(
        dataset.get_test_dataloader(ts_id),
        device=device,
    )

    # mean_squared_error returns the MSE; take the square root so the value
    # matches the "RMSE" label it is reported under.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    results.append(rmse)
344+
]
345+
},
346+
{
347+
"cell_type": "markdown",
348+
"metadata": {},
349+
"source": [
350+
"#### Final prediction scores on test set"
351+
]
352+
},
353+
{
354+
"cell_type": "code",
355+
"execution_count": 6,
356+
"metadata": {},
357+
"outputs": [
358+
{
359+
"name": "stdout",
360+
"output_type": "stream",
361+
"text": [
362+
"Mean RMSE: 0.082187\n",
363+
"Std RMSE: 0.146893\n"
364+
]
365+
}
366+
],
367+
"source": [
368+
"print(f\"Mean RMSE: {np.mean(results):.6f}\")\n",
369+
"print(f\"Std RMSE: {np.std(results):.6f}\") "
370+
]
371+
}
372+
],
373+
"metadata": {
374+
"kernelspec": {
375+
"display_name": ".venv",
376+
"language": "python",
377+
"name": "python3"
378+
},
379+
"language_info": {
380+
"codemirror_mode": {
381+
"name": "ipython",
382+
"version": 3
383+
},
384+
"file_extension": ".py",
385+
"mimetype": "text/x-python",
386+
"name": "python",
387+
"nbconvert_exporter": "python",
388+
"pygments_lexer": "ipython3",
389+
"version": "3.12.3"
390+
}
391+
},
392+
"nbformat": 4,
393+
"nbformat_minor": 2
394+
}

0 commit comments

Comments
 (0)