Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
196 changes: 187 additions & 9 deletions nbs/common.base_model.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@
" windows_batch_size: int,\n",
" inference_windows_batch_size: Union[int, None],\n",
" start_padding_enabled: bool,\n",
" training_data_availability_threshold: Union[float, List[float]] = 0.0,\n",
" n_series: Union[int, None] = None,\n",
" n_samples: Union[int, None] = 100,\n",
" h_train: int = 1,\n",
Expand Down Expand Up @@ -358,6 +359,28 @@
" else:\n",
" self.inference_windows_batch_size = inference_windows_batch_size\n",
"\n",
" # Filtering training windows by available sample fractions\n",
" if isinstance(training_data_availability_threshold, int):\n",
" raise ValueError(\"training_data_availability_threshold cannot be an integer - must be a float\")\n",
" elif isinstance(training_data_availability_threshold, float):\n",
" if training_data_availability_threshold < 0.0 or training_data_availability_threshold > 1.0:\n",
" raise ValueError(f\"training_data_availability_threshold must be between 0.0 and 1.0, got {training_data_availability_threshold}\")\n",
" self.min_insample_fraction = training_data_availability_threshold\n",
" self.min_outsample_fraction = training_data_availability_threshold\n",
" elif isinstance(training_data_availability_threshold, (list, tuple)) and len(training_data_availability_threshold) == 2:\n",
" for i, value in enumerate(training_data_availability_threshold):\n",
" if isinstance(value, int):\n",
" raise ValueError(f\"training_data_availability_threshold[{i}] cannot be an integer - must be a float\")\n",
" if not isinstance(value, float):\n",
" raise ValueError(f\"training_data_availability_threshold[{i}] must be a float\")\n",
" if value < 0.0 or value > 1.0:\n",
" raise ValueError(f\"training_data_availability_threshold[{i}] must be between 0.0 and 1.0, got {value}\")\n",
" \n",
" self.min_insample_fraction = training_data_availability_threshold[0]\n",
" self.min_outsample_fraction = training_data_availability_threshold[1]\n",
" else:\n",
" raise ValueError(\"training_data_availability_threshold must be a float or a list/tuple of two floats\")\n",
"\n",
" # Optimization \n",
" self.learning_rate = learning_rate\n",
" self.max_steps = max_steps\n",
Expand Down Expand Up @@ -674,16 +697,20 @@
" windows = windows.flatten(0, 1)\n",
" windows = windows.unsqueeze(-1)\n",
"\n",
" # Sample and Available conditions\n",
" available_idx = temporal_cols.get_loc('available_mask') \n",
" available_condition = windows[:, :self.input_size, available_idx]\n",
" available_condition = torch.sum(available_condition, axis=(1, -1)) # Sum over time & series dimension\n",
" final_condition = (available_condition > 0)\n",
" \n",
" # Calculate minimum required available points based on fractions\n",
" min_insample_points = max(1, int(self.input_size * self.min_insample_fraction))\n",
" min_outsample_points = max(1, int(self.h * self.min_outsample_fraction))\n",
"\n",
" # Sample based on available conditions\n",
" available_idx = temporal_cols.get_loc(\"available_mask\")\n",
" available_condition = windows[:, : self.input_size, available_idx]\n",
" available_condition = torch.sum(available_condition, axis=(1, -1)) # Sum over time & series dimension\n",
" final_condition = available_condition >= min_insample_points\n",
"\n",
" if self.h > 0:\n",
" sample_condition = windows[:, self.input_size:, available_idx]\n",
" sample_condition = torch.sum(sample_condition, axis=(1, -1)) # Sum over time & series dimension\n",
" final_condition = (sample_condition > 0) & (available_condition > 0)\n",
" sample_condition = windows[:, self.input_size :, available_idx]\n",
" sample_condition = torch.sum(sample_condition, axis=(1, -1)) # Sum over time & series dimension\n",
" final_condition = (sample_condition >= min_outsample_points) & (available_condition >= min_insample_points)\n",
" \n",
" windows = windows[final_condition]\n",
"\n",
Expand Down Expand Up @@ -1426,6 +1453,157 @@
" fcsts = torch.vstack(fcsts)\n",
" return tensor_to_numpy(fcsts) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b10c68fe",
"metadata": {},
"outputs": [],
"source": [
"#| hide\n",
"# Test window filtering\n",
"\n",
"def test_sample_fractions_parameter():\n",
" \"\"\"Test the training_data_availability_threshold parameter filtering logic.\"\"\"\n",
" \n",
" # Mock parameters\n",
" input_size = 10 # Smaller for easier testing\n",
" h = 5\n",
" \n",
" # Create mock temporal data with known availability pattern\n",
" # Shape: [1, 2, time_length] where dim 1 has [y_values, available_mask]\n",
" time_length = 20\n",
" temporal = torch.zeros(1, 2, time_length)\n",
" \n",
" # Set y values\n",
" temporal[0, 0, :] = torch.arange(time_length) # y values: [0, 1, 2, ..., 19]\n",
" \n",
" # Create availability pattern: only positions 5-14 are available (10 points)\n",
" available_mask = torch.zeros(time_length)\n",
" available_mask[5:15] = 1.0\n",
" temporal[0, 1, :] = available_mask # available_mask\n",
" \n",
" # Apply padding (input_size - 1 = 9 zeros at start)\n",
" padding = torch.zeros(1, 2, 9)\n",
" temporal_padded = torch.cat([padding, temporal, torch.zeros(1, 2, h)], dim=2)\n",
" \n",
" # Create windows using unfold\n",
" window_size = input_size + h # 10 + 5 = 15\n",
" windows = temporal_padded.unfold(dimension=-1, size=window_size, step=1)\n",
" # Reshape to [n_windows, window_size, n_channels, 1]\n",
" windows = windows.squeeze(0).permute(1, 2, 0).unsqueeze(-1)\n",
" \n",
" available_idx = 1 # Index of available_mask\n",
" \n",
" # Test different sample_fractions values\n",
" test_cases = [\n",
" 0.0, # Default behavior - allow 1 valid point\n",
" 0.2, # Require 20% valid points\n",
" 0.5, # Require 50% valid points\n",
" [0.1, 0.8], # 10% insample, 80% outsample\n",
" [0.5, 0.2], # 50% insample, 20% outsample\n",
" ]\n",
" \n",
" for sample_fractions in test_cases: \n",
" # Process sample_fractions parameter\n",
" if isinstance(sample_fractions, float):\n",
" min_insample_fraction = float(sample_fractions)\n",
" min_outsample_fraction = float(sample_fractions)\n",
" elif isinstance(sample_fractions, (list, tuple)) and len(sample_fractions) == 2:\n",
" min_insample_fraction = float(sample_fractions[0])\n",
" min_outsample_fraction = float(sample_fractions[1])\n",
" \n",
" # Calculate minimum required points\n",
" min_insample_points = max(1, int(input_size * min_insample_fraction))\n",
" min_outsample_points = max(1, int(h * min_outsample_fraction))\n",
" \n",
" # Apply filtering logic\n",
" available_condition = windows[:, :input_size, available_idx, 0]\n",
" available_condition = torch.sum(available_condition, axis=1)\n",
" \n",
" sample_condition = windows[:, input_size:, available_idx, 0]\n",
" sample_condition = torch.sum(sample_condition, axis=1)\n",
" \n",
" final_condition = (sample_condition >= min_outsample_points) & (available_condition >= min_insample_points)\n",
" \n",
" # Verify some specific cases\n",
" if sample_fractions == 0.0:\n",
" # Should allow windows with 1 valid point\n",
" assert final_condition.sum() > 0, \"Default behavior should allow some windows\"\n",
" \n",
" elif sample_fractions == 0.5:\n",
" # Should require 50% valid points\n",
" expected_insample = max(1, int(input_size * 0.5)) # 50% of 10 = 5\n",
" expected_outsample = max(1, int(h * 0.5)) # 50% of 5 = 2 (as integer)\n",
" \n",
" # Check that filtered windows actually meet criteria\n",
" for i, is_valid in enumerate(final_condition):\n",
" if is_valid:\n",
" insample_valid = available_condition[i].item()\n",
" outsample_valid = sample_condition[i].item()\n",
" assert insample_valid >= expected_insample, f\"Window {i}: insample {insample_valid} < {expected_insample}\"\n",
" assert outsample_valid >= expected_outsample, f\"Window {i}: outsample {outsample_valid} < {expected_outsample}\"\n",
"\n",
"def test_sample_fractions_edge_cases():\n",
" \"\"\"Test edge cases for sample_fractions parameter.\"\"\"\n",
" \n",
" # Test invalid inputs\n",
" try:\n",
" # Should raise error for list with wrong length\n",
" process_sample_fractions([0.1, 0.2, 0.3])\n",
" assert False, \"Should have raised ValueError for list with wrong length\"\n",
" except ValueError:\n",
" pass\n",
" \n",
" try:\n",
" # Should raise error for invalid type\n",
" process_sample_fractions(\"invalid\")\n",
" assert False, \"Should have raised ValueError for invalid type\"\n",
" except ValueError:\n",
" pass\n",
"\n",
" try:\n",
" # Should raise error: a single-element list has the wrong length (note: the helper does not range-check values)\n",
" process_sample_fractions([1.2])\n",
" assert False, \"Should have raised ValueError for values greater than 1.0\"\n",
" except ValueError:\n",
" pass\n",
" \n",
" # Test valid inputs\n",
" try:\n",
" insample, outsample = process_sample_fractions(0.5)\n",
" assert insample == 0.5 and outsample == 0.5, \"Single float should work\"\n",
" \n",
" insample, outsample = process_sample_fractions([0.2, 0.8])\n",
" assert insample == 0.2 and outsample == 0.8, \"List of two floats should work\"\n",
" \n",
" insample, outsample = process_sample_fractions((0.1, 0.9))\n",
" assert insample == 0.1 and outsample == 0.9, \"Tuple of two floats should work\"\n",
" \n",
" except Exception:\n",
" pass\n",
"\n",
"def process_sample_fractions(sample_fractions):\n",
" \"\"\"Helper function to process sample_fractions parameter.\"\"\"\n",
" if isinstance(sample_fractions, (int, float)):\n",
" return float(sample_fractions), float(sample_fractions)\n",
" elif isinstance(sample_fractions, (list, tuple)) and len(sample_fractions) == 2:\n",
" return float(sample_fractions[0]), float(sample_fractions[1])\n",
" else:\n",
" raise ValueError(\"sample_fractions must be a float or a list/tuple of two floats\")\n",
"\n",
"test_sample_fractions_parameter()\n",
"test_sample_fractions_edge_cases()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2440f6f2",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
20 changes: 20 additions & 0 deletions nbs/core.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -3676,6 +3676,26 @@
" 'LSTM', 'LSTM1', 'LSTM1-median', 'LSTM2_ql0.5', 'TSMixer', 'TSMixer1',\n",
" 'TSMixer1-median', 'TSMixer2_ql0.5']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c49c8651",
"metadata": {},
"outputs": [],
"source": [
"#| hide\n",
"# test training_data_availability_threshold\n",
"\n",
"models = [\n",
" NHITS(h=12, input_size=12, training_data_availability_threshold=0.1, max_steps=2),\n",
" LSTM(h=12, input_size=12, recurrent=True, training_data_availability_threshold=[0.1, 0.1], max_steps=2, accelerator='cpu'),\n",
" TSMixer(h=12, input_size=12, n_series=2, training_data_availability_threshold=0.5, max_steps=2)\n",
"]\n",
"\n",
"nf = NeuralForecast(models=models, freq=\"M\")\n",
"nf.fit(AirPassengersPanel_train)"
]
}
],
"metadata": {
Expand Down
3 changes: 3 additions & 0 deletions nbs/models.autoformer.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,7 @@
" `windows_batch_size`: int=1024, number of windows to sample in each training batch, default uses all.<br>\n",
" `inference_windows_batch_size`: int=1024, number of windows to sample in each inference batch.<br>\n",
" `start_padding_enabled`: bool=False, if True, the model will pad the time series with zeros at the beginning, by input size.<br>\n",
" `training_data_availability_threshold`: Union[float, List[float]]=0.0, minimum fraction of valid data points required for training windows. Single float applies to both insample and outsample; list of two floats specifies [insample_fraction, outsample_fraction]. Default 0.0 allows windows with only 1 valid data point (current behavior).<br>\n",
" `scaler_type`: str='robust', type of scaler for temporal inputs normalization see [temporal scalers](https://nixtla.github.io/neuralforecast/common.scalers.html).<br>\n",
" `random_seed`: int=1, random_seed for pytorch initializer and numpy generators.<br>\n",
" `drop_last_loader`: bool=False, if True `TimeSeriesDataLoader` drops last non-full batch.<br>\n",
Expand Down Expand Up @@ -508,6 +509,7 @@
" windows_batch_size = 1024,\n",
" inference_windows_batch_size = 1024,\n",
" start_padding_enabled = False,\n",
" training_data_availability_threshold = 0.0,\n",
" step_size: int = 1,\n",
" scaler_type: str = 'identity',\n",
" random_seed: int = 1,\n",
Expand Down Expand Up @@ -537,6 +539,7 @@
" windows_batch_size=windows_batch_size,\n",
" inference_windows_batch_size=inference_windows_batch_size,\n",
" start_padding_enabled = start_padding_enabled,\n",
" training_data_availability_threshold = training_data_availability_threshold,\n",
" step_size=step_size,\n",
" scaler_type=scaler_type,\n",
" random_seed=random_seed,\n",
Expand Down
3 changes: 3 additions & 0 deletions nbs/models.bitcn.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@
" `windows_batch_size`: int=1024, number of windows to sample in each training batch, default uses all.<br>\n",
" `inference_windows_batch_size`: int=1024, number of windows to sample in each inference batch, -1 uses all.<br>\n",
" `start_padding_enabled`: bool=False, if True, the model will pad the time series with zeros at the beginning, by input size.<br>\n",
" `training_data_availability_threshold`: Union[float, List[float]]=0.0, minimum fraction of valid data points required for training windows. Single float applies to both insample and outsample; list of two floats specifies [insample_fraction, outsample_fraction]. Default 0.0 allows windows with only 1 valid data point (current behavior).<br>\n",
" `step_size`: int=1, step size between each window of temporal data.<br>\n",
" `scaler_type`: str='identity', type of scaler for temporal inputs normalization see [temporal scalers](https://nixtla.github.io/neuralforecast/common.scalers.html).<br>\n",
" `random_seed`: int=1, random_seed for pytorch initializer and numpy generators.<br>\n",
Expand Down Expand Up @@ -219,6 +220,7 @@
" windows_batch_size = 1024,\n",
" inference_windows_batch_size = 1024,\n",
" start_padding_enabled = False,\n",
" training_data_availability_threshold = 0.0,\n",
" step_size: int = 1,\n",
" scaler_type: str = 'identity',\n",
" random_seed: int = 1,\n",
Expand Down Expand Up @@ -249,6 +251,7 @@
" windows_batch_size=windows_batch_size,\n",
" inference_windows_batch_size=inference_windows_batch_size,\n",
" start_padding_enabled=start_padding_enabled,\n",
" training_data_availability_threshold = training_data_availability_threshold,\n",
" step_size=step_size,\n",
" scaler_type=scaler_type,\n",
" random_seed=random_seed,\n",
Expand Down
3 changes: 3 additions & 0 deletions nbs/models.deepar.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@
" `windows_batch_size`: int=1024, number of windows to sample in each training batch, default uses all.<br>\n",
" `inference_windows_batch_size`: int=-1, number of windows to sample in each inference batch, -1 uses all.<br>\n",
" `start_padding_enabled`: bool=False, if True, the model will pad the time series with zeros at the beginning, by input size.<br>\n",
" `training_data_availability_threshold`: Union[float, List[float]]=0.0, minimum fraction of valid data points required for training windows. Single float applies to both insample and outsample; list of two floats specifies [insample_fraction, outsample_fraction]. Default 0.0 allows windows with only 1 valid data point (current behavior).<br>\n",
" `step_size`: int=1, step size between each window of temporal data.<br>\n",
" `scaler_type`: str='identity', type of scaler for temporal inputs normalization see [temporal scalers](https://nixtla.github.io/neuralforecast/common.scalers.html).<br>\n",
" `random_seed`: int, random_seed for pytorch initializer and numpy generators.<br>\n",
Expand Down Expand Up @@ -227,6 +228,7 @@
" windows_batch_size: int = 1024,\n",
" inference_windows_batch_size: int = -1,\n",
" start_padding_enabled = False,\n",
" training_data_availability_threshold = 0.0,\n",
" step_size: int = 1,\n",
" scaler_type: str = 'identity',\n",
" random_seed: int = 1,\n",
Expand Down Expand Up @@ -262,6 +264,7 @@
" windows_batch_size=windows_batch_size,\n",
" inference_windows_batch_size=inference_windows_batch_size,\n",
" start_padding_enabled=start_padding_enabled,\n",
" training_data_availability_threshold = training_data_availability_threshold,\n",
" step_size=step_size,\n",
" scaler_type=scaler_type,\n",
" random_seed=random_seed,\n",
Expand Down
Loading