diff --git a/build/160.json b/build/160.json new file mode 100644 index 00000000..4b4ccad7 --- /dev/null +++ b/build/160.json @@ -0,0 +1,66 @@ +{ + "id": "160", + "title": "Mixed Precision Training", + "difficulty": "medium", + "category": "Machine Learning", + "video": "", + "likes": "0", + "dislikes": "0", + "contributor": [ + { + "profile_link": "https://github.com/komaksym", + "name": "komaksym" + } + ], + "description": "Write a Python class to implement Mixed Precision Training that uses both float32 and float16 data types to optimize memory usage and speed. Your class should have an `__init__(self, loss_scale=1024.0)` method to initialize with loss scaling factor. Implement `forward(self, weights, inputs, targets)` to perform forward pass with float16 computation and return Mean Squared Error (MSE) loss (scaled) in float32, and `backward(self, gradients)` to unscale gradients and check for overflow. Use float16 for computations but float32 for gradient accumulation. Return gradients as float32 and set them to zero if overflow is detected. Only use NumPy.", + "learn_section": "# **Mixed Precision Training**\n## **1. Definition**\nMixed Precision Training is a **deep learning optimization technique** that uses both **float16** (half precision) and **float32** (single precision) data types during training to reduce memory usage and increase training speed while maintaining model accuracy.\nThe technique works by:\n- **Using float16 for forward pass computations** to save memory and increase speed\n- **Using float32 for gradient accumulation** to maintain numerical precision\n- **Applying loss scaling** to prevent gradient underflow in float16\n---\n## **2. Key Components**\n### **Mean Squared Error (MSE) Loss**\nThe loss function must be computed as Mean Squared Error:\n$$\n\\text{MSE} = \\frac{1}{n} \\sum_{i=1}^{n} (y_i - \\hat{y}_i)^2\n$$\nwhere $y_i$ is the target and $\\hat{y}_i$ is the prediction for sample $i$.\n\n### **Loss Scaling**\nTo prevent gradient underflow in float16, gradients are scaled up during the forward pass:\n$$\n\\text{scaled\\_loss} = \\text{MSE} \\times \\text{scale\\_factor}\n$$\nThen unscaled during backward pass:\n$$\n\\text{gradient} = \\frac{\\text{scaled\\_gradient}}{\\text{scale\\_factor}}\n$$\n### **Overflow Detection**\nCheck for invalid gradients (NaN or Inf) that indicate numerical overflow:\n$$\n\\text{overflow} = \\text{any}(\\text{isnan}(\\text{gradients}) \\text{ or } \\text{isinf}(\\text{gradients}))\n$$\n---\n## **3. Precision Usage**\n- **float16**: Forward pass computations, activations, temporary calculations\n- **float32**: Gradient accumulation, parameter updates, loss scaling\n- **Automatic casting**: Convert between precisions as needed\n- **Loss computation**: Use MSE as the loss function before scaling\n---\n## **4. 
Benefits and Applications**\n- **Memory Efficiency**: Reduces memory usage by ~50% for activations\n- **Speed Improvement**: Faster computation on modern GPUs with Tensor Cores\n- **Training Stability**: Loss scaling prevents gradient underflow\n- **Model Accuracy**: Maintains comparable accuracy to full precision training\nCommon in training large neural networks where memory is a constraint and speed is critical.\n---", + "starter_code": "import numpy as np\n\nclass MixedPrecision:\n def __init__(self, loss_scale=1024.0):\n # Initialize loss scaling factor\n pass\n \n def forward(self, weights, inputs, targets):\n # Perform forward pass with float16, return scaled loss as float32\n pass\n \n def backward(self, gradients):\n # Unscale gradients and check for overflow, return as float32\n pass", + "solution": "import numpy as np\n\nclass MixedPrecision:\n def __init__(self, loss_scale=1024.0):\n self.loss_scale = loss_scale\n\n def forward(self, weights, inputs, targets):\n # Convert ALL inputs to float16 for computation (regardless of input dtype)\n weights_fp16 = weights.astype(np.float16)\n inputs_fp16 = inputs.astype(np.float16)\n targets_fp16 = targets.astype(np.float16)\n\n # Simple forward pass: linear model + MSE loss\n predictions = np.dot(inputs_fp16, weights_fp16)\n loss = np.mean((targets_fp16 - predictions) ** 2)\n\n # Scale loss and convert back to float32 (Python float)\n scaled_loss = float(loss) * self.loss_scale\n return scaled_loss\n\n def backward(self, gradients):\n # Convert gradients to float32 for precision (regardless of input dtype)\n gradients_fp32 = gradients.astype(np.float32)\n\n # Check for overflow (NaN or Inf)\n overflow = np.any(np.isnan(gradients_fp32)) or np.any(np.isinf(gradients_fp32))\n\n if overflow:\n # Return zero gradients if overflow detected (must be float32)\n return np.zeros_like(gradients_fp32, dtype=np.float32)\n\n # Unscale gradients (ensure result is float32)\n unscaled_gradients = gradients_fp32 / self.loss_scale\n return unscaled_gradients.astype(np.float32)", + "example": { + "input": "import numpy as np\nmp = MixedPrecision(loss_scale=1024.0)\nweights = np.array([0.5, -0.3], dtype=np.float32)\ninputs = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)\ntargets = np.array([1.0, 0.0], dtype=np.float32)\nloss = mp.forward(weights, inputs, targets)\nprint(f\"Loss: {loss:.4f}\")\nprint(f\"Loss dtype: {type(loss).__name__}\")\ngrads = np.array([512.0, -256.0], dtype=np.float32)\nresult = mp.backward(grads)\nprint(f\"Gradients: {result}\")\nprint(f\"Grad dtype: {result.dtype}\")", + "output": "Loss: 665.0000\nLoss dtype: float\nGradients: [0.5 -0.25]\nGrad dtype: float32", + "reasoning": "Forward pass converts inputs to float16, computes loss, then scales and returns as Python float (float32). Backward converts gradients to float32 and unscales. Final gradients must be float32 type." 
+ }, + "test_cases": [ + { + "test": "import numpy as np\nmp = MixedPrecision(loss_scale=1024.0)\nweights = np.array([0.5, -0.3], dtype=np.float32)\ninputs = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)\ntargets = np.array([1.0, 0.0], dtype=np.float32)\nloss = mp.forward(weights, inputs, targets)\nprint(f\"Loss: {loss:.4f}\")\nprint(f\"Loss dtype: {type(loss).__name__}\")", + "expected_output": "Loss: 665.0000\nLoss dtype: float" + }, + { + "test": "import numpy as np\nmp = MixedPrecision(loss_scale=1024.0)\ngrads = np.array([512.0, -256.0], dtype=np.float32)\nresult = mp.backward(grads)\nprint(f\"Gradients: {result}\")\nprint(f\"Grad dtype: {result.dtype}\")", + "expected_output": "Gradients: [ 0.5 -0.25]\nGrad dtype: float32" + }, + { + "test": "import numpy as np\nmp = MixedPrecision(loss_scale=512.0)\nweights = np.array([1.0, 0.5], dtype=np.float64)\ninputs = np.array([[2.0, 1.0]], dtype=np.float64)\ntargets = np.array([3.0], dtype=np.float64)\nloss = mp.forward(weights, inputs, targets)\nprint(f\"Loss: {loss:.1f}\")\nprint(f\"Loss dtype: {type(loss).__name__}\")", + "expected_output": "Loss: 128.0\nLoss dtype: float" + }, + { + "test": "import numpy as np\nmp = MixedPrecision(loss_scale=512.0)\ngrads = np.array([1024.0, 512.0], dtype=np.float16)\nresult = mp.backward(grads)\nprint(f\"Gradients: [{result[0]:.0f} {result[1]:.0f}]\")\nprint(f\"Grad dtype: {result.dtype}\")", + "expected_output": "Gradients: [2 1]\nGrad dtype: float32" + }, + { + "test": "import numpy as np\nmp = MixedPrecision(loss_scale=100.0)\nweights = np.array([0.1, 0.2], dtype=np.float32)\ninputs = np.array([[1.0, 1.0]], dtype=np.float32)\ntargets = np.array([0.5], dtype=np.float32)\nloss = mp.forward(weights, inputs, targets)\nprint(f\"Loss: {loss:.1f}\")\nprint(f\"Loss dtype: {type(loss).__name__}\")", + "expected_output": "Loss: 4.0\nLoss dtype: float" + }, + { + "test": "import numpy as np\nmp = MixedPrecision(loss_scale=100.0)\ngrads = np.array([200.0, 100.0], dtype=np.float64)\nresult = mp.backward(grads)\nprint(f\"Gradients: [{result[0]:.0f} {result[1]:.0f}]\")\nprint(f\"Grad dtype: {result.dtype}\")", + "expected_output": "Gradients: [2 1]\nGrad dtype: float32" + }, + { + "test": "import numpy as np\nmp = MixedPrecision(loss_scale=2048.0)\nweights = np.array([0.25], dtype=np.float64)\ninputs = np.array([[4.0]], dtype=np.float64)\ntargets = np.array([2.0], dtype=np.float64)\nloss = mp.forward(weights, inputs, targets)\nprint(f\"Loss: {loss:.1f}\")\nprint(f\"Loss dtype: {type(loss).__name__}\")", + "expected_output": "Loss: 2048.0\nLoss dtype: float" + }, + { + "test": "import numpy as np\nmp = MixedPrecision(loss_scale=2048.0)\ngrads = np.array([np.nan], dtype=np.float16)\nresult = mp.backward(grads)\nprint(f\"Gradients: [{result[0]:.0f}]\")\nprint(f\"Grad dtype: {result.dtype}\")", + "expected_output": "Gradients: [0]\nGrad dtype: float32" + }, + { + "test": "import numpy as np\nmp = MixedPrecision(loss_scale=256.0)\nweights = np.array([1.0], dtype=np.float16)\ninputs = np.array([[2.0]], dtype=np.float16)\ntargets = np.array([3.0], dtype=np.float16)\nloss = mp.forward(weights, inputs, targets)\nprint(f\"Loss: {loss:.1f}\")\nprint(f\"Loss dtype: {type(loss).__name__}\")", + "expected_output": "Loss: 256.0\nLoss dtype: float" + }, + { + "test": "import numpy as np\nmp = MixedPrecision(loss_scale=256.0)\ngrads = np.array([np.inf], dtype=np.float64)\nresult = mp.backward(grads)\nprint(f\"Gradients: [{result[0]:.0f}]\")\nprint(f\"Grad dtype: {result.dtype}\")", + "expected_output": "Gradients: [0]\nGrad 
dtype: float32" + } + ] +} \ No newline at end of file diff --git a/build/175.json b/build/175.json new file mode 100644 index 00000000..c3c87229 --- /dev/null +++ b/build/175.json @@ -0,0 +1,46 @@ +{ + "id": "175", + "title": "Gaussian Process for Regression", + "difficulty": "medium", + "category": "Machine Learning", + "video": "", + "likes": "0", + "dislikes": "0", + "contributor": [ + { + "profile_link": "https://github.com/Coder1010ayush", + "name": "Ayush" + } + ], + "description": "## Problem\n\nProblem Statement: Task is to implement GaussianProcessRegression class which is a guassian process model for prediction regression problems.", + "learn_section": "# **Gaussian Processes (GP): From-Scratch Regression Example**\n\n## **1. What’s a Gaussian Process?**\nA **Gaussian Process** defines a distribution over functions \\( f(\\cdot) \\). For any finite set of inputs \\(X=\\{x_i\\}_{i=1}^n\\), the function values \\(f(X)\\) follow a multivariate normal:\n\n\\[\nf(X) \\sim \\mathcal{N}\\big(0,\\; K(X,X)\\big),\n\\]\n\nwhere \\(K\\) is a **kernel** (covariance) function encoding similarity between inputs. With noisy targets \\(y=f(X)+\\varepsilon,\\; \\varepsilon\\sim\\mathcal{N}(0,\\sigma_n^2 I)\\), GP regression yields a closed-form posterior predictive mean and variance at new points \\(X_*\\).\n\n---\n\n## **2. The Implementation at a Glance**\nThe provided code builds a minimal yet complete GP regression stack:\n\n- **Kernels implemented**\n - Radial Basis Function (RBF / Squared Exponential)\n - Matérn (\\(\\nu=0.5, 1.5, 2.5\\), or general \\(\\nu\\))\n - Periodic\n - Linear\n - Rational Quadratic\n- **Core GP classes**\n - `_GaussianProcessBase`: kernel selection & covariance matrix computation\n - `GaussianProcessRegression`:\n - `fit`: builds \\(K\\), does **Cholesky decomposition**, solves \\(\\alpha\\)\n - `predict`: returns posterior mean & variance\n - `log_marginal_likelihood`: computes GP evidence\n - `optimize_hyperparameters`: basic optimizer (for RBF hyperparams)\n\n---\n\n## **3. Kernel Cheat-Sheet**\nLet \\(x, x'\\in\\mathbb{R}^d\\), \\(r=\\lVert x-x'\\rVert\\).\n\n- **RBF (SE):** \n \\[\n k_{\\text{RBF}}(x,x')=\\sigma^2\\exp\\!\\left(-\\tfrac{1}{2}\\tfrac{r^2}{\\ell^2}\\right)\n \\]\n\n- **Matérn (\\(\\nu=1.5\\)):** \n \\[\n k(x,x')=\\Big(1+\\tfrac{\\sqrt{3}\\,r}{\\ell}\\Big)\\exp\\!\\Big(-\\tfrac{\\sqrt{3}\\,r}{\\ell}\\Big)\n \\]\n\n- **Periodic:** \n \\[\n k(x,x')=\\sigma^2\\exp\\!\\left(-\\tfrac{2}{\\ell^2}\\sin^2\\!\\Big(\\tfrac{\\pi r}{p}\\Big)\\right)\n \\]\n\n- **Linear:** \n \\[\n k(x,x')=\\sigma_b^2+\\sigma_v^2\\,x^\\top x'\n \\]\n\n- **Rational Quadratic:** \n \\[\n k(x,x')=\\sigma^2\\Big(1+\\tfrac{r^2}{2\\alpha \\ell^2}\\Big)^{-\\alpha}\n \\]\n\n---\n\n## **4. GP Regression Mechanics**\n### Training\n1. Build covariance: \n \\(K = K(X,X) + \\sigma_n^2 I\\)\n2. Cholesky factorization: \n \\(K=LL^\\top\\)\n3. Solve \\(\\alpha\\): \n \\(L L^\\top \\alpha = y\\)\n\n### Prediction\nAt new inputs \\(X_*\\):\n- \\(K_* = K(X, X_*)\\), \\(K_{**} = K(X_*, X_*)\\)\n- **Mean:** \n \\(\\mu_* = K_*^\\top \\alpha\\)\n- **Covariance:** \n \\(\\Sigma_* = K_{**} - V^\\top V,\\;\\; V = L^{-1}K_*\\)\n\n### Model Selection\n- **Log Marginal Likelihood (LML):** \n \\[\n \\log p(y\\mid X)= -\\tfrac{1}{2}y^\\top \\alpha - \\sum\\nolimits_i \\log L_{ii} - \\tfrac{n}{2}\\log(2\\pi)\n \\]\n\n---\n\n## **5. 
Worked Example (Linear Kernel)**\n\n```python\nimport numpy as np\ngp = GaussianProcessRegression(kernel='linear',\n kernel_params={'sigma_b': 0.0, 'sigma_v': 1.0},\n noise=1e-8)\n\nX_train = np.array([[1], [2], [4]])\ny_train = np.array([3, 5, 9]) # y = 2x + 1\ngp.fit(X_train, y_train)\n\nX_test = np.array([[3.0]])\nmu = gp.predict(X_test)\nprint(f\"{mu[0]:.4f}\") # -> 7.0000\n```\n\n\n## **6. When to Use GP Regression**\n\n- **Small-to-medium datasets** where uncertainty estimates are valuable \n- Cases requiring **predictive intervals** (not just point predictions) \n- **Nonparametric modeling** with kernel priors \n- Automatic hyperparameter tuning via **marginal likelihood** \n\n---\n\n## **7. Practical Tips**\n\n- Always add **jitter** (`1e-6`) to the diagonal for numerical stability \n- **Standardize inputs/outputs** before training \n- Be aware: Exact GP has complexity **\\(\\mathcal{O}(n^3)\\)** in time and **\\(\\mathcal{O}(n^2)\\)** in memory \n- Choose kernels to match problem structure: \n - **RBF:** smooth functions \n - **Matérn:** rougher functions \n - **Periodic:** seasonal/cyclical data \n - **Linear:** global linear trends ", + "starter_code": "import math # ---------------------------------------- utf-8 encoding ---------------------------------\n\n# This file contains Gaussian Process implementation.\nimport numpy as np\nimport math\n\n\ndef matern_kernel(x: np.ndarray, x_prime: np.ndarray, length_scale=1.0, nu=1.5):\n pass\n\n\ndef rbf_kernel(x: np.ndarray, x_prime, sigma=1.0, length_scale=1.0):\n pass\n\n\ndef periodic_kernel(\n x: np.ndarray, x_prime: np.ndarray, sigma=1.0, length_scale=1.0, period=1.0\n):\n pass\n\n\ndef linear_kernel(x: np.ndarray, x_prime: np.ndarray, sigma_b=1.0, sigma_v=1.0):\n pass\n\n\ndef rational_quadratic_kernel(\n x: np.ndarray, x_prime: np.ndarray, sigma=1.0, length_scale=1.0, alpha=1.0\n):\n pass\n\n\n# --- BASE CLASS -------------------------------------------------------------\n\n\nclass _GaussianProcessBase:\n def __init__(self, kernel=\"rbf\", noise=1e-5, kernel_params=None):\n pass\n\n def _select_kernel(self, x1, x2):\n \"\"\"Selects and computes the kernel value for two single data points.\"\"\"\n pass\n\n def _compute_covariance(self, X1, X2):\n \"\"\"\n Computes the covariance matrix between two sets of points.\n This method fixes the vectorization bug from the original code.\n \"\"\"\n pass\n\n\n# --- REGRESSION MODEL -------------------------------------------------------\nclass GaussianProcessRegression(_GaussianProcessBase):\n def fit(self, X, y):\n pass\n\n def predict(self, X_test, return_std=False):\n pass\n\n def log_marginal_likelihood(self):\n pass\n\n def optimize_hyperparameters(self):\n pass", + "solution": "# ---------------------------------------- utf-8 encoding ---------------------------------\n# This file contains Gaussian Process implementation.\nimport numpy as np\nimport math\nfrom scipy.spatial.distance import euclidean\nfrom scipy.special import kv as bessel_kv\nfrom scipy.special import gamma\nfrom scipy.linalg import cholesky, solve_triangular\nfrom scipy.optimize import minimize\nfrom scipy.special import expit, softmax\n\n\n# --- KERNEL FUNCTIONS --------------------------------------------------------\ndef matern_kernel(x: np.ndarray, x_prime: np.ndarray, length_scale=1.0, nu=1.5):\n d = euclidean(x, x_prime)\n if d == 0:\n return 1.0 # Covariance with self is 1 before scaling\n if nu == 0.5:\n return np.exp(-d / length_scale)\n elif nu == 1.5:\n return (1 + np.sqrt(3) * d / length_scale) * 
np.exp(\n -np.sqrt(3) * d / length_scale\n )\n elif nu == 2.5:\n return (\n 1 + np.sqrt(5) * d / length_scale + 5 * d**2 / (3 * length_scale**2)\n ) * np.exp(-np.sqrt(5) * d / length_scale)\n else:\n factor = (2 ** (1 - nu)) / gamma(nu)\n scaled_d = np.sqrt(2 * nu) * d / length_scale\n return factor * (scaled_d**nu) * bessel_kv(nu, scaled_d)\n\n\ndef rbf_kernel(x: np.ndarray, x_prime, sigma=1.0, length_scale=1.0):\n # This is a squared exponential kernel\n\n # Calculate the squared euclidean distance\n sq_norm = np.linalg.norm(x - x_prime) ** 2\n\n # Correctly implement the formula\n return sigma**2 * np.exp(-sq_norm / (2 * length_scale**2))\n\n\ndef periodic_kernel(\n x: np.ndarray, x_prime: np.ndarray, sigma=1.0, length_scale=1.0, period=1.0\n):\n return sigma**2 * np.exp(\n -2 * np.sin(np.pi * np.linalg.norm(x - x_prime) / period) ** 2 / length_scale**2\n )\n\n\ndef linear_kernel(x: np.ndarray, x_prime: np.ndarray, sigma_b=1.0, sigma_v=1.0):\n return sigma_b**2 + sigma_v**2 * np.dot(x, x_prime)\n\n\ndef rational_quadratic_kernel(\n x: np.ndarray, x_prime: np.ndarray, sigma=1.0, length_scale=1.0, alpha=1.0\n):\n return sigma**2 * (\n 1 + np.linalg.norm(x - x_prime) ** 2 / (2 * alpha * length_scale**2)\n ) ** (-alpha)\n\n\n# --- BASE CLASS -------------------------------------------------------------\n\n\nclass _GaussianProcessBase:\n def __init__(self, kernel=\"rbf\", noise=1e-5, kernel_params=None):\n self.kernel_name = kernel\n self.noise = noise\n self.kernel_params = kernel_params if kernel_params else {}\n self.X_train = None\n self.y_train = None\n self.K = None\n\n def _select_kernel(self, x1, x2):\n \"\"\"Selects and computes the kernel value for two single data points.\"\"\"\n if self.kernel_name == \"rbf\":\n return rbf_kernel(x1, x2, **self.kernel_params)\n elif self.kernel_name == \"matern\":\n return matern_kernel(x1, x2, **self.kernel_params)\n elif self.kernel_name == \"periodic\":\n return periodic_kernel(x1, x2, **self.kernel_params)\n elif self.kernel_name == \"linear\":\n return linear_kernel(x1, x2, **self.kernel_params)\n elif self.kernel_name == \"rational_quadratic\":\n return rational_quadratic_kernel(x1, x2, **self.kernel_params)\n else:\n raise ValueError(\n \"Unsupported kernel. 
Choose from ['rbf', 'matern', 'periodic', 'linear', 'rational_quadratic'].\"\n )\n\n def _compute_covariance(self, X1, X2):\n \"\"\"\n Computes the covariance matrix between two sets of points.\n This method fixes the vectorization bug from the original code.\n \"\"\"\n # Ensuring X1 and X2 are 2D arrays\n X1 = np.atleast_2d(X1)\n X2 = np.atleast_2d(X2)\n\n n1, _ = X1.shape\n n2, _ = X2.shape\n K = np.zeros((n1, n2))\n for i in range(n1):\n for j in range(n2):\n K[i, j] = self._select_kernel(X1[i], X2[j])\n return K\n\n\n# --- REGRESSION MODEL -------------------------------------------------------\nclass GaussianProcessRegression(_GaussianProcessBase):\n def fit(self, X, y):\n self.X_train = np.asarray(X)\n self.y_train = np.asarray(y)\n self.K = self._compute_covariance(\n self.X_train, self.X_train\n ) + self.noise * np.eye(len(self.X_train))\n\n # Compute Cholesky decomposition for stable inversion\n self.L = cholesky(self.K, lower=True)\n # alpha = K_inv * y\n self.alpha = solve_triangular(\n self.L.T, solve_triangular(self.L, self.y_train, lower=True)\n )\n\n def predict(self, X_test, return_std=False):\n X_test = np.atleast_2d(X_test)\n K_s = self._compute_covariance(self.X_train, X_test)\n K_ss = self._compute_covariance(X_test, X_test)\n\n # Compute predictive mean\n mu = K_s.T @ self.alpha\n\n # Compute predictive variance\n v = solve_triangular(self.L, K_s, lower=True)\n cov = K_ss - v.T @ v\n\n if return_std:\n return mu, np.sqrt(np.diag(cov))\n return mu\n\n def log_marginal_likelihood(self):\n return (\n -0.5 * (self.y_train.T @ self.alpha)\n - np.sum(np.log(np.diag(self.L)))\n - len(self.X_train) / 2 * np.log(2 * np.pi)\n )\n\n def optimize_hyperparameters(self):\n # NOTE: This is a simplified optimizer for 'rbf' kernel's params.\n def objective(params):\n self.kernel_params = {\n \"length_scale\": np.exp(params[0]),\n \"sigma\": np.exp(params[1]),\n }\n self.fit(self.X_train, self.y_train)\n return -self.log_marginal_likelihood()\n\n init_params = np.log(\n [\n self.kernel_params.get(\"length_scale\", 1.0),\n self.kernel_params.get(\"sigma\", 1.0),\n ]\n )\n res = minimize(\n objective, init_params, method=\"L-BFGS-B\", bounds=[(-5, 5), (-5, 5)]\n )\n\n self.kernel_params = {\n \"length_scale\": np.exp(res.x[0]),\n \"sigma\": np.exp(res.x[1]),\n }\n # Re-fit with optimal hyperparameters\n self.fit(self.X_train, self.y_train)\n print(\"Optimized Hyperparameters:\", self.kernel_params)\n\n\n# if __name__ == \"__main__\":\n# gp = GaussianProcessRegression(\n# kernel=\"rbf\", kernel_params={\"sigma\": 1.0, \"length_scale\": 1.0}, noise=1e-8\n# )\n# X_train = np.array([[0], [2.5], [5.0]])\n# y_train = np.array([1.0, 3.0, 1.5])\n# gp.fit(X_train, y_train)\n# X_test = np.array([[2.5]])\n# mu, std = gp.predict(X_test, return_std=True)\n# print(f\"mu={mu[0]:.4f}, std={std[0]:.4f}\")", + "example": { + "input": "import numpy as np\ngp = GaussianProcessRegression(kernel='linear', kernel_params={'sigma_b': 0.0, 'sigma_v': 1.0}, noise=1e-8)\nX_train = np.array([[1], [2], [4]])\ny_train = np.array([3, 5, 9])\ngp.fit(X_train, y_train)\nX_test = np.array([[3.0]])\nmu = gp.predict(X_test)\nprint(f\"{mu[0]:.4f}\")", + "output": "7.0000", + "reasoning": "A Gaussian Process with a linear kernel is trained on perfectly linear data that follows the function y = 2x + 1. When asked to predict the value at x=3, the model perfectly interpolates the linear function it has learned, resulting in a prediction of 2*3 + 1 = 7. The near-zero noise ensures the prediction is exact." 
+ }, + "test_cases": [ + { + "test": "import numpy as np\ngp = GaussianProcessRegression(kernel='rbf', kernel_params={'sigma': 1.0, 'length_scale': 1.0}, noise=1e-8)\nX_train = np.array([[0], [2.5], [5.0], [7.5], [10.0]])\ny_train = np.sin(X_train).ravel()\ngp.fit(X_train, y_train)\nX_test = np.array([[1.25]])\nmu = gp.predict(X_test)\nprint(f\"{mu[0]:.4f}\")", + "expected_output": "0.2814" + }, + { + "test": "import numpy as np\ngp = GaussianProcessRegression(kernel='rbf', kernel_params={'sigma': 1.0, 'length_scale': 1.0}, noise=1e-8)\nX_train = np.array([[0], [2.5], [5.0], [7.5], [10.0]])\ny_train = np.sin(X_train).ravel()\ngp.fit(X_train, y_train)\nX_test = np.array([[1.25]])\nmu, std = gp.predict(X_test, return_std=True)\nprint(f\"mu={mu[0]:.4f}, std={std[0]:.4f}\")", + "expected_output": "mu=0.2814, std=0.7734" + }, + { + "test": "import numpy as np\ngp = GaussianProcessRegression(kernel='rbf', kernel_params={'sigma': 1.0, 'length_scale': 1.0}, noise=1e-8)\nX_train = np.array([[0], [2.5], [5.0]])\ny_train = np.array([1.0, 3.0, 1.5])\ngp.fit(X_train, y_train)\nX_test = np.array([[2.5]])\nmu, std = gp.predict(X_test, return_std=True)\nprint(f\"mu={mu[0]:.4f}, std={std[0]:.4f}\")", + "expected_output": "mu=3.0000, std=0.0001" + }, + { + "test": "import numpy as np\ngp = GaussianProcessRegression(kernel='linear', kernel_params={'sigma_b': 0.1, 'sigma_v': 1.0}, noise=1e-8)\nX_train = np.array([[1], [2], [4]])\ny_train = np.array([3, 5, 9])\ngp.fit(X_train, y_train)\nX_test = np.array([[3.0]])\nmu = gp.predict(X_test)\nprint(f\"{mu[0]:.4f}\")", + "expected_output": "7.0000" + }, + { + "test": "import numpy as np\ngp = GaussianProcessRegression(kernel='rbf', kernel_params={'sigma': 1.0, 'length_scale': 1.5}, noise=1e-8)\nX_train = np.array([[1, 2], [3, 4], [5, 1]])\ny_train = np.sum(X_train, axis=1)\ngp.fit(X_train, y_train)\nX_test = np.array([[2, 3]])\nmu = gp.predict(X_test)\nprint(f\"{mu[0]:.4f}\")", + "expected_output": "5.5553" + } + ] +} \ No newline at end of file diff --git a/questions/186_guassian_mixture_regression/__pycache__/solution.cpython-310.pyc b/questions/186_guassian_mixture_regression/__pycache__/solution.cpython-310.pyc new file mode 100644 index 00000000..74a21fbd Binary files /dev/null and b/questions/186_guassian_mixture_regression/__pycache__/solution.cpython-310.pyc differ diff --git a/questions/186_guassian_mixture_regression/description.md b/questions/186_guassian_mixture_regression/description.md new file mode 100644 index 00000000..8b8b22c9 --- /dev/null +++ b/questions/186_guassian_mixture_regression/description.md @@ -0,0 +1,3 @@ +## Problem + +Problem Statement: Task is to implement GaussianProcessRegression class which is a guassian process model for prediction regression problems. \ No newline at end of file diff --git a/questions/186_guassian_mixture_regression/example.json b/questions/186_guassian_mixture_regression/example.json new file mode 100644 index 00000000..a9e00f4f --- /dev/null +++ b/questions/186_guassian_mixture_regression/example.json @@ -0,0 +1,5 @@ +{ + "input": "import numpy as np\ngp = GaussianProcessRegression(kernel='linear', kernel_params={'sigma_b': 0.0, 'sigma_v': 1.0}, noise=1e-8)\nX_train = np.array([[1], [2], [4]])\ny_train = np.array([3, 5, 9])\ngp.fit(X_train, y_train)\nX_test = np.array([[3.0]])\nmu = gp.predict(X_test)\nprint(f\"{mu[0]:.4f}\")", + "output": "7.0000", + "reasoning": "A Gaussian Process with a linear kernel is trained on perfectly linear data that follows the function y = 2x + 1. 
When asked to predict the value at x=3, the model perfectly interpolates the linear function it has learned, resulting in a prediction of 2*3 + 1 = 7. The near-zero noise ensures the prediction is exact." +} \ No newline at end of file diff --git a/questions/186_guassian_mixture_regression/learn.md b/questions/186_guassian_mixture_regression/learn.md new file mode 100644 index 00000000..14f6a3df --- /dev/null +++ b/questions/186_guassian_mixture_regression/learn.md @@ -0,0 +1,155 @@ +# **Gaussian Processes (GP): From-Scratch Regression Example** + +## **1. What’s a Gaussian Process?** + +A **Gaussian Process** defines a distribution over functions $f(\cdot)$. +For any finite set of inputs $X = \{x_i\}_{i=1}^n$, the function values $f(X)$ follow a multivariate normal: + +$$ +f(X) \sim \mathcal{N}\big(0,\; K(X,X)\big) +$$ + +where $K$ is a **kernel** (covariance) function encoding similarity between inputs. +With noisy targets $y = f(X) + \varepsilon,\; \varepsilon \sim \mathcal{N}(0,\sigma_n^2 I)$, +GP regression yields a closed-form posterior predictive mean and variance at new points $X_*$. + +--- + +## **2. The Implementation at a Glance** + +The provided code builds a minimal yet complete GP regression stack: + +* **Kernels implemented** + + * Radial Basis Function (RBF / Squared Exponential) + * Matérn ($\nu = 0.5, 1.5, 2.5$, or general $\nu$) + * Periodic + * Linear + * Rational Quadratic + +* **Core GP classes** + + * `_GaussianProcessBase`: kernel selection & covariance matrix computation + * `GaussianProcessRegression`: + + * `fit`: builds $K$, does **Cholesky decomposition**, solves $\alpha$ + * `predict`: returns posterior mean & variance + * `log_marginal_likelihood`: computes GP evidence + * `optimize_hyperparameters`: basic optimizer (for RBF hyperparams) + +--- + +## **3. Kernel Cheat-Sheet** + +Let $x, x' \in \mathbb{R}^d$, $r = \lVert x - x' \rVert$. + +* **RBF (SE):** + $$ + k_{\text{RBF}}(x,x') = \sigma^2 \exp\!\left(-\tfrac{1}{2}\tfrac{r^2}{\ell^2}\right) + $$ + +* **Matérn ($\nu = 1.5$):** + $$ + k(x,x') = \Big(1 + \tfrac{\sqrt{3}\,r}{\ell}\Big)\exp\!\Big(-\tfrac{\sqrt{3}\,r}{\ell}\Big) + $$ + +* **Periodic:** + $$ + k(x,x') = \sigma^2 \exp\!\left(-\tfrac{2}{\ell^2}\sin^2\!\Big(\tfrac{\pi r}{p}\Big)\right) + $$ + +* **Linear:** + $$ + k(x,x') = \sigma_b^2 + \sigma_v^2\,x^\top x' + $$ + +* **Rational Quadratic:** + $$ + k(x,x') = \sigma^2\Big(1 + \tfrac{r^2}{2\alpha \ell^2}\Big)^{-\alpha} + $$ + +--- + +## **4. GP Regression Mechanics** + +### **Training** + +1. Build covariance: + $$ + K = K(X,X) + \sigma_n^2 I + $$ + +2. Cholesky factorization: + $$ + K = L L^\top + $$ + +3. Solve $\alpha$: + $$ + L L^\top \alpha = y + $$ + +### **Prediction** + +At new inputs $X_*$: + +* $K_* = K(X, X_*)$, $K_{**} = K(X_*, X_*)$ + +* **Mean:** + $$ + \mu_* = K_*^\top \alpha + $$ + +* **Covariance:** + $$ + \Sigma_* = K_{**} - V^\top V, \quad V = L^{-1} K_* + $$ + +### **Model Selection** + +* **Log Marginal Likelihood (LML):** + $$ + \log p(y \mid X) = -\tfrac{1}{2} y^\top \alpha - \sum\nolimits_i \log L_{ii} - \tfrac{n}{2}\log(2\pi) + $$ + +--- + +## **5. Worked Example (Linear Kernel)** + +```python +import numpy as np +gp = GaussianProcessRegression(kernel='linear', + kernel_params={'sigma_b': 0.0, 'sigma_v': 1.0}, + noise=1e-8) + +X_train = np.array([[1], [2], [4]]) +y_train = np.array([3, 5, 9]) # y = 2x + 1 +gp.fit(X_train, y_train) + +X_test = np.array([[3.0]]) +mu = gp.predict(X_test) +print(f"{mu[0]:.4f}") # -> 7.0000 +``` + +--- + +## **6. 
When to Use GP Regression** + +* **Small-to-medium datasets** where uncertainty estimates are valuable +* Cases requiring **predictive intervals** (not just point predictions) +* **Nonparametric modeling** with kernel priors +* Automatic hyperparameter tuning via **marginal likelihood** + +--- + +## **7. Practical Tips** + +* Always add **jitter** $10^{-6}$ to the diagonal for numerical stability +* **Standardize inputs/outputs** before training +* Be aware: Exact GP has complexity **$\mathcal{O}(n^3)$** in time and **$\mathcal{O}(n^2)$** in memory +* Choose kernels to match problem structure: + + * **RBF:** smooth functions + * **Matérn:** rougher functions + * **Periodic:** seasonal/cyclical data + * **Linear:** global linear trends diff --git a/questions/186_guassian_mixture_regression/meta.json b/questions/186_guassian_mixture_regression/meta.json new file mode 100644 index 00000000..c74e23b3 --- /dev/null +++ b/questions/186_guassian_mixture_regression/meta.json @@ -0,0 +1,15 @@ +{ + "id": "186", + "title": "Gaussian Process for Regression", + "difficulty": "medium", + "category": "Machine Learning", + "video": "", + "likes": "0", + "dislikes": "0", + "contributor": [ + { + "profile_link": "https://github.com/Coder1010ayush", + "name": "Ayush" + } + ] +} \ No newline at end of file diff --git a/questions/186_guassian_mixture_regression/solution.py b/questions/186_guassian_mixture_regression/solution.py new file mode 100644 index 00000000..11307bf7 --- /dev/null +++ b/questions/186_guassian_mixture_regression/solution.py @@ -0,0 +1,186 @@ +# ---------------------------------------- utf-8 encoding --------------------------------- +# This file contains Gaussian Process implementation. +import numpy as np +import math +from scipy.spatial.distance import euclidean +from scipy.special import kv as bessel_kv +from scipy.special import gamma +from scipy.linalg import cholesky, solve_triangular +from scipy.optimize import minimize +from scipy.special import expit, softmax + + +# --- KERNEL FUNCTIONS -------------------------------------------------------- +def matern_kernel(x: np.ndarray, x_prime: np.ndarray, length_scale=1.0, nu=1.5): + d = euclidean(x, x_prime) + if d == 0: + return 1.0 # Covariance with self is 1 before scaling + if nu == 0.5: + return np.exp(-d / length_scale) + elif nu == 1.5: + return (1 + np.sqrt(3) * d / length_scale) * np.exp( + -np.sqrt(3) * d / length_scale + ) + elif nu == 2.5: + return ( + 1 + np.sqrt(5) * d / length_scale + 5 * d**2 / (3 * length_scale**2) + ) * np.exp(-np.sqrt(5) * d / length_scale) + else: + factor = (2 ** (1 - nu)) / gamma(nu) + scaled_d = np.sqrt(2 * nu) * d / length_scale + return factor * (scaled_d**nu) * bessel_kv(nu, scaled_d) + + +def rbf_kernel(x: np.ndarray, x_prime, sigma=1.0, length_scale=1.0): + # This is a squared exponential kernel + + # Calculate the squared euclidean distance + sq_norm = np.linalg.norm(x - x_prime) ** 2 + + # Correctly implement the formula + return sigma**2 * np.exp(-sq_norm / (2 * length_scale**2)) + + +def periodic_kernel( + x: np.ndarray, x_prime: np.ndarray, sigma=1.0, length_scale=1.0, period=1.0 +): + return sigma**2 * np.exp( + -2 * np.sin(np.pi * np.linalg.norm(x - x_prime) / period) ** 2 / length_scale**2 + ) + + +def linear_kernel(x: np.ndarray, x_prime: np.ndarray, sigma_b=1.0, sigma_v=1.0): + return sigma_b**2 + sigma_v**2 * np.dot(x, x_prime) + + +def rational_quadratic_kernel( + x: np.ndarray, x_prime: np.ndarray, sigma=1.0, length_scale=1.0, alpha=1.0 +): + return sigma**2 * ( + 1 + 
np.linalg.norm(x - x_prime) ** 2 / (2 * alpha * length_scale**2) + ) ** (-alpha) + + +# --- BASE CLASS ------------------------------------------------------------- + + +class _GaussianProcessBase: + def __init__(self, kernel="rbf", noise=1e-5, kernel_params=None): + self.kernel_name = kernel + self.noise = noise + self.kernel_params = kernel_params if kernel_params else {} + self.X_train = None + self.y_train = None + self.K = None + + def _select_kernel(self, x1, x2): + """Selects and computes the kernel value for two single data points.""" + if self.kernel_name == "rbf": + return rbf_kernel(x1, x2, **self.kernel_params) + elif self.kernel_name == "matern": + return matern_kernel(x1, x2, **self.kernel_params) + elif self.kernel_name == "periodic": + return periodic_kernel(x1, x2, **self.kernel_params) + elif self.kernel_name == "linear": + return linear_kernel(x1, x2, **self.kernel_params) + elif self.kernel_name == "rational_quadratic": + return rational_quadratic_kernel(x1, x2, **self.kernel_params) + else: + raise ValueError( + "Unsupported kernel. Choose from ['rbf', 'matern', 'periodic', 'linear', 'rational_quadratic']." + ) + + def _compute_covariance(self, X1, X2): + """ + Computes the covariance matrix between two sets of points. + This method fixes the vectorization bug from the original code. + """ + # Ensuring X1 and X2 are 2D arrays + X1 = np.atleast_2d(X1) + X2 = np.atleast_2d(X2) + + n1, _ = X1.shape + n2, _ = X2.shape + K = np.zeros((n1, n2)) + for i in range(n1): + for j in range(n2): + K[i, j] = self._select_kernel(X1[i], X2[j]) + return K + + +# --- REGRESSION MODEL ------------------------------------------------------- +class GaussianProcessRegression(_GaussianProcessBase): + def fit(self, X, y): + self.X_train = np.asarray(X) + self.y_train = np.asarray(y) + self.K = self._compute_covariance( + self.X_train, self.X_train + ) + self.noise * np.eye(len(self.X_train)) + + # Compute Cholesky decomposition for stable inversion + self.L = cholesky(self.K, lower=True) + # alpha = K_inv * y + self.alpha = solve_triangular( + self.L.T, solve_triangular(self.L, self.y_train, lower=True) + ) + + def predict(self, X_test, return_std=False): + X_test = np.atleast_2d(X_test) + K_s = self._compute_covariance(self.X_train, X_test) + K_ss = self._compute_covariance(X_test, X_test) + + # Compute predictive mean + mu = K_s.T @ self.alpha + + # Compute predictive variance + v = solve_triangular(self.L, K_s, lower=True) + cov = K_ss - v.T @ v + + if return_std: + return mu, np.sqrt(np.diag(cov)) + return mu + + def log_marginal_likelihood(self): + return ( + -0.5 * (self.y_train.T @ self.alpha) + - np.sum(np.log(np.diag(self.L))) + - len(self.X_train) / 2 * np.log(2 * np.pi) + ) + + def optimize_hyperparameters(self): + # NOTE: This is a simplified optimizer for 'rbf' kernel's params. 
+ def objective(params): + self.kernel_params = { + "length_scale": np.exp(params[0]), + "sigma": np.exp(params[1]), + } + self.fit(self.X_train, self.y_train) + return -self.log_marginal_likelihood() + + init_params = np.log( + [ + self.kernel_params.get("length_scale", 1.0), + self.kernel_params.get("sigma", 1.0), + ] + ) + res = minimize( + objective, init_params, method="L-BFGS-B", bounds=[(-5, 5), (-5, 5)] + ) + + self.kernel_params = { + "length_scale": np.exp(res.x[0]), + "sigma": np.exp(res.x[1]), + } + # Re-fit with optimal hyperparameters + self.fit(self.X_train, self.y_train) + + +if __name__ == "__main__": + gp = GaussianProcessRegression( + kernel="linear", kernel_params={"sigma_b": 0.0, "sigma_v": 1.0}, noise=1e-8 + ) + X_train = np.array([[1], [2], [4]]) + y_train = np.array([3, 5, 9]) + gp.fit(X_train, y_train) + X_test = np.array([[3.0]]) + mu = gp.predict(X_test) diff --git a/questions/186_guassian_mixture_regression/starter_code.py b/questions/186_guassian_mixture_regression/starter_code.py new file mode 100644 index 00000000..89bf9c9e --- /dev/null +++ b/questions/186_guassian_mixture_regression/starter_code.py @@ -0,0 +1,63 @@ +import math # ---------------------------------------- utf-8 encoding --------------------------------- + +# This file contains Gaussian Process implementation. +import numpy as np +import math + + +def matern_kernel(x: np.ndarray, x_prime: np.ndarray, length_scale=1.0, nu=1.5): + pass + + +def rbf_kernel(x: np.ndarray, x_prime, sigma=1.0, length_scale=1.0): + pass + + +def periodic_kernel( + x: np.ndarray, x_prime: np.ndarray, sigma=1.0, length_scale=1.0, period=1.0 +): + pass + + +def linear_kernel(x: np.ndarray, x_prime: np.ndarray, sigma_b=1.0, sigma_v=1.0): + pass + + +def rational_quadratic_kernel( + x: np.ndarray, x_prime: np.ndarray, sigma=1.0, length_scale=1.0, alpha=1.0 +): + pass + + +# --- BASE CLASS ------------------------------------------------------------- + + +class _GaussianProcessBase: + def __init__(self, kernel="rbf", noise=1e-5, kernel_params=None): + pass + + def _select_kernel(self, x1, x2): + """Selects and computes the kernel value for two single data points.""" + pass + + def _compute_covariance(self, X1, X2): + """ + Computes the covariance matrix between two sets of points. + This method fixes the vectorization bug from the original code. 
+ """ + pass + + +# --- REGRESSION MODEL ------------------------------------------------------- +class GaussianProcessRegression(_GaussianProcessBase): + def fit(self, X, y): + pass + + def predict(self, X_test, return_std=False): + pass + + def log_marginal_likelihood(self): + pass + + def optimize_hyperparameters(self): + pass diff --git a/questions/186_guassian_mixture_regression/tests.json b/questions/186_guassian_mixture_regression/tests.json new file mode 100644 index 00000000..b2d6014d --- /dev/null +++ b/questions/186_guassian_mixture_regression/tests.json @@ -0,0 +1,22 @@ +[ + { + "test": "import numpy as np\ngp = GaussianProcessRegression(kernel='rbf', kernel_params={'sigma': 1.0, 'length_scale': 1.0}, noise=1e-8)\nX_train = np.array([[0], [2.5], [5.0], [7.5], [10.0]])\ny_train = np.sin(X_train).ravel()\ngp.fit(X_train, y_train)\nX_test = np.array([[1.25]])\nmu = gp.predict(X_test)\nprint(f\"{mu[0]:.4f}\")", + "expected_output": "0.2814" + }, + { + "test": "import numpy as np\ngp = GaussianProcessRegression(kernel='rbf', kernel_params={'sigma': 1.0, 'length_scale': 1.0}, noise=1e-8)\nX_train = np.array([[0], [2.5], [5.0], [7.5], [10.0]])\ny_train = np.sin(X_train).ravel()\ngp.fit(X_train, y_train)\nX_test = np.array([[1.25]])\nmu, std = gp.predict(X_test, return_std=True)\nprint(f\"mu={mu[0]:.4f}, std={std[0]:.4f}\")", + "expected_output": "mu=0.2814, std=0.7734" + }, + { + "test": "import numpy as np\ngp = GaussianProcessRegression(kernel='rbf', kernel_params={'sigma': 1.0, 'length_scale': 1.0}, noise=1e-8)\nX_train = np.array([[0], [2.5], [5.0]])\ny_train = np.array([1.0, 3.0, 1.5])\ngp.fit(X_train, y_train)\nX_test = np.array([[2.5]])\nmu, std = gp.predict(X_test, return_std=True)\nprint(f\"mu={mu[0]:.4f}, std={std[0]:.4f}\")", + "expected_output": "mu=3.0000, std=0.0001" + }, + { + "test": "import numpy as np\ngp = GaussianProcessRegression(kernel='linear', kernel_params={'sigma_b': 0.1, 'sigma_v': 1.0}, noise=1e-8)\nX_train = np.array([[1], [2], [4]])\ny_train = np.array([3, 5, 9])\ngp.fit(X_train, y_train)\nX_test = np.array([[3.0]])\nmu = gp.predict(X_test)\nprint(f\"{mu[0]:.4f}\")", + "expected_output": "7.0000" + }, + { + "test": "import numpy as np\ngp = GaussianProcessRegression(kernel='rbf', kernel_params={'sigma': 1.0, 'length_scale': 1.5}, noise=1e-8)\nX_train = np.array([[1, 2], [3, 4], [5, 1]])\ny_train = np.sum(X_train, axis=1)\ngp.fit(X_train, y_train)\nX_test = np.array([[2, 3]])\nmu = gp.predict(X_test)\nprint(f\"{mu[0]:.4f}\")", + "expected_output": "5.5553" + } +] \ No newline at end of file