scikit-learn-contrib
diff --git a/‎docs/requirements.txt‎
Lines changed: 1 addition & 0 deletions b/‎docs/requirements.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎examples/FeatureSelection-WHODataset.ipynb‎
Lines changed: 376 additions & 0 deletions b/‎examples/FeatureSelection-WHODataset.ipynb‎
Lines changed: 376 additions & 0 deletions
@@ -9,3 +9,4 @@ sphinx_rtd_theme
 tqdm
 traitlets>=5.0
 jinja2 < 3.1
+pandas
@@ -0,0 +1,376 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "2a30aece",
+   "metadata": {},
+   "source": [
+    "# Feature Selection on the WHO Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c6857fae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from matplotlib import pyplot as plt\n",
+    "import numpy as np\n",
+    "from tqdm.auto import tqdm\n",
+    "from sklearn.kernel_ridge import KernelRidge\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from skcosmo.preprocessing import StandardFlexibleScaler\n",
+    "from skcosmo.feature_selection import PCovFPS, PCovCUR, FPS, CUR\n",
+    "from skcosmo.datasets import load_who_dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "de5f2f17",
+   "metadata": {},
+   "source": [
+    "## Load the Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b816f2fb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = load_who_dataset()['data']\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "472af9a2",
+   "metadata": {
+    "code_folding": []
+   },
+   "outputs": [],
+   "source": [
+    "# Indicator codes (used as column keys of `df`) paired with short readable labels.\n",
+    "indicators = [\n",
+    "    (\"SP.POP.TOTL\", \"Population\"),\n",
+    "    (\"SH.TBS.INCD\", \"Tuberculosis\"),\n",
+    "    (\"SH.IMM.MEAS\", \"Immunization, measles\"),\n",
+    "    (\"SE.XPD.TOTL.GD.ZS\", \"Educ. Expenditure\"),\n",
+    "    (\"SH.DYN.AIDS.ZS\", \"HIV\"),\n",
+    "    (\"SH.IMM.IDPT\", \"Immunization, DPT\"),\n",
+    "    (\"SH.XPD.CHEX.GD.ZS\", \"Health Expenditure\"),\n",
+    "    (\"SN.ITK.DEFC.ZS\", \"Undernourishment\"),\n",
+    "    (\"NY.GDP.PCAP.CD\", \"GDP per capita\"),\n",
+    "]\n",
+    "\n",
+    "# Positions into `indicators`, in the order the features should appear.\n",
+    "feature_order = [8, 4, 5, 6, 1, 0, 7, 3, 2]\n",
+    "\n",
+    "columns = [indicators[k][0] for k in feature_order]\n",
+    "column_names = [indicators[k][1] for k in feature_order]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a06715d8",
+   "metadata": {
+    "code_folding": []
+   },
+   "outputs": [],
+   "source": [
+    "X_raw = np.array(df[columns])\n",
+    "\n",
+    "# We are taking the logarithm of the population and GDP to avoid extreme distributions\n",
+    "log_scaled = ['SP.POP.TOTL', 'NY.GDP.PCAP.CD']\n",
+    "for ls in log_scaled:\n",
+    "    # Guard before any lookup: `columns.index` raises ValueError for a\n",
+    "    # code that is not among the selected columns.\n",
+    "    if ls not in columns:\n",
+    "        continue\n",
+    "    col = columns.index(ls)\n",
+    "    # Show the raw range before the transform is applied.\n",
+    "    print(X_raw[:, col].min(), X_raw[:, col].max())\n",
+    "    X_raw[:, col] = np.log10(X_raw[:, col])\n",
+    "\n",
+    "# Target column, reshaped to a single-column 2D array.\n",
+    "y_raw = np.array(df[\"SP.DYN.LE00.IN\"])\n",
+    "y_raw = y_raw.reshape(-1, 1)\n",
+    "X_raw.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f8cccebd",
+   "metadata": {},
+   "source": [
+    "## Scale and Center the Features and Targets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "43241e40",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x_scaler = StandardFlexibleScaler(column_wise=True)\n",
+    "X = x_scaler.fit_transform(X_raw)\n",
+    "\n",
+    "y_scaler = StandardFlexibleScaler(column_wise=True)\n",
+    "y = y_scaler.fit_transform(y_raw)\n",
+    "\n",
+    "n_components = 2\n",
+    "\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=0)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e623dc38",
+   "metadata": {},
+   "source": [
+    "## Provide an estimated target for the feature selector"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3d307bdc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "kernel_params = {\"kernel\": \"rbf\", \"gamma\": 0.08858667904100832}\n",
+    "krr = KernelRidge(alpha=0.006158482110660267, **kernel_params)\n",
+    "\n",
+    "yp_train = krr.fit(X_train, y_train).predict(X_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bb6adcbb",
+   "metadata": {},
+   "source": [
+    "## Compute the Selections for Each Selector Type"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "73b012f9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n_select = X.shape[1]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d54fd7e0",
+   "metadata": {},
+   "source": [
+    "### PCov-CUR"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "40469566",
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "pcur = PCovCUR(n_to_select=n_select, progress_bar=True, mixing=0.0)\n",
+    "pcur.fit(X_train, yp_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "74feb992",
+   "metadata": {},
+   "source": [
+    "### PCov-FPS"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "17eb69d7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pfps = PCovFPS(n_to_select=n_select, progress_bar=True, mixing=0.0, initialize=pcur.selected_idx_[0])\n",
+    "pfps.fit(X_train, yp_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2d7c1762",
+   "metadata": {},
+   "source": [
+    "### CUR"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ef80f649",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cur = CUR(n_to_select=n_select, progress_bar=True)\n",
+    "cur.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "29536065",
+   "metadata": {},
+   "source": [
+    "### FPS"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e4c934cb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fps = FPS(n_to_select=n_select, progress_bar=True, initialize=cur.selected_idx_[0])\n",
+    "fps.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "275587cd",
+   "metadata": {},
+   "source": [
+    "### (For Comparison) Recursive Feature Addition"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1e5510bf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class RecursiveFeatureAddition:\n",
+    "    \"\"\"Greedy forward feature selection driven by the notebook-level `krr` model.\n",
+    "\n",
+    "    At each step, the candidate column that yields the highest `krr.score`\n",
+    "    when added to the columns selected so far is kept. The model is fit and\n",
+    "    scored on the same `X`, `y` that are passed to `fit`.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, n_to_select):\n",
+    "        self.n_to_select = n_to_select\n",
+    "        # Selected column indices, filled in selection order by `fit`.\n",
+    "        self.selected_idx_ = np.zeros(n_to_select, dtype=int)\n",
+    "\n",
+    "    def fit(self, X, y):\n",
+    "        \"\"\"Pick `n_to_select` columns of `X` one at a time and return self.\"\"\"\n",
+    "        candidates = np.arange(X.shape[1])\n",
+    "        for step in range(self.n_to_select):\n",
+    "            chosen = self.selected_idx_[:step]\n",
+    "            scores = np.zeros(len(candidates))\n",
+    "            for pos, feature in enumerate(candidates):\n",
+    "                subset = [*chosen, feature]\n",
+    "                krr.fit(X[:, subset], y)\n",
+    "                scores[pos] = krr.score(X[:, subset], y)\n",
+    "            # Keep the best-scoring candidate and remove it from the pool.\n",
+    "            best = np.argmax(scores)\n",
+    "            self.selected_idx_[step] = candidates[best]\n",
+    "            candidates = np.array(np.delete(candidates, best), dtype=int)\n",
+    "        return self\n",
+    "\n",
+    "\n",
+    "rfa = RecursiveFeatureAddition(n_select).fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5975fde7",
+   "metadata": {},
+   "source": [
+    "## Plot our Results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a6b7a203",
+   "metadata": {
+    "code_folding": [],
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "fig, axes = plt.subplots(\n",
+    "    2, 1, figsize=(3.75, 5), gridspec_kw=dict(height_ratios=(1, 1.5)), sharex=True, dpi=150\n",
+    ")\n",
+    "# Feature counts evaluated: 1 .. n_select - 1.\n",
+    "ns = np.arange(1, n_select, dtype=int)\n",
+    "\n",
+    "all_errors = {}\n",
+    "for selector, color, linestyle, label in zip(\n",
+    "    [cur, fps, pcur, pfps, rfa],\n",
+    "    [\"red\", \"lightcoral\", \"blue\", \"dodgerblue\", \"black\"],\n",
+    "    [\"solid\", \"solid\", \"solid\", \"solid\", \"dashed\"],\n",
+    "    [\n",
+    "        \"CUR\",\n",
+    "        \"FPS\",\n",
+    "        \"PCov-CUR\\n\"+r\"($\\alpha=0.0$)\",\n",
+    "        \"PCov-FPS\\n\"+r\"($\\alpha=0.0$)\",\n",
+    "        \"Recursive\\nFeature\\nAddition\",\n",
+    "    ],\n",
+    "):\n",
+    "    # Score on the test split after refitting `krr` on the first n selected features.\n",
+    "    scores = np.zeros(len(ns))\n",
+    "    for i, n in enumerate(ns):\n",
+    "        chosen = selector.selected_idx_[:n]\n",
+    "        krr.fit(X_train[:, chosen], y_train)\n",
+    "        scores[i] = krr.score(X_test[:, chosen], y_test)\n",
+    "    all_errors[label] = scores\n",
+    "    # Upper panel: score versus number of selected features.\n",
+    "    axes[0].plot(ns, scores, c=color, label=label, linestyle=linestyle)\n",
+    "    # Lower panel: which feature index was picked at each step.\n",
+    "    axes[1].plot(ns, selector.selected_idx_[:max(ns)], c=color, marker='.', linestyle=linestyle)\n",
+    "\n",
+    "axes[1].set_xlabel(r\"$n_{select}$\")\n",
+    "axes[1].set_xticks(range(1, n_select))\n",
+    "axes[0].set_ylabel(r\"R$^2$\")\n",
+    "axes[1].set_yticks(np.arange(X.shape[1]))\n",
+    "axes[1].set_yticklabels(column_names, rotation=30, fontsize=10)\n",
+    "axes[0].legend(ncol=2, fontsize=8, bbox_to_anchor=(0.5, 1.0), loc='lower center')\n",
+    "axes[1].invert_yaxis()\n",
+    "axes[1].grid(axis='y', alpha=0.5)\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.4"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}