
Commit 1154726

bin reg with hidden layers: torch and tf more consistent, add roc in torch
1 parent 7daae06 commit 1154726

File tree
2 files changed, +132 -45 lines changed


binary_logistic_regression_tf_with_hidden_layers.ipynb

Lines changed: 8 additions & 7 deletions
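The first hunk below shows only the head of the `make_classification` call around the changed seed index. For context, here is a plausible reconstruction of the full data-generation step; only `n_samples` and the seed handling are visible in the diff, so `N` and the remaining keyword values are assumptions (`M = 100000` would match the (80000,1)/(20000,1) split shapes mentioned later in the torch notebook):

```python
# hypothetical reconstruction for context -- only n_samples and the seed
# handling are visible in the hunk below; all other values are assumptions
import numpy as np
from sklearn.datasets import make_classification

M, N = 100000, 2  # assumed: M matches the (80000,1)/(20000,1) shapes later on
random_state_idx = 5
random_state = np.array([7, 21, 24, 25, 29, 33, 38])
X, Y = make_classification(
    n_samples=M,
    n_features=N,
    n_informative=N,         # assumed: no redundant/repeated features
    n_redundant=0,
    n_clusters_per_class=2,  # 'two clusters' per class, per the comment
    random_state=random_state[random_state_idx],
)
```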
@@ -97,7 +97,7 @@
     "\n",
     "# these seeds produce 'nice' two classes each with\n",
     "# two clusters for chosen M, N and train_size\n",
-    "random_state_idx = 0\n",
+    "random_state_idx = 5\n",
     "random_state = np.array([7, 21, 24, 25, 29, 33, 38])\n",
     "X, Y = make_classification(\n",
     "    n_samples=M,\n",
@@ -210,8 +210,9 @@
    "id": "d32b7d1a",
    "metadata": {},
    "source": [
-    "- define model architecture based on fully connected layers\n",
-    "- in practice number and dimension of hidden layers should be hyper parameters to be learned"
+    "## Potential Model Architectures\n",
+    "- models based on fully connected layers\n",
+    "- in practice number and dimension of hidden layers should be hyper parameters which have to be learned"
    ]
   },
   {
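The renamed section introduces fully connected architectures whose hidden-layer widths are collected in `no_perceptron_in_hl` (next hunk). As a minimal sketch, an array like that could drive Keras model construction, and the `trainable params` counts quoted in the next hunk's comments follow from the per-layer weights-plus-biases formula; `build_model` and `count_params` are hypothetical helpers, and tanh hidden units with a sigmoid output unit are assumed to mirror the torch `Model` further down:

```python
# hypothetical helpers: build a fully connected binary classifier from an
# array of hidden-layer widths, and verify the 'trainable params' comments
# (assuming N = 2 input features and one sigmoid output unit)
import numpy as np
import tensorflow as tf

def build_model(input_dim, no_perceptron_in_hl):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Input(shape=(input_dim,)))
    for width in no_perceptron_in_hl:
        model.add(tf.keras.layers.Dense(width, activation="tanh"))
    model.add(tf.keras.layers.Dense(1, activation="sigmoid"))
    return model

def count_params(hidden, n_in=2, n_out=1):
    total, fan_in = 0, n_in
    for width in list(hidden) + [n_out]:
        total += fan_in * width + width  # weights + biases per Dense layer
        fan_in = width
    return total

for hl in ([8], [5, 2], [5], [3, 2], [2, 2], [2]):
    print(hl, count_params(hl))  # -> 33, 30, 21, 20, 15, 9
```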
@@ -241,8 +242,8 @@
     "# no_perceptron_in_hl = np.array([8]) # trainable params 33\n",
     "# no_perceptron_in_hl = np.array([5, 2]) # trainable params 30\n",
     "# no_perceptron_in_hl = np.array([5]) # trainable params 21\n",
-    "# no_perceptron_in_hl = np.array([3, 2]) # trainable params 20\n",
-    "no_perceptron_in_hl = np.array([2, 2]) # trainable params 15\n",
+    "no_perceptron_in_hl = np.array([3, 2]) # trainable params 20\n",
+    "# no_perceptron_in_hl = np.array([2, 2]) # trainable params 15\n",
     "# model too simple?! :\n",
     "# no_perceptron_in_hl = np.array([2]) # trainable params 9\n",
     "# -> try to train this last model longer, i.e. with more epochs"
@@ -354,7 +355,7 @@
    "id": "7a59cc9e",
    "metadata": {},
    "source": [
-    "### Train Data"
+    "### Metrics on Train Data"
    ]
   },
   {
@@ -393,7 +394,7 @@
    "id": "21166f87",
    "metadata": {},
    "source": [
-    "### Test Data"
+    "### Metrics on Test Data"
    ]
   },
   {
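The two renamed headings above label cells that evaluate the fitted model on each split; the cell bodies themselves are unchanged and therefore not shown in this diff. A minimal sketch of what such a metrics cell typically computes, assuming the Keras model outputs class-1 probabilities and a 0.5 decision threshold:

```python
# hypothetical metrics cell -- the actual cell bodies are unchanged and
# hence absent from this diff; the 0.5 threshold is an assumption
from sklearn.metrics import accuracy_score, confusion_matrix

Y_pred_train = (model.predict(X_train) >= 0.5) * 1
print("train accuracy", accuracy_score(Y_train, Y_pred_train))
print(confusion_matrix(Y_train, Y_pred_train))  # rows: true, cols: predicted
```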

binary_logistic_regression_torch_with_hidden_layers.ipynb

Lines changed: 124 additions & 38 deletions
@@ -81,7 +81,7 @@
     "\n",
     "# these seeds produce 'nice' two classes each with\n",
     "# two clusters for chosen M, N and train_size\n",
-    "random_state_idx = 0\n",
+    "random_state_idx = 5\n",
     "random_state = np.array([7, 21, 24, 25, 29, 33, 38])\n",
     "X, Y = make_classification(\n",
     "    n_samples=M,\n",
@@ -97,12 +97,14 @@
     "X_train, X_test, Y_train, Y_test = train_test_split(\n",
     "    X, Y, train_size=train_size, random_state=None\n",
     ")\n",
+    "Y_train = Y_train[:, None]\n",
+    "Y_test = Y_test[:, None]\n",
     "M_train = X_train.shape[0]\n",
     "M_test = X_test.shape[0]\n",
     "print(\"M_train\", M_train)\n",
-    "print(\"X train dim\", X_train.shape, \"Y train dim\", Y_train.shape)\n",
+    "print(\"X train dim\", X_train.shape, \", Y train dim\", Y_train.shape)\n",
     "print(\"M_test\", M_test)\n",
-    "print(\"X test dim\", X_test.shape, \"Y test dim\", Y_test.shape)"
+    "print(\"X test dim\", X_test.shape, \", Y test dim\", Y_test.shape)"
    ]
   },
   {
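The two added `[:, None]` lines give the targets shape `(M, 1)`, matching the model output, right where the split is created instead of in a later cell (see the next hunk, which removes the old reshaping). This matters because elementwise loss arithmetic between an `(M,)` target and an `(M, 1)` prediction silently broadcasts to `(M, M)`. A small demonstration of the pitfall:

```python
# why the targets get an explicit trailing axis: (M,) against (M, 1)
# silently broadcasts to (M, M) in elementwise loss expressions
import numpy as np

y_true = np.array([0., 1., 1.])           # shape (3,)
y_pred = np.array([[0.1], [0.8], [0.9]])  # shape (3, 1), like a model output
print((y_true - y_pred).shape)            # (3, 3) -- silent broadcast bug
print((y_true[:, None] - y_pred).shape)   # (3, 1) -- what the new lines ensure
```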
@@ -202,14 +204,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "Y_train = Y_train[:, None]\n",
-    "Y_test = Y_test[:, None]\n",
-    "\n",
-    "print(\"X train dim\", X_train.shape, \"Y train dim\", Y_train.shape)\n",
-    "print(\"X test dim\", X_test.shape, \"Y test dim\", Y_test.shape)\n",
-    "\n",
     "data_train = TensorDataset(torch.FloatTensor(X_train),\n",
     "                           torch.FloatTensor(Y_train))\n",
+    "data_test = TensorDataset(torch.FloatTensor(X_test),\n",
+    "                          torch.FloatTensor(Y_test))\n",
     "data_train_loader = DataLoader(dataset=data_train,\n",
     "                               batch_size=batch_size,\n",
     "                               shuffle=True)"
235233
" def __init__(self, input_size):\n",
236234
" super(Model, self).__init__()\n",
237235
"\n",
238-
" self.linear1 = torch.nn.Linear(input_size, 2)\n",
236+
" self.linear1 = torch.nn.Linear(input_size, 3)\n",
239237
" self.act1 = torch.nn.Tanh()\n",
240-
" self.linear2 = torch.nn.Linear(2, 2)\n",
238+
" self.linear2 = torch.nn.Linear(3, 2)\n",
241239
" self.act2 = torch.nn.Tanh()\n",
242240
" self.linear3 = torch.nn.Linear(2, 1)\n",
243241
" self.sigmoid = torch.nn.Sigmoid()\n",
@@ -335,17 +333,18 @@
     "        loss.backward() # back prop\n",
     "        optimizer.step() # gradient descent\n",
     "        optimizer.zero_grad() # reset gradients for next iter\n",
-    "    # all batches per epoch, now do a prediction\n",
-    "    # on train & test so check where we are:\n",
+    "    # all batches per epoch done\n",
+    "    # next, do a prediction\n",
+    "    # on train & test to check where we are:\n",
     "    with torch.no_grad(): # no_grad !!! to not influence the back prop\n",
-    "        er = empirical_risk(model(\n",
-    "            torch.tensor(X_train, dtype=torch.float32).to(device)),\n",
-    "            torch.tensor(Y_train, dtype=torch.float32).to(device))\n",
-    "        print('train loss', er)\n",
-    "        er = empirical_risk(model(\n",
-    "            torch.tensor(X_test, dtype=torch.float32).to(device)),\n",
-    "            torch.tensor(Y_test, dtype=torch.float32).to(device))\n",
-    "        print('test loss', er)\n",
+    "        er_train = empirical_risk(\n",
+    "            model(data_train[:][0]), # model prediction\n",
+    "            data_train[:][1]) # ground truth\n",
+    "        print('train loss', '%0.15f' % er_train)\n",
+    "        er_test = empirical_risk(\n",
+    "            model(data_test[:][0]), # model prediction\n",
+    "            data_test[:][1]) # ground truth\n",
+    "        print('test loss', '%0.15f' % er_test)\n",
     "        print('#####\\n')"
    ]
   },
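The rewritten evaluation relies on `data_train[:][0]` / `data_train[:][1]` to pull the full feature and target tensors out of the datasets built earlier. That works because indexing a `TensorDataset` applies the index to every stored tensor and returns the results as a tuple; a tiny illustration:

```python
# why data_train[:][0] / data_train[:][1] work: TensorDataset applies the
# index (here a full slice) to each stored tensor and returns a tuple
import torch
from torch.utils.data import TensorDataset

ds = TensorDataset(torch.zeros(5, 2), torch.ones(5, 1))
features, targets = ds[:]             # (all features, all targets)
print(features.shape, targets.shape)  # torch.Size([5, 2]) torch.Size([5, 1])
```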
@@ -372,17 +371,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "with torch.no_grad():\n",
-    "\n",
-    "    er = empirical_risk(model.forward(\n",
-    "        torch.tensor(X_train, dtype=torch.float32).to(device)),\n",
-    "        torch.tensor(Y_train, dtype=torch.float32).to(device))\n",
-    "    print('final train loss', er)\n",
-    "\n",
-    "    er = empirical_risk(model.forward(\n",
-    "        torch.tensor(X_test, dtype=torch.float32).to(device)),\n",
-    "        torch.tensor(Y_test, dtype=torch.float32).to(device))\n",
-    "    print('final test loss', er)"
+    "with torch.no_grad(): # no_grad !!! to not influence the back prop\n",
+    "    er_train = empirical_risk(\n",
+    "        model(data_train[:][0]), # model prediction\n",
+    "        data_train[:][1]) # ground truth\n",
+    "    print('train loss', '%0.15f' % er_train)\n",
+    "    er_test = empirical_risk(\n",
+    "        model(data_test[:][0]), # model prediction\n",
+    "        data_test[:][1]) # ground truth\n",
+    "    print('test loss', '%0.15f' % er_test)"
    ]
   },
   {
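`empirical_risk` itself is defined outside the changed lines. For binary logistic regression it is presumably the mean binary cross-entropy between the predicted class-1 probabilities and the 0/1 targets; a sketch under that assumption:

```python
# assumption: empirical_risk (defined outside this diff) is the mean
# binary cross-entropy, i.e. -(y*log(p) + (1-y)*log(1-p)).mean()
import torch

def empirical_risk(y_pred, y_true):
    return torch.nn.functional.binary_cross_entropy(
        y_pred, y_true, reduction='mean')
```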
@@ -400,13 +397,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# from here, we work with 2D numpy arrays, i.e. (80000,1) and (20000,1)\n",
     "with torch.no_grad():\n",
-    "\n",
-    "    X_tmp = torch.tensor(X_train, dtype=torch.float32).to(device)\n",
-    "    Y_pred_train = model.predict_class(X_tmp).cpu()\n",
-    "\n",
-    "    X_tmp = torch.tensor(X_test, dtype=torch.float32).to(device)\n",
-    "    Y_pred_test = model.predict_class(X_tmp).cpu()"
+    "    Y_pred_train = model.predict_class(data_train[:][0]).cpu().numpy()\n",
+    "    Y_pred_test = model.predict_class(data_test[:][0]).cpu().numpy()\n",
+    "Y_train.shape, Y_pred_train.shape, Y_test.shape, Y_pred_test.shape"
    ]
   },
   {
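`model.predict_class`, used here, is not defined in any changed hunk. Based on the layers in the `__init__` hunk above and the 50% decision boundary mentioned at the end of the commit, the full module plausibly looks like the following sketch; the `forward` wiring and the 0.5 threshold are inferred, not shown in the diff:

```python
# hypothetical completion of Model -- only __init__ appears in the diff;
# forward() wiring and the 0.5 threshold are inferred from later usage
import torch

class Model(torch.nn.Module):
    def __init__(self, input_size):
        super(Model, self).__init__()
        self.linear1 = torch.nn.Linear(input_size, 3)
        self.act1 = torch.nn.Tanh()
        self.linear2 = torch.nn.Linear(3, 2)
        self.act2 = torch.nn.Tanh()
        self.linear3 = torch.nn.Linear(2, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        x = self.act1(self.linear1(x))
        x = self.act2(self.linear2(x))
        return self.sigmoid(self.linear3(x))  # class-1 probability

    def predict_class(self, x, threshold=0.5):
        return (self.forward(x) >= threshold) * 1
```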
@@ -564,6 +559,97 @@
     "    plt.ylabel(\"feature 2\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "82be2710",
+   "metadata": {},
+   "source": [
+    "## Plot Decision Curves"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c2b8382b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.figure(figsize=(10, 10))\n",
+    "N = 500\n",
+    "predict_threshold = np.linspace(0, 1, N, endpoint=False)\n",
+    "TPR = np.zeros_like(predict_threshold)\n",
+    "FPR = np.zeros_like(predict_threshold)\n",
+    "TNR = np.zeros_like(predict_threshold)\n",
+    "FNR = np.zeros_like(predict_threshold)\n",
+    "with torch.no_grad():\n",
+    "    Y_pred_test = model.forward(data_test[:][0]).cpu().numpy()\n",
+    "\n",
+    "for idx, val in enumerate(predict_threshold):\n",
+    "    Y_pred_threshold = (Y_pred_test >= val) * 1\n",
+    "    cm = confusion_matrix(Y_test, Y_pred_threshold)\n",
+    "    TN, FP = cm[0, 0], cm[0, 1]\n",
+    "    FN, TP = cm[1, 0], cm[1, 1]\n",
+    "    FPR[idx] = FP / (TN+FP) # type I error\n",
+    "    TPR[idx] = TP / (FN+TP) # recall, sensitivity, test power\n",
+    "    FNR[idx] = FN / (FN+TP) # type II error\n",
+    "    TNR[idx] = TN / (TN+FP) # specificity, selectivity\n",
+    "    if idx == N//2: # indicate 0.5 probability decision point\n",
+    "        plt.subplot(2, 2, 1)\n",
+    "        plt.text(FPR[idx], TPR[idx], '. %0.2f' % val)\n",
+    "        plt.subplot(2, 2, 2)\n",
+    "        plt.text(FPR[idx], FNR[idx], '. %0.2f' % val)\n",
+    "        plt.subplot(2, 2, 3)\n",
+    "        plt.text(FNR[idx], TNR[idx], '. %0.2f' % val)\n",
+    "        plt.subplot(2, 2, 4)\n",
+    "        plt.text(TPR[idx], TNR[idx], '. %0.2f' % val)\n",
+    "\n",
+    "# receiver operating characteristic (ROC) curve:\n",
+    "plt.subplot(2, 2, 1)\n",
+    "plt.plot(FPR, TPR, lw=2)\n",
+    "plt.plot(0.01, 0.99, 'C3x', label='1%, 99% target'),\n",
+    "plt.plot([0, 1], [0, 1])\n",
+    "plt.xlabel('FPR = type I error')\n",
+    "plt.ylabel('TPR = recall = sensitivity = power')\n",
+    "plt.grid(True)\n",
+    "plt.legend()\n",
+    "plt.axis([0, 0.1, 0.9, 1])\n",
+    "\n",
+    "plt.subplot(2, 2, 2)\n",
+    "plt.plot(FPR, FNR, lw=2)\n",
+    "plt.plot(0.01, 0.01, 'C3x')\n",
+    "plt.plot([0, 1], [1, 0])\n",
+    "plt.xlabel('FPR = type I error')\n",
+    "plt.ylabel('FNR = type II error')\n",
+    "plt.grid(True)\n",
+    "plt.axis([0, 0.1, 0, 0.1])\n",
+    "\n",
+    "plt.subplot(2, 2, 3)\n",
+    "plt.plot(FNR, TNR, lw=2)\n",
+    "plt.plot(0.01, 0.99, 'C3x')\n",
+    "plt.plot([0, 1], [0, 1])\n",
+    "plt.xlabel('FNR = type II error')\n",
+    "plt.ylabel('TNR = specificity = selectivity')\n",
+    "plt.grid(True)\n",
+    "plt.axis([0, 0.1, 0.9, 1])\n",
+    "\n",
+    "plt.subplot(2, 2, 4)\n",
+    "plt.plot(TPR, TNR, lw=2)\n",
+    "plt.plot(0.99, 0.99, 'C3x')\n",
+    "plt.plot([0, 1], [1, 0])\n",
+    "plt.xlabel('TPR = recall = sensitivity = power')\n",
+    "plt.ylabel('TNR = specificity = selectivity')\n",
+    "plt.grid(True)\n",
+    "plt.axis([0.9, 1, 0.9, 1])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "69e3213d",
+   "metadata": {},
+   "source": [
+    "Using the typical 50% decision boundary, the trained model performs almost ideally, close to the 1% type I error / 99% power target on the unseen test data."
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "43218100",

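The manual threshold sweep in the new cell traces the ROC and its companion curves point by point, which makes the 1% / 99% operating point explicit. `sklearn.metrics` can reproduce the ROC data and the area under it directly, which makes a handy cross-check; this sketch reuses the notebook's `model`, `data_test` and `Y_test`:

```python
# cross-check of the manual sweep: sklearn derives the ROC points and the
# area under the curve straight from the predicted probabilities
import matplotlib.pyplot as plt
import torch
from sklearn.metrics import roc_curve, roc_auc_score

with torch.no_grad():
    Y_score = model(data_test[:][0]).cpu().numpy()

fpr, tpr, thresholds = roc_curve(Y_test.ravel(), Y_score.ravel())
print('test AUC', roc_auc_score(Y_test.ravel(), Y_score.ravel()))
plt.plot(fpr, tpr, lw=2)
plt.xlabel('FPR = type I error')
plt.ylabel('TPR = recall = sensitivity = power')
plt.grid(True)
```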