imedslab · lext · Jan 2, 2026 · Jan 2, 2026 · Jan 2, 2026 · Jan 2, 2026
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -33,3 +33,4 @@ jobs:
 
       - name: Run tests
         run: pytest
+
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@
 This package aims to be a one-stop shop for statistical testing in machine learning: evaluating models on a test set and checking whether our *improved* model really beats the baseline. That is, we cover the following very typical use-case in machine learning:
 ![usecase](docs/source/_static/usecase.png)
 
-Currently, we support the cases of classification, regresson, and semantic segmentation. We do not yet support the significance of ranking, as well as grouped data. It is coming in the future releases.
+Currently, we support the cases of classification, regression, and semantic segmentation.
 
 ## In practice
 Install from PyPI:

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -11,7 +11,7 @@ About
 
 
 Statistical Model Comparison with Bootstrap (STAMBO) focuses on statistically sound comparisons between models and samples by implementing
-the one-tailed bootstrap hypothesis tests:
+the two-tailed bootstrap hypothesis tests:
 
 .. figure:: /_static/banner.png
    :alt: stambo banner

diff --git a/notebooks/Classification.ipynb b/notebooks/Classification.ipynb
@@ -32,7 +32,7 @@
     {
      "data": {
       "text/plain": [
-       "'0.1.5'"
+       "'0.1.6'"
       ]
      },
      "execution_count": 1,
@@ -147,7 +147,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "2ac3e8d1720c49719e4b286464de2da1",
+       "model_id": "94d13ceeeac94889877ef71f603340a6",
        "version_major": 2,
        "version_minor": 0
       },
@@ -180,16 +180,41 @@
     {
      "data": {
       "text/plain": [
-       "{'ROCAUC': array([0.05494505, 0.01607463, 0.00116143, 0.03622366, 0.97275219,\n",
-       "        0.94904937, 0.99252755, 0.98882682, 0.97478143, 0.99825874]),\n",
-       " 'AP': array([0.02797203, 0.02314431, 0.00502882, 0.04678529, 0.9689868 ,\n",
-       "        0.94193877, 0.99255722, 0.99213111, 0.9810929 , 0.99893794]),\n",
-       " 'QKappa': array([ 0.92007992, -0.0320894 , -0.07893126,  0.00764438,  0.90810898,\n",
-       "         0.85387269,  0.95589713,  0.87601958,  0.81299213,  0.93027112]),\n",
-       " 'BACC': array([ 0.93906094, -0.02079161, -0.04718104,  0.00252861,  0.94531991,\n",
-       "         0.91380672,  0.97414673,  0.9245283 ,  0.88886054,  0.95691282]),\n",
-       " 'MCC': array([ 0.90709291, -0.02795235, -0.07123844,  0.0096027 ,  0.91078326,\n",
-       "         0.86011542,  0.95680034,  0.88283091,  0.82759216,  0.93254094])}"
+       "{'ROCAUC': {'p_value': 0.10989010989010989,\n",
+       "  'diff': 0.016074628438916494,\n",
+       "  'ci_es': (0.0011614303013786215, 0.03622365794387381),\n",
+       "  'ci_s1': (0.9490493728705037, 0.9925275462036619),\n",
+       "  'ci_s2': (0.9747814271953842, 0.9982587433114521),\n",
+       "  'emp_s1': 0.9727521872035416,\n",
+       "  'emp_s2': 0.9888268156424581},\n",
+       " 'AP': {'p_value': 0.055944055944055944,\n",
+       "  'diff': 0.023144311990304867,\n",
+       "  'ci_es': (0.005028817149312964, 0.04678528920344301),\n",
+       "  'ci_s1': (0.941938772504055, 0.992557217298247),\n",
+       "  'ci_s2': (0.9810928996823938, 0.9989379380439234),\n",
+       "  'emp_s1': 0.9689867972199498,\n",
+       "  'emp_s2': 0.9921311092102547},\n",
+       " 'QKappa': {'p_value': 0.16183816183816183,\n",
+       "  'diff': -0.03208940366959201,\n",
+       "  'ci_es': (-0.0789312628900082, 0.007644376319408607),\n",
+       "  'ci_s1': (0.8538726858185821, 0.9558971346535168),\n",
+       "  'ci_s2': (0.812992125984252, 0.9302711162834225),\n",
+       "  'emp_s1': 0.9081089795260358,\n",
+       "  'emp_s2': 0.8760195758564437},\n",
+       " 'BACC': {'p_value': 0.12387612387612387,\n",
+       "  'diff': -0.02079160957099191,\n",
+       "  'ci_es': (-0.047181042228212136, 0.0025286105738232976),\n",
+       "  'ci_s1': (0.9138067241897886, 0.9741467276565798),\n",
+       "  'ci_s2': (0.888860544217687, 0.9569128171763175),\n",
+       "  'emp_s1': 0.9453199114577844,\n",
+       "  'emp_s2': 0.9245283018867925},\n",
+       " 'MCC': {'p_value': 0.1878121878121878,\n",
+       "  'diff': -0.0279523458715083,\n",
+       "  'ci_es': (-0.07123843522834138, 0.009602702001584999),\n",
+       "  'ci_s1': (0.8601154159583538, 0.9568003358464806),\n",
+       "  'ci_s2': (0.8275921636811222, 0.932540941432413),\n",
+       "  'emp_s1': 0.9107832588440067,\n",
+       "  'emp_s2': 0.8828309129724984}}"
       ]
      },
      "execution_count": 5,
@@ -229,7 +254,7 @@
       "\\midrule\n",
       "Effect size & $0.02$ [$0.00$-$0.04]$ & $0.02$ [$0.01$-$0.05]$ & $-0.03$ [$-0.08$-$0.01]$ & $-0.02$ [$-0.05$-$0.00]$ & $-0.03$ [$-0.07$-$0.01]$ \\\\ \n",
       "\\midrule\n",
-      "$p$-value & $0.05$ & $0.03$ & $0.92$ & $0.94$ & $0.91$ \\\\ \n",
+      "$p$-value & $0.11$ & $0.06$ & $0.16$ & $0.12$ & $0.19$ \\\\ \n",
       "\\bottomrule\n",
       "\\end{tabular}\n"
      ]
@@ -278,14 +303,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
    "id": "afd3e34e-4d18-4045-a49f-121a1b178551",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "18a9e339d56b40eaa9ce43e50cd7c5ed",
+       "model_id": "cc4126fdca934dc09f3f52f6c2185786",
        "version_major": 2,
        "version_minor": 0
       },
@@ -304,7 +329,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
    "id": "10c6e6ab-787f-48b7-98f5-48de84564d94",
    "metadata": {},
    "outputs": [
@@ -322,7 +347,7 @@
       "\\midrule\n",
       "Effect size & $0.02$ [$0.00$-$0.04]$ & $0.02$ [$0.01$-$0.05]$ & $-0.00$ [$-0.01$-$0.01]$ \\\\ \n",
       "\\midrule\n",
-      "$p$-value & $0.05$ & $0.03$ & $0.52$ \\\\ \n",
+      "$p$-value & $0.11$ & $0.06$ & $0.96$ \\\\ \n",
       "\\bottomrule\n",
       "\\end{tabular}\n"
      ]
@@ -334,23 +359,37 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
    "id": "16f56d4c-114f-4701-80fb-783c1b3d1207",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'ROCAUC': array([0.05494505, 0.01607463, 0.00116143, 0.03622366, 0.97275219,\n",
-       "        0.94904937, 0.99252755, 0.98882682, 0.97478143, 0.99825874]),\n",
-       " 'AP': array([0.02797203, 0.02314431, 0.00502882, 0.04678529, 0.9689868 ,\n",
-       "        0.94193877, 0.99255722, 0.99213111, 0.9810929 , 0.99893794]),\n",
-       " 'F2Score': array([ 5.21478521e-01, -9.88531818e-04, -1.01705787e-02,  1.07904249e-02,\n",
-       "         9.83425414e-01,  9.70317291e-01,  9.93377483e-01,  9.82436883e-01,\n",
-       "         9.72540046e-01,  9.90712074e-01])}"
+       "{'ROCAUC': {'p_value': 0.10989010989010989,\n",
+       "  'diff': 0.016074628438916494,\n",
+       "  'ci_es': (0.0011614303013786215, 0.03622365794387381),\n",
+       "  'ci_s1': (0.9490493728705037, 0.9925275462036619),\n",
+       "  'ci_s2': (0.9747814271953842, 0.9982587433114521),\n",
+       "  'emp_s1': 0.9727521872035416,\n",
+       "  'emp_s2': 0.9888268156424581},\n",
+       " 'AP': {'p_value': 0.055944055944055944,\n",
+       "  'diff': 0.023144311990304867,\n",
+       "  'ci_es': (0.005028817149312964, 0.04678528920344301),\n",
+       "  'ci_s1': (0.941938772504055, 0.992557217298247),\n",
+       "  'ci_s2': (0.9810928996823938, 0.9989379380439234),\n",
+       "  'emp_s1': 0.9689867972199498,\n",
+       "  'emp_s2': 0.9921311092102547},\n",
+       " 'F2Score': {'p_value': 0.9590409590409591,\n",
+       "  'diff': -0.000988531817988858,\n",
+       "  'ci_es': (-0.01017057865486726, 0.010790424857444192),\n",
+       "  'ci_s1': (0.970317290652865, 0.9933774834437086),\n",
+       "  'ci_s2': (0.9725400457665904, 0.9907120743034056),\n",
+       "  'emp_s1': 0.9834254143646409,\n",
+       "  'emp_s2': 0.9824368825466521}}"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -361,7 +400,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 12,
    "id": "86c223df-2833-42a2-9b50-fae76accb6d4",
    "metadata": {},
    "outputs": [
@@ -379,7 +418,7 @@
       "\\midrule\n",
       "Effect size & $0.02$ [$0.00$-$0.04]$ & $0.02$ [$0.01$-$0.05]$ & $-0.00$ [$-0.01$-$0.01]$ \\\\ \n",
       "\\midrule\n",
-      "$p$-value & $0.05$ & $0.03$ & $0.52$ \\\\ \n",
+      "$p$-value & $0.11$ & $0.06$ & $0.96$ \\\\ \n",
       "\\bottomrule\n",
       "\\end{tabular}\n"
      ]
@@ -388,14 +427,6 @@
    "source": [
     "print(stambo.to_latex(testing_result, m1_name=\"kNN\", m2_name=\"LR\"))"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9cca7848",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {

diff --git a/notebooks/Classification_non_iid.ipynb b/notebooks/Classification_non_iid.ipynb
diff --git a/notebooks/Regression.ipynb b/notebooks/Regression.ipynb
@@ -31,7 +31,7 @@
     {
      "data": {
       "text/plain": [
-       "'0.1.5'"
+       "'0.1.6'"
       ]
      },
      "execution_count": 1,
@@ -126,7 +126,7 @@
     "\n",
     "As stated in the documentation, the testing routine returns a `dict` of `dict`. The keys in the outer dict are the metric tags, and the values are dicts that store the following quantities:\n",
     "\n",
-    "* p-value ($H_0: model_1 \\leq model_2$)\n",
+    "* p-value ($H_0: model_1 = model_2$)\n",
     "* Empirical value (model 1)\n",
     "* CI low (model 1)\n",
     "* CI high (model 1)\n",
@@ -156,7 +156,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "5ef125fc1d67468ab28cb2f52a47f2fe",
+       "model_id": "8966ef7c555d4b0aa12ba0ab900b2ddb",
        "version_major": 2,
        "version_minor": 0
       },
@@ -189,12 +189,20 @@
     {
      "data": {
       "text/plain": [
-       "{'MAE': array([ 3.29934013e-02,  3.85996728e+00, -1.36873670e-01,  7.93333005e+00,\n",
-       "         4.60042861e+01,  4.17109824e+01,  5.05579091e+01,  4.98642534e+01,\n",
-       "         4.47314857e+01,  5.51450226e+01]),\n",
-       " 'MSE': array([4.99900020e-03, 8.02532883e+02, 2.27838928e+02, 1.41044449e+03,\n",
-       "        3.22590754e+03, 2.69030996e+03, 3.82864762e+03, 4.02844042e+03,\n",
-       "        3.25671296e+03, 4.88786525e+03])}"
+       "{'MAE': {'p_value': 0.06598680263947211,\n",
+       "  'diff': 3.859967279672837,\n",
+       "  'ci_es': (-0.13687366958812033, 7.933330050547908),\n",
+       "  'ci_s1': (41.71098240871023, 50.557909066351826),\n",
+       "  'ci_s2': (44.731485671191564, 55.14502262443438),\n",
+       "  'emp_s1': 46.00428611399232,\n",
+       "  'emp_s2': 49.86425339366516},\n",
+       " 'MSE': {'p_value': 0.009998000399920015,\n",
+       "  'diff': 802.5328829652262,\n",
+       "  'ci_es': (227.83892760580275, 1410.444493286724),\n",
+       "  'ci_s1': (2690.309962073994, 3828.647621254201),\n",
+       "  'ci_s2': (3256.712958773253, 4887.865246354953),\n",
+       "  'emp_s1': 3225.907539357549,\n",
+       "  'emp_s2': 4028.4404223227752}}"
       ]
      },
      "execution_count": 5,
@@ -234,7 +242,7 @@
       "\\midrule\n",
       "Effect size & $3.86$ [$-0.14$-$7.93]$ & $802.53$ [$227.84$-$1410.44]$ \\\\ \n",
       "\\midrule\n",
-      "$p$-value & $0.03$ & $0.00$ \\\\ \n",
+      "$p$-value & $0.07$ & $0.01$ \\\\ \n",
       "\\bottomrule\n",
       "\\end{tabular}\n"
      ]
Original file line number	Diff line number	Diff line change
Expand Up		@@ -33,3 +33,4 @@ jobs:

		- name: Run tests
		run: pytest