add get_sparsity; add get_roughness; validate node_id in leaf accessors; convert input to ndarray in decision_function; version 0.1.6

[zebinyang] · [zebinyang] · commit 5b5fed2aa891 · 2021-09-02T01:56:44.000+08:00
diff --git a/examples/demo.ipynb b/examples/demo.ipynb
@@ -193,13 +193,34 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2021-09-01T17:37:22.054020Z",
-     "start_time": "2021-09-01T17:29:18.371Z"
+     "end_time": "2021-09-01T17:49:13.840143Z",
+     "start_time": "2021-09-01T17:45:39.855056Z"
     }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "TypeError",
+     "evalue": "'>=' not supported between instances of 'numpy.ndarray' and 'str'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-7-8582c77c0a6d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      2\u001b[0m                         n_split_grid=20, n_screen_grid=5, n_feature_search=10)\n\u001b[1;32m      3\u001b[0m \u001b[0mclf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_x\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain_y\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mpred_train\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mclf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict_proba\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_x\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      5\u001b[0m \u001b[0mpred_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mclf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict_proba\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest_x\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0mroc_auc_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_y\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpred_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mroc_auc_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest_y\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpred_test\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda2_local/envs/py37/lib/python3.7/site-packages/simtree/mobtree.py\u001b[0m in \u001b[0;36mpredict_proba\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m    651\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    652\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mpredict_proba\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 653\u001b[0;31m         \u001b[0mproba\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecision_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    654\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mproba\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mproba\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    655\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda2_local/envs/py37/lib/python3.7/site-packages/simtree/mobtree.py\u001b[0m in \u001b[0;36mdecision_function\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m    503\u001b[0m             \u001b[0mnode\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtree\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    504\u001b[0m             \u001b[0;32mwhile\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mnode\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'is_leaf'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 505\u001b[0;31m                 \u001b[0;32mif\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnode\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'feature'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0mnode\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'threshold'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    506\u001b[0m                     \u001b[0mnode\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtree\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnode\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'left_child_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    507\u001b[0m                 \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mTypeError\u001b[0m: '>=' not supported between instances of 'numpy.ndarray' and 'str'"
+     ]
+    }
+   ],
    "source": [
     "clf = GLMTreeClassifier(max_depth=3, min_samples_leaf=50, reg_lambda=np.logspace(-5, 5, 10).tolist(),\n",
     "                        n_split_grid=20, n_screen_grid=5, n_feature_search=10)\n",
@@ -211,9 +232,53 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
+   "execution_count": 13,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-09-01T17:53:47.559641Z",
+     "start_time": "2021-09-01T17:53:47.530587Z"
+    }
+   },
    "outputs": [],
+   "source": [
+    "leaf_idx = []\n",
+    "for row in train_x.values:\n",
+    "    node = clf.tree[1]\n",
+    "    while not node['is_leaf']:\n",
+    "        if row[node['feature']] <= node['threshold']:\n",
+    "            node = clf.tree[node['left_child_id']]\n",
+    "        else:\n",
+    "            node = clf.tree[node['right_child_id']]\n",
+    "    leaf_idx.append(node['node_id'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-09-01T17:54:00.517384Z",
+     "start_time": "2021-09-01T17:54:00.511025Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[ 57.5644,  20.2196,   2.683 , ...,  11.8193,  26.1085, 217.544 ],\n",
+       "       [ 27.7996,  14.0561,   2.6839, ...,  -9.2172,  38.023 ,  97.7341],\n",
+       "       [ 48.4661,  22.7264,   2.953 , ...,  24.8371,   4.825 , 266.665 ],\n",
+       "       ...,\n",
+       "       [ 35.8286,  16.8952,   2.8802, ...,  11.3048,   0.472 , 234.868 ],\n",
+       "       [ 20.0986,  12.8671,   2.4057, ...,   7.875 ,  21.675 , 212.098 ],\n",
+       "       [ 27.2726,  12.6129,   2.7288, ...,  -9.9008,   3.789 , 185.431 ]])"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": []
   }
  ],
diff --git a/setup.py b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup
 
 setup(name='simtree',
-      version='0.1.5',
+      version='0.1.6',
       description='Single-index model tree',
       url='https://github.com/ZebinYang/SIMTree',
       author='Zebin Yang',
diff --git a/simtree/__init__.py b/simtree/__init__.py
@@ -8,5 +8,5 @@
         "SIMTreeRegressor", "SIMTreeClassifier",
         "CustomMobTreeRegressor", "CustomMobTreeClassifier"]
 
-__version__ = '0.1.5'
+__version__ = '0.1.6'
 __author__ = 'Zebin Yang'
diff --git a/simtree/mobtree.py b/simtree/mobtree.py
@@ -499,7 +499,7 @@ def decision_function(self, x):
         check_is_fitted(self, "tree")
 
         leaf_idx = []
-        for row in x:
+        for row in np.array(x):
             node = self.tree[1]
             while not node['is_leaf']:
                 if row[node['feature']] <= node['threshold']:
diff --git a/simtree/simtree.py b/simtree/simtree.py
@@ -95,6 +95,12 @@ def get_projection_index(self, node_id):
         node_id : int
             the id of leaf node
         """
+
+        check_is_fitted(self, "tree")
+        if node_id not in self.leaf_estimators_.keys():
+            print("Invalid leaf node id.")
+            return
+
         return self.leaf_estimators_[node_id].beta_.flatten()
 
     def get_feature_importance(self, node_id):
@@ -106,6 +112,12 @@ def get_feature_importance(self, node_id):
         node_id : int
             the id of leaf node
         """
+
+        check_is_fitted(self, "tree")
+        if node_id not in self.leaf_estimators_.keys():
+            print("Invalid leaf node id.")
+            return
+
         importance = (self.x[self.decision_path_indice(self.x, node_id)] * self.leaf_estimators_[node_id].beta_.ravel()).std(0)
         return importance
 
@@ -120,6 +132,12 @@ def get_projection_equation(self, node_id, precision=3):
         precision : int
             the precision of coefficients
         """
+
+        check_is_fitted(self, "tree")
+        if node_id not in self.leaf_estimators_.keys():
+            print("Invalid leaf node id.")
+            return
+
         equation = ""
         importance = self.get_feature_importance(node_id)
         sortind = np.argsort(importance)[::-1]
@@ -135,6 +153,51 @@ def get_projection_equation(self, node_id, precision=3):
                     equation += " - "
                 equation += str(round(np.abs(est.beta_[sortind[i], 0]), 3)) + self.feature_names[sortind[i]]
         return equation
+    
+    def get_sparsity(self, node_id, grid_size=100):
+        """return the sparsity of the projection index in one leaf node, i.e., the fraction (in [0, 1]) of zero coefficients.
+
+        Parameters
+        ---------
+        node_id : int
+            the id of leaf node
+        grid_size : int
+            unused here; accepted so the signature mirrors get_roughness
+        """
+        check_is_fitted(self, "tree")
+        if node_id not in self.leaf_estimators_.keys():
+            print("Invalid leaf node id.")
+            return
+        if self.leaf_estimators_[node_id] is None:  # same guard as get_roughness: a None estimator has no beta_
+            print("This is a constant node, and SIM is not available.")
+            return
+        return np.mean(self.leaf_estimators_[node_id].beta_ == 0)
+
+    def get_roughness(self, node_id, grid_size=100):
+        """return the roughness of one leaf's ridge function: the root-mean-square of its second derivative on a grid.
+
+        Parameters
+        ---------
+        node_id : int
+            the id of leaf node
+        grid_size : int
+            the number of grid points for approximation
+        """
+        check_is_fitted(self, "tree")
+        if node_id not in self.leaf_estimators_.keys():
+            print("Invalid leaf node id.")
+            return
+
+        leaf_model = self.leaf_estimators_[node_id]
+        if leaf_model is None:
+            print("This is a constant node, and SIM is not available.")
+            return
+
+        ridge = leaf_model.shape_fit_
+        # build grid_size + 2 evenly spaced points, then drop both endpoints
+        interior = np.linspace(ridge.xmin, ridge.xmax, grid_size + 2)[1:-1]
+        squared_curvature = [ridge.diff(pt, order=2) ** 2 for pt in interior]
+        return np.sqrt(np.mean(squared_curvature))
 
     def visualize_one_leaf(self, node_id, folder="./results/", name="leaf_sim", save_png=False, save_eps=False):