
Commit ebdd372

Author: Qian, Hai

Logistic regression: Added the help message

Also updated the help messages for linear regression and the regression estimators to add a description of the summary tables.

1 parent: ef3b7d7

File tree: 9 files changed, +191 −20 lines

src/ports/postgres/modules/regress/clustered_variance.py_in

Lines changed: 9 additions & 0 deletions

@@ -288,6 +288,9 @@ def clustered_variance_linregr_help(schema_madlib, msg=None, **kwargs):
     std_err DOUBLE PRECISION[], -- Clustered standard errors for coef
     t_stats DOUBLE PRECISION[], -- t-stats of the errors
     p_values DOUBLE PRECISION[] -- p-values of the errors
+
+The output summary table is the same as that of linregr_train(); see also:
+    SELECT linregr_train('usage');
 """.format(schema_madlib=schema_madlib)

 # ========================================================================

@@ -451,6 +454,9 @@ def clustered_variance_logregr_help(schema_madlib, msg=None, **kwargs):
     std_err DOUBLE PRECISION[], -- Clustered standard errors for coef
     z_stats DOUBLE PRECISION[], -- z-stats of the errors
     p_values DOUBLE PRECISION[] -- p-values of the errors
+
+The output summary table is the same as that of logregr_train(); see also:
+    SELECT logregr_train('usage');
 """.format(schema_madlib=schema_madlib)

@@ -720,4 +726,7 @@ def clustered_variance_mlogregr_help(schema_madlib, msg=None, **kwargs):
     std_err DOUBLE PRECISION[], -- Clustered standard errors for coef
     z_stats DOUBLE PRECISION[], -- z-stats of the errors
     p_values DOUBLE PRECISION[] -- p-values of the errors
+
+The output summary table is the same as that of mlogregr_train(); see also:
+    SELECT mlogregr_train('usage');
 """.format(schema_madlib=schema_madlib)
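The help functions touched in this file all follow the same dispatch pattern: a message keyword selects a block of text, and a `{schema_madlib}` placeholder is substituted with `str.format` just before returning. A minimal standalone sketch of that pattern (the function name `example_help` and its text are illustrative, not MADlib's actual code):

```python
def example_help(schema_madlib, msg=None, **kwargs):
    """Illustrative help dispatcher in the style of the MADlib regress modules.

    msg selects which help text to return; None gives the summary.
    """
    if msg is None:
        help_string = """Example method: fits an example model.
For usage details:
    SELECT {schema_madlib}.example_train('usage')"""
    elif msg in ('usage', 'help', '?'):
        help_string = """SELECT {schema_madlib}.example_train(source_table, out_table);
The output summary table is the same as that of linregr_train(); see also:
    SELECT linregr_train('usage');"""
    else:
        help_string = "No such option. Use {schema_madlib}.example_train()"
    # The schema placeholder is substituted once, at the very end.
    return help_string.format(schema_madlib=schema_madlib)
```

Calling `example_help('madlib', 'usage')` yields the usage text with `madlib.` substituted for the placeholder, mirroring how the real help functions embed the installation schema.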

src/ports/postgres/modules/regress/linear.py_in

Lines changed: 28 additions & 18 deletions

@@ -208,8 +208,8 @@ def linregr_help_message(schema_madlib, message, **kwargs):
 Ordinary Least Squares Regression, also called Linear Regression, is a
 statistical model used to fit linear models.

-It models a linear relationship of a scalar dependent variable \f$ y \f$ to one
-or more explanatory independent variables \f$ x \f$ to build a
+It models a linear relationship of a scalar dependent variable y to one
+or more explanatory independent variables x to build a
 model of coefficients.

 For more details on function usage:

@@ -224,29 +224,39 @@ def linregr_help_message(schema_madlib, message, **kwargs):
 USAGE
 -----------------------------------------------------------------------
 SELECT {schema_madlib}.linregr_train(
-    source_table,              -- name of input table
+    source_table,               -- name of input table
     out_table,                  -- name of output table
     dependent_varname,          -- name of dependent variable
-    independent_varname,        -- name of independent variable
+    independent_varname,        -- name of independent variables
     grouping_cols,              -- names of columns to group-by
-    heteroskedasticity_option,  -- perform heteroskedasticity test?
+    heteroskedasticity_option   -- perform heteroskedasticity test?
 );

 -----------------------------------------------------------------------
-OUTUPT
+OUTPUT
 -----------------------------------------------------------------------
-The output table ('out_table' above) has the following columns
-    <...>,                         -- Grouping columns used during training
-    'coef' DOUBLE PRECISION[],     -- Vector of coefficients
-    'r2' DOUBLE PRECISION,         -- R-squared coefficient
-    'std_err' DOUBLE PRECISION[],  -- Standard errors of coefficients
-    't_stats' DOUBLE PRECISION[],  -- t-stats of the coefficients
-    'p_values' DOUBLE PRECISION[], -- p-values of the coefficients
-    'condition_no' INTEGER,        -- The condition number of the covariance matrix.
-    'bp_stats' DOUBLE PRECISION,   -- The Breush-Pagan statistic of heteroskedacity.
-                                      (if heteroskedasticity_option=TRUE)
-    'bp_p_value' DOUBLE PRECISION  -- The Breush-Pagan calculated p-value.
-                                      (if heteroskedasticity_option=TRUE)
+The output table ('out_table' above) has the following columns:
+    <...>,                         -- Grouping columns used during training
+    'coef' DOUBLE PRECISION[],     -- Vector of coefficients
+    'r2' DOUBLE PRECISION,         -- R-squared coefficient
+    'std_err' DOUBLE PRECISION[],  -- Standard errors of coefficients
+    't_stats' DOUBLE PRECISION[],  -- t-stats of the coefficients
+    'p_values' DOUBLE PRECISION[], -- p-values of the coefficients
+    'condition_no' INTEGER,        -- The condition number of the covariance matrix.
+    'bp_stats' DOUBLE PRECISION,   -- The Breusch-Pagan heteroskedasticity statistic
+                                      (if heteroskedasticity_option=TRUE)
+    'bp_p_value' DOUBLE PRECISION, -- The Breusch-Pagan p-value
+                                      (if heteroskedasticity_option=TRUE)
+    'num_rows_processed' INTEGER,  -- Number of rows actually used in each group
+    'num_missing_rows_skipped' INTEGER -- Number of rows with NULLs skipped in each group
+
+A summary table is also created at the same time, which has:
+    'source_table' VARCHAR,            -- the data source table name
+    'out_table' VARCHAR,               -- the output table name
+    'dependent_varname' VARCHAR,       -- the dependent variable
+    'independent_varname' VARCHAR,     -- the independent variable
+    'num_rows_processed' INTEGER,      -- total number of rows used
+    'num_missing_rows_skipped' INTEGER -- total number of rows skipped because of NULL values
 """
 elif message in ['example', 'examples']:
     help_string = """

src/ports/postgres/modules/regress/logistic.py_in

Lines changed: 118 additions & 1 deletion

@@ -333,7 +333,7 @@ def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,
     if num_rows['num_rows_processed'] is None:
         num_rows['num_rows_processed'] = "NULL"
         num_rows['num_missing_rows_skipped'] = "NULL"
-
+
     args.update(num_rows)

     plpy.execute(

@@ -364,3 +364,120 @@ def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,

     plpy.execute("set client_min_messages to " + old_msg_level)
     return None
+
+# --------------------------------------------------------------------
+
+def logregr_help_msg(schema_madlib, message, **kwargs):
+    """ Help message for logistic regression
+
+    @param message A string, the help message indicator
+
+    Returns:
+      A string containing the help message
+    """
+    if message is None:
+        help_string = """
+----------------------------------------------------------------
+SUMMARY
+----------------------------------------------------------------
+Binomial logistic regression models the relationship between a
+dichotomous dependent variable and one or more predictor variables.
+
+The dependent variable may be a Boolean value or a categorical variable
+that can be represented with a Boolean expression.
+
+For more details on function usage:
+    SELECT {schema_madlib}.logregr_train('usage')
+
+For a small example on using the function:
+    SELECT {schema_madlib}.logregr_train('example')
+"""
+    elif message in ['usage', 'help', '?']:
+        help_string = """
+------------------------------------------------------------------
+USAGE
+------------------------------------------------------------------
+SELECT {schema_madlib}.logregr_train(
+    source_table,        -- name of input table
+    out_table,           -- name of output table
+    dependent_varname,   -- name of dependent variable
+    independent_varname, -- names of independent variables
+    grouping_cols,       -- optional, default NULL, names of columns to group-by
+    max_iter,            -- optional, default 20, maximum number of iterations
+    optimizer,           -- optional, default 'irls', name of optimization method
+    tolerance,           -- optional, default 0.0001, the stopping threshold
+    verbose              -- optional, default FALSE, whether to print useful info
+);
+
+------------------------------------------------------------------
+OUTPUT
+------------------------------------------------------------------
+The output table ('out_table' above) has the following columns:
+    <...>,                              -- Grouping column values used during training
+    'coef' double precision[],          -- vector of fitted coefficients
+    'log_likelihood' double precision,  -- log likelihood
+    'std_err' double precision[],       -- standard errors of the coefficients
+    'z_stats' double precision[],       -- z-statistics of the coefficients
+    'p_values' double precision[],      -- p-values of the coefficients
+    'odds_ratios' double precision[],   -- odds ratios, exp(coefficients)
+    'condition_no' double precision,    -- the condition number
+    'num_rows_processed' integer,       -- number of rows actually used per group
+    'num_missing_rows_skipped' integer, -- number of rows with NULLs skipped per group
+    'num_iterations' double precision   -- number of iterations used per group
+
+A summary table is also created at the same time, which has:
+    'source_table' varchar,             -- the data source table name
+    'out_table' varchar,                -- the output table name
+    'dependent_varname' varchar,        -- the dependent variable
+    'independent_varname' varchar,      -- the independent variable
+    'optimizer_params' varchar,         -- 'optimizer=..., max_iter=..., tolerance=...'
+    'num_all_groups' integer,           -- total number of groups
+    'num_failed_groups' integer,        -- number of groups whose fitting process failed
+    'num_rows_processed' integer,       -- total number of rows used in the computation
+    'num_missing_rows_skipped' integer  -- total number of rows skipped
+"""
+    elif message in ['example', 'examples']:
+        help_string = """
+CREATE TABLE patients (id INTEGER NOT NULL,
+                       second_attack INTEGER,
+                       treatment INTEGER,
+                       trait_anxiety INTEGER);
+COPY patients FROM STDIN WITH DELIMITER '|';
+  1 | 1 | 1 | 70
+  3 | 1 | 1 | 50
+  5 | 1 | 0 | 40
+  7 | 1 | 0 | 75
+  9 | 1 | 0 | 70
+ 11 | 0 | 1 | 65
+ 13 | 0 | 1 | 45
+ 15 | 0 | 1 | 40
+ 17 | 0 | 0 | 55
+ 19 | 0 | 0 | 50
+  2 | 1 | 1 | 80
+  4 | 1 | 0 | 60
+  6 | 1 | 0 | 65
+  8 | 1 | 0 | 80
+ 10 | 1 | 0 | 60
+ 12 | 0 | 1 | 50
+ 14 | 0 | 1 | 35
+ 16 | 0 | 1 | 50
+ 18 | 0 | 0 | 45
+ 20 | 0 | 0 | 60
+\.
+
+SELECT madlib.logregr_train('patients',
+                            'patients_logregr',
+                            'second_attack',
+                            'ARRAY[1, treatment, trait_anxiety]',
+                            NULL,
+                            20,
+                            'irls');
+
+SELECT * FROM patients_logregr;
+"""
+    else:
+        help_string = "No such option. Use {schema_madlib}.logregr_train()"
+
+    return help_string.format(schema_madlib=schema_madlib)
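A side note on `logregr_help_msg` above: every branch's help text is passed through `str.format(schema_madlib=...)`, so any literal `{` or `}` inside a help string would break formatting unless doubled. A small standalone illustration (plain Python, independent of MADlib; the brace-containing string is a made-up example):

```python
# A template with only the intended placeholder formats cleanly.
template = "For more details: SELECT {schema_madlib}.logregr_train('usage')"
print(template.format(schema_madlib='madlib'))

# A string with literal braces would be misread as a format field.
bad = "SELECT '{1,2,3}'::integer[]"
try:
    bad.format(schema_madlib='madlib')
except (KeyError, IndexError, ValueError):
    # Escape literal braces by doubling them before formatting.
    safe = bad.replace('{', '{{').replace('}', '}}')
    print(safe.format(schema_madlib='madlib'))
```

This is why help strings in these modules keep to the single `{schema_madlib}` placeholder and avoid raw braces.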

src/ports/postgres/modules/regress/logistic.sql_in

Lines changed: 18 additions & 1 deletion

@@ -10,7 +10,7 @@
 *
 *//* ----------------------------------------------------------------------- */

-m4_include(`SQLCommon.m4') --'
+m4_include(`SQLCommon.m4')

 /**
 @addtogroup grp_logreg

@@ -750,6 +750,23 @@ RETURNS VOID AS $$
 SELECT MADLIB_SCHEMA.logregr_train($1, $2, $3, $4, $5, $6, $7, $8, False);
 $$ LANGUAGE sql VOLATILE;

+-----------------------------------------------------------------------
+
+-- Help messages
+
+CREATE FUNCTION MADLIB_SCHEMA.logregr_train()
+RETURNS TEXT AS $$
+BEGIN
+    RETURN MADLIB_SCHEMA.logregr_train(NULL);
+END;
+$$ LANGUAGE plpgsql VOLATILE;
+
+CREATE FUNCTION MADLIB_SCHEMA.logregr_train(
+    message TEXT
+) RETURNS TEXT AS $$
+PythonFunction(regress, logistic, logregr_help_msg)
+$$ LANGUAGE plpythonu VOLATILE;
+
 ------------------------------------------------------------------------

 /**
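The SQL above registers two overloads: a zero-argument `logregr_train()` that simply forwards to the one-argument form called with NULL, so the bare call and the explicit-NULL call return the same summary text. The same forwarding idea, sketched in Python (the function body here is an illustrative stand-in, not the generated SQL):

```python
def logregr_train_help(message=None):
    """Sketch of the SQL overload pair: calling with no argument
    behaves exactly like calling with NULL (None here)."""
    if message is None:
        return "SUMMARY: binomial logistic regression; try 'usage' or 'example'"
    if message in ('usage', 'help', '?'):
        return "USAGE: SELECT madlib.logregr_train(source_table, out_table, ...)"
    return "No such option. Use madlib.logregr_train()"

# The zero-argument form and the explicit-NULL form agree:
assert logregr_train_help() == logregr_train_help(None)
```

Keeping the zero-argument overload as a thin forwarder means the dispatch logic lives in one place, the one-argument function.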

src/ports/postgres/modules/regress/marginal.py_in

Lines changed: 6 additions & 0 deletions

@@ -274,6 +274,9 @@ The output table ('output_table' above) has the following columns
     std_err DOUBLE PRECISION[], -- Standard errors using delta method
     z_stats DOUBLE PRECISION[], -- z-stats of the standard errors
     p_values DOUBLE PRECISION[], -- p-values of the standard errors
+
+The output summary table is the same as that of logregr_train(); see also:
+    SELECT logregr_train('usage');
 """
 else:
     help_string = "No such option. Use {schema_madlib}.margins_mlogregr()"

@@ -675,6 +678,9 @@ The output table ('output_table' above) has the following columns
     std_err DOUBLE PRECISION[], -- Standard errors using delta method
     z_stats DOUBLE PRECISION[], -- z-stats of the standard errors
     p_values DOUBLE PRECISION[], -- p-values of the standard errors
+
+The output summary table is the same as that of mlogregr_train(); see also:
+    SELECT mlogregr_train('usage');
 """
 else:
     help_string = "No such option. Use {schema_madlib}.margins_mlogregr()"

src/ports/postgres/modules/regress/multilogistic.py_in

Lines changed: 3 additions & 0 deletions

@@ -584,8 +584,11 @@ The output table ('output_table' above) has the following columns

 The output summary table is named as <'output_table'>_summary has the following columns
     source_table             -- VARCHAR, Source table name
+    out_table                -- VARCHAR, Output table name
     dep_var                  -- VARCHAR, Dependent variable name
     ind_var                  -- VARCHAR, Independent variable name
+    optimizer_params         -- VARCHAR, Optimizer parameters used
+    ref_category             -- INTEGER, The value of the reference category used
     num_rows_processed       -- INTEGER, Number of rows processed during training
     num_missing_rows_skipped -- INTEGER, Number of rows skipped during training due
                                 to missing values

src/ports/postgres/modules/regress/robust_linear.py_in

Lines changed: 3 additions & 0 deletions

@@ -93,6 +93,9 @@ The output table ('output_table' above) has the following columns:
     'std_err' DOUBLE PRECISION[], -- Huber-White standard errors
     'stats' DOUBLE PRECISION[], -- T-stats of the standard errors
     'p_values' DOUBLE PRECISION[] -- p-values of the standard errors
+
+The output summary table is the same as that of linregr_train(); see also:
+    SELECT linregr_train('usage');
 """
 else:
     help_string = "No such option. Use {schema_madlib}.robust_variance_linregr()"

src/ports/postgres/modules/regress/robust_logistic.py_in

Lines changed: 3 additions & 0 deletions

@@ -78,6 +78,9 @@ The output table ('output_table' above) has the following columns:
     'std_err' DOUBLE PRECISION[], -- Huber-White standard errors
     'stats' DOUBLE PRECISION[], -- Z-stats of the standard errors
     'p_values' DOUBLE PRECISION[] -- p-values of the standard errors
+
+The output summary table is the same as that of logregr_train(); see also:
+    SELECT logregr_train('usage');
 """
 else:
     help_string = "No such option. Use {schema_madlib}.robust_variance_linregr()"

src/ports/postgres/modules/regress/robust_mlogistic.py_in

Lines changed: 3 additions & 0 deletions

@@ -329,6 +329,9 @@ The output table ('out_table' above) has the following columns
     std_err DOUBLE PRECISION[], -- Huber-White standard errors
     z_stats DOUBLE PRECISION[], -- Z-stats of the standard errors
     p_values DOUBLE PRECISION[] -- p-values of the standard errors
+
+The output summary table is the same as that of mlogregr_train(); see also:
+    SELECT mlogregr_train('usage');
 """
 else:
     help_string = "No such option. Use {schema_madlib}.robust_variance_mlogregr()"
