@@ -333,7 +333,7 @@ def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,
333333 if num_rows['num_rows_processed'] is None:
334334 num_rows['num_rows_processed'] = "NULL"
335335 num_rows['num_missing_rows_skipped'] = "NULL"
336-
336+
337337 args.update(num_rows)
338338
339339 plpy.execute(
@@ -364,3 +364,120 @@ def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,
364364
365365 plpy.execute("set client_min_messages to " + old_msg_level)
366366 return None
367+
368+ # --------------------------------------------------------------------
369+
def logregr_help_msg(schema_madlib, message, **kwargs):
    """ Help message for logistic regression

    @param schema_madlib A string, the schema name substituted for every
                         {schema_madlib} placeholder in the help text
    @param message A string, the help message indicator:
                   None or ''             -- short summary
                   'usage', 'help', '?'   -- call signature and output columns
                   'example', 'examples'  -- a small worked example
                   anything else          -- a "No such option" hint
    @param kwargs Accepted for call compatibility; not used

    Returns:
        A string, contains the help message
    """
    # Treat an empty string like a missing argument so that both select the
    # summary instead of the "No such option" fallback.
    if not message:
        help_string = """
        ----------------------------------------------------------------
        SUMMARY
        ----------------------------------------------------------------
        Binomial logistic regression models the relationship between a
        dichotomous dependent variable and one or more predictor variables.

        The dependent variable may be a Boolean value or a categorical variable
        that can be represented with a Boolean expression.

        For more details on function usage:
        SELECT {schema_madlib}.logregr_train('usage')

        For a small example on using the function:
        SELECT {schema_madlib}.logregr_train('example')
        """
    elif message in ['usage', 'help', '?']:
        help_string = """
        ------------------------------------------------------------------
        USAGE
        ------------------------------------------------------------------
        SELECT {schema_madlib}.logregr_train(
        source_table, -- name of input table
        out_table, -- name of output table
        dependent_varname, -- name of dependent variable
        independent_varname, -- names of independent variables
        grouping_cols, -- optional, default NULL, names of columns to group-by
        max_iter, -- optional, default 20, maximum iteration number
        optimizer, -- optional, default 'irls', name of optimization method
        tolerance, -- optional, default 0.0001, the stopping threshold
        verbose -- optional, default FALSE, whether to print useful info
        );

        ------------------------------------------------------------------
        OUTPUT
        ------------------------------------------------------------------
        The output table ('out_table' above) has the following columns:
        <...>, -- Grouping column values used during training
        'coef', double precision[], -- vector of fitting coefficients
        'log_likelihood', double precision, -- log likelihood
        'std_err', double precision[], -- vector of standard errors of the fitting coefficients
        'z_stats', double precision[], -- vector of the z-statistics of the coefficients
        'p_values', double precision[], -- vector of the p values
        'odds_ratios', double precision[], -- vector of odds ratios, exp(coefficients)
        'condition_no', double precision, -- the condition number
        'num_rows_processed', integer, -- how many rows are actually used in the computation
        'num_missing_rows_skipped', integer, -- number of rows that contain NULL and were skipped per group
        'num_iterations' double precision -- how many iterations are used in the computation per group

        A summary table is also created at the same time, which has:
        'source_table' varchar, -- the data source table name
        'out_table' varchar, -- the output table name
        'dependent_varname' varchar, -- the dependent variable
        'independent_varname' varchar, -- the independent variable
        'optimizer_params' varchar, -- 'optimizer=..., max_iter=..., tolerance=...'
        'num_all_groups' integer, -- how many groups
        'num_failed_groups' integer, -- how many groups' fitting processes failed
        'num_rows_processed' integer, -- total number of rows used in the computation
        'num_missing_rows_skipped' integer -- total number of rows skipped
        """
    elif message in ['example', 'examples']:
        # The COPY terminator is written as an escaped backslash so the
        # emitted text still contains a backslash followed by a dot without
        # relying on an invalid escape sequence. The training call uses the
        # {schema_madlib} placeholder, like the other branches, so the
        # example matches the schema MADlib is actually installed in.
        help_string = """
        CREATE TABLE patients( id INTEGER NOT NULL,
        second_attack INTEGER,
        treatment INTEGER,
        trait_anxiety INTEGER);
        COPY patients FROM STDIN WITH DELIMITER '|';
        1 | 1 | 1 | 70
        3 | 1 | 1 | 50
        5 | 1 | 0 | 40
        7 | 1 | 0 | 75
        9 | 1 | 0 | 70
        11 | 0 | 1 | 65
        13 | 0 | 1 | 45
        15 | 0 | 1 | 40
        17 | 0 | 0 | 55
        19 | 0 | 0 | 50
        2 | 1 | 1 | 80
        4 | 1 | 0 | 60
        6 | 1 | 0 | 65
        8 | 1 | 0 | 80
        10 | 1 | 0 | 60
        12 | 0 | 1 | 50
        14 | 0 | 1 | 35
        16 | 0 | 1 | 50
        18 | 0 | 0 | 45
        20 | 0 | 0 | 60
        \\.

        SELECT {schema_madlib}.logregr_train( 'patients',
        'patients_logregr',
        'second_attack',
        'ARRAY[1, treatment, trait_anxiety]',
        NULL,
        20,
        'irls'
        );

        SELECT * from patients_logregr;
        """
    else:
        help_string = "No such option. Use {schema_madlib}.logregr_train()"

    # Every branch may contain the {schema_madlib} placeholder; resolve it
    # once here.
    return help_string.format(schema_madlib=schema_madlib)
483+
0 commit comments