CluStR/reglib.py at master · sweverett/CluStR · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
''' Regression library for CluStR '''

#import rpy2.robjects as robjects
#from rpy2.robjects.packages import importr
import numpy as np
from linmix import linmix

# Imports the necessary R packages needed to run lrgs in python
#RLRGS = importr('lrgs')  # Multivariate regression package by Adam Mantz

# Set some aliases for useful R functions
#RARRAY = robjects.r('array')
#RMATRIX = robjects.r('matrix')
#RNORM = robjects.r('rnorm')
#RC = robjects.r('c')
#RLM = robjects.r('lm')

# pylint: disable=invalid-name

def run_lrgs(x, y, err_x, err_y, _xycov=None, nmc=500, dirichlet=True):
    '''
    Runs the lrgs regression algorithm written in R by interfacing through
    rpy2. For our purposes, inputs should be in scaled (log) form. (For the
    moment, only works for on-diagonal elements of the covariance matrix.) nmc
    is the length of the markov chain.
    '''

    # pylint: disable = too-many-arguments
    # pylint: disable = too-many-locals

    # Make sure dimensions are correct
    assert np.size(x) == np.size(y)
    assert np.size(err_x) == np.size(err_y)
    assert np.size(x) == np.size(err_x)

    # Convert x and y to r vectors
    rx = robjects.FloatVector(x)
    ry = robjects.FloatVector(y)
    rx_err = robjects.FloatVector(err_x)
    ry_err = robjects.FloatVector(err_y)

    # Set up covariance matrix
    M = RARRAY(0.0, dim=RC(2, 2, np.size(rx)))

    for i in range(np.size(rx)):
        M.rx[1, 1, i+1] = rx_err[i]
        M.rx[2, 2, i+1] = ry_err[i]

    # Set some R equivalents
    TRUE = robjects.BoolVector([True])
    FALSE = robjects.BoolVector([False])

    if dirichlet:
        d = TRUE
    else:
        d = FALSE

    # Run MCMC
    posterior = RLRGS.Gibbs_regression(rx, ry, M, nmc, dirichlet=d,
                                       trace='bsg', mention_every=50)

    # Extract relevant data from posterior
    B = np.array(posterior[0])  # Parameter chain
    S = np.array(posterior[1])[0][0]
    # ^ Scatter chain (only intrinsic scatter for the moment!)

    # Prepare lrgs fit chains
    intercept = B[0][0]
    slope = B[1][0]
    sigma = np.sqrt(S)

    # Return fit parameters consistently with run_linmix
    return (intercept, slope, sigma)

def run_linmix(x, y, err_x, err_y, Nmin=5000, Nmax=10000, vb=True, delta=None):
    # pylint: disable = too-many-arguments
    ''' Runs the Kelly regression algorithm through the package linmix.'''

    ''' For convenience, here are the linmix arguments:

        Linmix Args:
            x(array_like): The observed independent variable.
            y(array_like): The observed dependent variable.
            xsig(array_like): 1-sigma measurement errors in x.
            ysig(array_like): 1-sigma measurement errors in y.
            xycov(array_like): Covariance between the measurement errors in x
                               and y.
            delta(array_like): Array indicating whether a data point is
                               censored (i.e., not detected), or not.
                               If delta[i] == 1, then the ith source is
                               detected. If delta[i] == 0, then the ith source
                               is not detected and y[i] will be interpreted as
                               an upper limit. Note that if there are censored
                               data points, then the maximum-likelihood
                               estimate (alpha, beta, sigsqr) is not valid. By
                               default, all data points are assumed to be
                               detected.
            K(int): The number of Gaussians to use in the mixture model
                    for the distribution of xi.
            nchains(int): The number of Monte Carlo Markov Chains to
                          instantiate.
    '''


    # Make sure dimensions are correct
    assert np.size(x) == np.size(y)
    assert np.size(err_x) == np.size(err_y)
    assert np.size(x) == np.size(err_x)

    L = np.size(x)

    # Run linmix MCMC
    xycov = np.zeros(L)
    model = linmix.LinMix(x, y, err_x, err_y, xycov, delta, 2, 2)
    model.run_mcmc(Nmin, Nmax, silent=vb)

    # return intercept, slope, intrinsic scatter
    intercept = model.chain['alpha']
    slope = model.chain['beta']
    sigma = np.sqrt(model.chain['sigsqr'])

    # Return fit parameters consistently with run_lrgs
    return (intercept, slope, sigma)


def check_convergence(_intercept, _slope, _sigma):
    '''
    FIX: In future, should implement a function that checks the convergence of
    the MCMC using autocorrelation, etc. and display plots/statistics.
    '''
    pass