@@ -27,9 +27,64 @@ class LogisticRegressionConfig:
     )


-@entrypoint("scratchlgr")
+@entrypoint("scratchlgrsag")
 class LogisticRegression(SimpleModel):
+    r"""
+    Logistic Regression using the stochastic average gradient (SAG) optimizer

+
+    .. code-block:: console
+
+        $ cat > dataset.csv << EOF
+        f1,ans
+        0.1,0
+        0.7,1
+        0.6,1
+        0.2,0
+        0.8,1
+        EOF
+        $ dffml train \
+            -model scratchlgrsag \
+            -model-features f1:float:1 \
+            -model-predict ans:int:1 \
+            -sources f=csv \
+            -source-filename dataset.csv \
+            -log debug
+        $ dffml accuracy \
+            -model scratchlgrsag \
+            -model-features f1:float:1 \
+            -model-predict ans:int:1 \
+            -sources f=csv \
+            -source-filename dataset.csv \
+            -log debug
+        1.0
+        $ echo -e 'f1,ans\n0.8,0\n' | \
+          dffml predict all \
+            -model scratchlgrsag \
+            -model-features f1:float:1 \
+            -model-predict ans:int:1 \
+            -sources f=csv \
+            -source-filename /dev/stdin \
+            -log debug
+        [
+            {
+                "extra": {},
+                "features": {
+                    "ans": 0,
+                    "f1": 0.8
+                },
+                "last_updated": "2020-03-19T13:41:08Z",
+                "prediction": {
+                    "ans": {
+                        "confidence": 1.0,
+                        "value": 1
+                    }
+                },
+                "key": "0"
+            }
+        ]
+
+    """
     # The configuration class needs to be set as the CONFIG property
     CONFIG = LogisticRegressionConfig
     # Logistic Regression only supports training on a single feature
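
For comparison with the console walkthrough in the docstring, the same train/accuracy/predict flow can be driven from Python. This is a minimal sketch, not part of the diff: it assumes the model is installed under the ``scratchlgrsag`` entrypoint, accepts the ``features``/``predict`` config shown above, and that the ``dffml.noasync`` helpers from the project quickstart are available (older releases spelled ``Feature`` as ``DefFeature``); the ``myslr`` import path is hypothetical.

.. code-block:: python

    from dffml import Feature, Features
    from dffml.noasync import train, accuracy, predict

    # Hypothetical import path; point this at wherever the module lives
    from myslr import LogisticRegression

    model = LogisticRegression(
        features=Features(Feature("f1", float, 1)),
        predict=Feature("ans", int, 1),
    )

    train(model, "dataset.csv")
    print("Accuracy:", accuracy(model, "dataset.csv"))
    for i, features, prediction in predict(model, {"f1": 0.8}):
        print(prediction["ans"]["value"])
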
@@ -50,16 +105,15 @@ def separating_line(self):
         return self.storage.get("separating_line", None)

     @separating_line.setter
-    def separating_line(self, rline):
+    def separating_line(self, sline):
         """
         Set separating_line in self.storage so it will be saved to disk
         """
-        self.storage["separating_line"] = rline
+        self.storage["separating_line"] = sline

     def predict_input(self, x):
         """
-        Use the regression
-        line to make a prediction by returning ``m * x + b``.
+        Predict a class with the trained line: 1 if ``w * x + b > 0.5``, else 0
         """
         prediction = self.separating_line[0] * x + self.separating_line[1]
         if prediction > 0.5:
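
Taken on its own, the rule ``predict_input`` applies is just a threshold on the line's output. A minimal sketch (not part of the diff), with ``(w, b)`` standing in for the stored ``separating_line``:

.. code-block:: python

    def decide(w: float, b: float, x: float) -> int:
        # Mirrors predict_input: class 1 when the line's output clears 0.5
        return 1 if w * x + b > 0.5 else 0

    assert decide(1.2, 0.0, 0.8) == 1  # illustrative w: 1.2 * 0.8 = 0.96 > 0.5
    assert decide(1.2, 0.0, 0.1) == 0  # 1.2 * 0.1 = 0.12 <= 0.5
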
@@ -73,24 +127,30 @@ def predict_input(self, x):
         )
         return prediction

-    def best_fit_line(self):
+    def best_separating_line(self):
+        """
+        Determine the best separating line (here, a single scalar weight
+        ``w``) such that ``w * x + b`` separates the classes about the
+        0.5 threshold
+        """
         self.logger.debug(
             "Number of input records: {}".format(len(self.xData))
         )
-        x = self.xData
-        y = self.yData
-        learning_rate = 0.01
-        w = 0.01
-        b = 0.0
-        for _ in range(1, 1500):
+        x = self.xData  # feature array
+        y = self.yData  # class (label) array
+        learning_rate = 0.01  # step size for the update: w -= lr * gradient
+        w = 0.01  # initial weight
+        b = 0.0  # the data is assumed unbiased, so b stays 0
+        # Training loop: 1500 epochs
+        for _ in range(0, 1500):
             z = w * x + b
             val = -np.multiply(y, z)
             num = -np.multiply(y, np.exp(val))
             den = 1 + np.exp(val)
-            f = num / den
-            gradJ = np.sum(x * f)
-            w = w - learning_rate * gradJ / len(x)
-        error = 0
+            f = num / den  # per-record gradient dJ/dz
+            gradJ = np.sum(x * f)  # total gradient dJ/dw
+            w = w - learning_rate * gradJ / len(x)  # averaged gradient step
+        # Accuracy calculation
+        error = 0  # count of misclassified records
         for x_id in range(len(x)):
             yhat = x[x_id] * w + b > 0.5
             if yhat:
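
To make the math above easier to check in isolation, here is a standalone sketch (not part of the diff) of the same gradient step for the logistic loss ``J(w) = sum(log(1 + exp(-y * w * x)))``, using the toy dataset from the docstring. Note this formulation conventionally assumes labels in ``{-1, +1}``; with ``{0, 1}`` labels, records where ``y == 0`` contribute zero gradient.

.. code-block:: python

    import numpy as np

    def gradient_step(w, x, y, learning_rate=0.01):
        z = w * x  # raw scores (b = 0, as in the model)
        val = -y * z
        f = -y * np.exp(val) / (1 + np.exp(val))  # dJ/dz per record
        gradJ = np.sum(x * f)  # dJ/dw summed over records
        return w - learning_rate * gradJ / len(x)  # averaged descent step

    w = 0.01
    x = np.array([0.1, 0.7, 0.6, 0.2, 0.8])
    y = np.array([0, 1, 1, 0, 1])
    for _ in range(1500):
        w = gradient_step(w, x, y)
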
@@ -113,7 +173,7 @@ async def train(self, sources: Sources):
             self.yData = np.append(
                 self.yData, feature_data[self.config.predict.NAME]
             )
-        self.separating_line = self.best_fit_line()
+        self.separating_line = self.best_separating_line()

     async def accuracy(self, sources: Sources) -> Accuracy:
         # Ensure the model has been trained before we try to make a prediction
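
For reference, ``train()`` above accumulates records with ``np.append``, which returns a new, grown array on every call. A minimal sketch (not part of the diff) of that pattern with the docstring's toy data:

.. code-block:: python

    import numpy as np

    xData = np.array([])
    yData = np.array([])
    # Each iteration copies the old array and appends one record's values
    for f1, ans in [(0.1, 0), (0.7, 1), (0.6, 1)]:
        xData = np.append(xData, f1)
        yData = np.append(yData, ans)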