This repository was archived by the owner on Dec 6, 2023. It is now read-only.

Commit 1cd41ce

committed
add adagrad solver for higher order factorization machines
1 parent 447bf02 commit 1cd41ce
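
For context, a higher-order factorization machine predicts <w, x> (when fit_linear is set) plus, for each of the n_components rows p_s of P, the degree-m ANOVA kernel A^m(p_s, x): the sum over all size-m feature subsets of the products p_{s,j} * x_j. The solver added in this commit evaluates that kernel and its gradient through _fast_anova_kernel_grad, using the (n_features + 1) x (degree + 1) table A allocated in the code below. The pure-Python sketch that follows only illustrates the kernel's dynamic program; anova_kernel_value is a hypothetical helper written for this note, not part of polylearn.

import numpy as np

def anova_kernel_value(p, x, degree):
    # ANOVA kernel A^degree(p, x): sum over all index subsets
    # {j1 < ... < j_degree} of the product p[j1]*x[j1] * ... * p[j_degree]*x[j_degree],
    # computed with an (n_features + 1) x (degree + 1) DP table.
    n_features = len(x)
    A = np.zeros((n_features + 1, degree + 1))
    A[:, 0] = 1.0  # the empty product
    for j in range(1, n_features + 1):
        for d in range(1, degree + 1):
            # either skip feature j, or add it on top of a degree-(d-1) term
            A[j, d] = A[j - 1, d] + p[j - 1] * x[j - 1] * A[j - 1, d - 1]
    return A[n_features, degree]

# tiny brute-force check for degree 2
p = np.array([0.5, -1.0, 2.0])
x = np.array([1.0, 3.0, 0.5])
brute = sum(p[i] * x[i] * p[j] * x[j]
            for i in range(3) for j in range(i + 1, 3))
assert np.isclose(anova_kernel_value(p, x, 2), brute)

The table-based recursion costs O(n_features * degree) per component, which is what keeps higher-degree factorization machines tractable compared to enumerating feature subsets.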

File tree

9 files changed: +25186, -1471 lines changed


polylearn/adagrad_fast.cpp

Lines changed: 23305 additions & 0 deletions
Some generated files are not rendered by default.

polylearn/adagrad_fast.pyx

Lines changed: 188 additions & 0 deletions
@@ -0,0 +1,188 @@
# cython: language_level=3
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True


from libc.math cimport sqrt
from lightning.impl.dataset_fast cimport RowDataset

cimport numpy as np
import numpy as np

from .kernels_fast cimport _fast_anova_kernel_grad
from .loss_fast cimport LossFunction


np.import_array()


cdef inline void sync(double* param,
                      unsigned int* last_seen,
                      double grad_norm,
                      double learning_rate,
                      double beta,
                      unsigned int t,
                      unsigned int* dt):
    # Lazily catch a coordinate up to round t: for each of the dt rounds in
    # which it was untouched, the regularization-only update would have
    # multiplied it by the same factor, so apply that factor raised to dt.
    cdef double sq, correction

    dt[0] = t - last_seen[0]  # NOTE: dt could be a local variable; is that more efficient?
    if dt[0] > 0:
        sq = sqrt(grad_norm)
        correction = sq / (learning_rate * beta + sq + 1e-6)
        param[0] *= correction ** dt[0]
        last_seen[0] = t


cdef inline void ada_update(double* param,
                            double* grad_norm,
                            unsigned int* last_seen,
                            double update,
                            double lp,
                            double learning_rate,
                            double beta,
                            unsigned int t):
    # One adagrad step for a single coordinate: `update` is the feature-side
    # derivative and `lp` the (negated) loss derivative.
    cdef double sq

    update *= lp

    grad_norm[0] += update ** 2
    sq = sqrt(grad_norm[0])

    # p <- (p * sq - lr * update) / (lr * beta + sq)
    param[0] *= sq
    param[0] -= learning_rate * update
    param[0] /= 1e-6 + sq + learning_rate * beta
    last_seen[0] = t + 1


def _fast_fm_adagrad(self,
                     double[::1] w,
                     double[:, ::1] P not None,
                     RowDataset X,
                     double[::1] y not None,
                     unsigned int degree,
                     double alpha,
                     double beta,
                     bint fit_linear,
                     LossFunction loss,
                     unsigned int max_iter,
                     double learning_rate,
                     callback,
                     int n_calls):

    cdef Py_ssize_t n_samples = X.get_n_samples()
    cdef Py_ssize_t n_components = P.shape[0]
    cdef Py_ssize_t n_features = P.shape[1]

    cdef bint has_callback = callback is not None

    cdef unsigned int it, t, dt
    cdef Py_ssize_t i, s, j, jj

    cdef double y_pred, lp

    # data pointers
    cdef double* data
    cdef int* indices
    cdef int n_nz

    # working memory and DP tables
    cdef double[:, ::1] P_grad_data
    cdef double[::1, :] A
    cdef double[::1, :] Ad

    # to avoid reallocating at every iteration, we allocate more than enough
    P_grad_data = np.empty_like(P)
    A = np.empty((n_features + 1, degree + 1), order='f')
    Ad = np.empty_like(A, order='f')

    # adagrad bookkeeping: accumulated squared gradients and, for the lazy
    # updates, the last round in which each coordinate was touched
    cdef double[::1] w_grad_norms
    cdef double[:, ::1] P_grad_norms
    cdef unsigned int[::1] w_last_seen
    cdef unsigned int[:, ::1] P_last_seen
    w_grad_norms = np.zeros_like(w)
    P_grad_norms = np.zeros_like(P)
    w_last_seen = np.zeros_like(w, dtype=np.uint32)
    P_last_seen = np.zeros_like(P, dtype=np.uint32)

    t = 0
    for it in range(max_iter):

        for i in range(n_samples):
            X.get_row_ptr(i, &indices, &data, &n_nz)

            y_pred = 0

            # catch up the coordinates touched by this (sparse) row
            if fit_linear:
                for jj in range(n_nz):
                    j = indices[jj]
                    sync(&w[j], &w_last_seen[j], w_grad_norms[j],
                         learning_rate, alpha, t, &dt)

            for s in range(n_components):
                for jj in range(n_nz):
                    j = indices[jj]
                    sync(&P[s, j], &P_last_seen[s, j], P_grad_norms[s, j],
                         learning_rate, beta, t, &dt)

            # compute predictions
            if fit_linear:
                for jj in range(n_nz):
                    j = indices[jj]
                    y_pred += w[j] * data[jj]

            for s in range(n_components):
                y_pred += _fast_anova_kernel_grad(A,
                                                  Ad,
                                                  P,
                                                  s,
                                                  indices,
                                                  data,
                                                  n_nz,
                                                  degree,
                                                  P_grad_data)

            # update
            lp = -loss.dloss(y[i], y_pred)

            if fit_linear:
                for jj in range(n_nz):
                    j = indices[jj]
                    ada_update(&w[j],
                               &w_grad_norms[j],
                               &w_last_seen[j],
                               data[jj],  # derivative wrt w[j] is x[j]
                               lp,
                               learning_rate,
                               alpha,
                               t)

            for s in range(n_components):
                for jj in range(n_nz):
                    j = indices[jj]
                    ada_update(&P[s, j],
                               &P_grad_norms[s, j],
                               &P_last_seen[s, j],
                               P_grad_data[s, jj],
                               lp,
                               learning_rate,
                               beta,
                               t)
            t += 1
        # end for n_samples

        if has_callback and it % n_calls == 0:
            ret = callback(self, it)
            if ret is not None:
                break
    # end for max_iter

    # finalize: bring every coordinate up to date before returning
    for j in range(n_features):
        sync(&w[j], &w_last_seen[j], w_grad_norms[j], learning_rate, alpha, t,
             &dt)
    for s in range(n_components):
        for j in range(n_features):
            sync(&P[s, j], &P_last_seen[s, j], P_grad_norms[s, j],
                 learning_rate, beta, t, &dt)
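
To summarize the two helpers at the top of adagrad_fast.pyx: ada_update performs the closed-form, l2-regularized AdaGrad step noted in its inline comment, p <- (p * sqrt(G) - lr * g) / (lr * beta + sqrt(G)) with G the accumulated squared gradient, and sync lazily catches up a coordinate that was untouched for dt rounds by applying the corresponding zero-gradient shrinkage factor raised to the power dt, which is exact because G does not grow while a coordinate receives no gradient. The NumPy sketch below mirrors this logic on plain floats and checks that equivalence; dense_ada_update and dense_sync are illustrative names, not functions from this commit.

import numpy as np

def dense_ada_update(p, G, g, lr, beta):
    # One AdaGrad step with an l2 penalty, per coordinate, mirroring ada_update:
    # p <- (p * sqrt(G) - lr * g) / (lr * beta + sqrt(G))
    G = G + g ** 2
    sq = np.sqrt(G)
    p = (p * sq - lr * g) / (1e-6 + sq + lr * beta)
    return p, G

def dense_sync(p, G, lr, beta, dt):
    # Catch-up for a coordinate untouched for dt rounds, mirroring sync():
    # the zero-gradient shrinkage factor, applied dt times at once.
    sq = np.sqrt(G)
    correction = sq / (lr * beta + sq + 1e-6)
    return p * correction ** dt

# The lazy catch-up equals dt explicit zero-gradient steps, because G does
# not change when the incoming gradient is zero.
p, G, lr, beta = 0.8, 4.0, 0.1, 2.0
stepped, G_run = p, G
for _ in range(3):
    stepped, G_run = dense_ada_update(stepped, G_run, 0.0, lr, beta)
assert np.isclose(stepped, dense_sync(p, G, lr, beta, dt=3))

This lazy scheme is what allows the main loop to touch only the coordinates appearing in each sparse row and to defer every other coordinate's shrinkage to the final catch-up pass at the end of _fast_fm_adagrad.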
