1 /**
2  * Implementation of standard regularizers for Linear layer.
3  *
4  * Copyright: 2017 Netflix, Inc.
5  * License: $(LINK2 http://www.apache.org/licenses/LICENSE-2.0, Apache License Version 2.0)
6  */
7 module vectorflow.regularizers;
8 
9 private
10 {
11 import std.random : uniform;
12 
13 import vectorflow.layers;
14 import vectorflow.math;
15 import vectorflow.neurallayer;
16 import vectorflow.optimizers;
17 import vectorflow.utils;
18 }
19 
20 
21 abstract class LinearPrior
22 {
23     /// pointer to the layer weight matrix
24     float[][] W; 
25     /// pointer to the layer gradient matrix
26     float[][] grad;
27     /// pointer to the optimizer for this layer
28     SGDOptimizer opt;
29 
30     void register(Linear layer)
31     {
32         W = layer.W;
33         grad = layer.grad;
34         if(layer.optimizer is null)
35            return; 
36         if(auto o = cast(SGDOptimizer)(layer.optimizer))
37             opt = o;
38         else
39             throw new Exception("Only support SGDOptimizer type.");
40     }
41 
42     LinearPrior dup();
43 }
44 
45 abstract class ProxyLinearPrior : LinearPrior
46 {
47     abstract void proxy_step();
48 
49     override ProxyLinearPrior dup();
50 }
51 
52 abstract class AdditiveLinearPrior : LinearPrior
53 {
54     abstract void accumulate_grad();
55 
56     override AdditiveLinearPrior dup();
57 }
58 
59 
60 /***
61 Adds L2 regularization to a Linear layer.
62 Example:
63 -------------
64 // basic L2 regularization: loss = (0.03 / 2) * || W ||^2
65 auto l1 = Linear(5).prior(L2Prior(0.03));
66 
67 // same, but centered around a non-zero matrix: loss = (0.03 / 2) * || W - W_p ||^2
68 auto l2 = Linear(5).prior(L2Prior(0.03, W_p));
69 
70 // L2 regularization with 1 lambda per feature (diagonal Hessian 0-centered prior).
71 // Example when input dim is 5:
72 auto l3 = Linear(10).prior(L2Prior([0.01f, 0.02f, 0.01f, 0.015f, 0.005f]));
73 
74 // L2 regularization with 1 lambda per feature, centered around a non-zero matrix.
75 // Example when input dim is 5:
76 auto l4 = Linear(10).prior(L2Prior([0.01f, 0.02f, 0.01f, 0.015f, 0.005f], W_p));
77 -------------
78 */
79 class L2Prior : AdditiveLinearPrior
80 {
81     float _lambda;
82     float[] _lambdas;
83     float[][] W_prior;
84 
85     protected size_t _ind_start;
86 
87     void delegate() _accumulate_grad;
88 
89     this(float lambda)
90     {
91         _lambda = lambda;
92         _ind_start = 0;
93     }
94 
95     this(float lambda, float[][] W_prior_)
96     {
97         _lambda = lambda;
98         W_prior = W_prior_;
99         _ind_start = 0;
100     }
101 
102     this(float[] lambdas)
103     {
104         _lambdas = lambdas;
105         _ind_start = 0;
106     }
107 
108     this(float[] lambdas, float[][] W_prior_)
109     {
110         _lambdas = lambdas;
111         W_prior = W_prior_;
112         _ind_start = 0;
113     }
114 
115     mixin opCallNew;
116 
117     override void register(Linear layer)
118     {
119         super.register(layer);
120         if(W.length > 0 && W_prior.length == 0)
121         {
122             // default to prior centered on 0
123             W_prior = allocate_matrix_zero!float(W.length, W[0].length);
124         }
125         if(layer.with_intercept)
126             _ind_start = 1; // don't regularize intercept
127 
128         if(_lambdas.length == 0)
129             _accumulate_grad = &_acc_grad_scal;
130         else
131             _accumulate_grad = &_acc_grad_vec;
132     }
133 
134     version(LDC)
135     {
136         private import ldc.attributes;
137 
138         pragma(inline, true)
139         @fastmath static void l2op_scal(float a, float[] x, float[] x2,
140                 float[] y, size_t start) pure
141         {
142             for(auto i = start; i < x.length; ++i)
143                 y[i] += a * (x[i] - x2[i]);
144         }
145 
146         pragma(inline, true)
147         @fastmath static void l2op_vec(float[] a, float[] x, float[] x2,
148                 float[] y, size_t start) pure
149         {
150             for(auto i = start; i < x.length; ++i)
151                 y[i] += a[i] * (x[i] - x2[i]);
152         }
153 
154         void _acc_grad_scal()
155         {
156             foreach(k; 0..W.length)
157                 l2op_scal(_lambda, W[k], W_prior[k], grad[k], _ind_start);
158         }
159         
160         void _acc_grad_vec()
161         {
162             foreach(k; 0..W.length)
163                 l2op_vec(_lambdas, W[k], W_prior[k], grad[k], _ind_start);
164         }
165     }
166     else
167     {
168         void _acc_grad_scal()
169         {
170             foreach(k; 0..W.length)
171                 for(auto i = _ind_start; i < W[k].length; ++i)
172                     grad[k][i] += _lambda * (W[k][i] - W_prior[k][i]);
173         }
174 
175         void _acc_grad_vec()
176         {
177             foreach(k; 0..W.length)
178                 for(auto i = _ind_start; i < W[k].length; ++i)
179                     grad[k][i] += _lambdas[i] * (W[k][i] - W_prior[k][i]);
180         }
181     }
182 
183     override void accumulate_grad()
184     {
185         _accumulate_grad();
186     }
187 
188     override AdditiveLinearPrior dup()
189     {
190         L2Prior cp;
191         if(_lambdas.length == 0)
192             cp = new L2Prior(_lambda, W_prior);
193         else
194             cp = new L2Prior(_lambdas, W_prior);
195         cp._ind_start = _ind_start;
196         return cp;
197     }
198 }
199 
200 /***
201 Adds L1 regularization to a Linear layer.
202 
203 This is implemented as a proximal operator during SGD.
204 
205 Example:
206 -------------
207 // basic L1 regularization: loss = 0.03 * | W |
208 auto l1 = Linear(5).prior(L1Prior(0.03));
209 
210 // same, but centered around a non-zero matrix: loss = 0.03 * | W - W_p |
211 auto l2 = Linear(5).prior(L1Prior(0.03, W_p));
212 -------------
213 */
214 class L1Prior : ProxyLinearPrior
215 {
216     float _lambda;
217     float[][] W_prior;
218 
219     protected size_t _ind_start;
220 
221     this(float lambda)
222     {
223         _lambda = lambda;
224         _ind_start = 0;
225     }
226 
227     this(float lambda, float[][] W_prior_)
228     {
229         _lambda = lambda;
230         W_prior = W_prior_;
231         _ind_start = 0;
232     }
233 
234     mixin opCallNew;
235 
236     override void register(Linear layer)
237     {
238         super.register(layer);
239         if(W.length > 0 && W_prior.length == 0)
240         {
241             // default to prior centered on 0
242             W_prior = allocate_matrix_zero!float(W.length, W[0].length);
243         }
244         if(layer.with_intercept)
245             _ind_start = 1; // don't regularize intercept
246     }
247 
248     override void proxy_step()
249     {
250         foreach(k; 0..W.length)
251         {
252             for(auto i = _ind_start; i < W[k].length; ++i)
253             {
254                 if(W[k][i] > W_prior[k][i])
255                     W[k][i] = fmax(0, W[k][i] - W_prior[k][i] - _lambda * opt.current_lr(k,i));
256                 else
257                     W[k][i] = -fmax(0, -W[k][i] + W_prior[k][i] - _lambda * opt.current_lr(k,i));
258             }
259         }
260     }
261 
262     override ProxyLinearPrior dup()
263     {
264         auto cp = new L1Prior(_lambda, W_prior);
265         cp._ind_start = _ind_start;
266         return cp;
267     }
268 }
269 
270 
271 /***
272 Adds a positive constraint on the weights of a Linear layer.
273 
274 If the Linear layer has intercepts, the constraint won't apply to them.
275 
276 This is implemented as a proximal operator during SGD.
277 Example:
278 -------------
279 // force weights to be positive:
280 auto l1 = Linear(5).prior(PositivePrior());
281 
282 // force weights to be above 1e-3:
283 auto l2 = Linear(10).prior(PositivePrior(1e-3));
284 -------------
285 */
286 class PositivePrior : ProxyLinearPrior
287 {
288     protected size_t _ind_start;
289     protected float _eps;
290 
291     this()
292     {
293         _ind_start = 0;
294         _eps = 0;
295     }
296 
297     this(float eps)
298     {
299         _ind_start = 0;
300         _eps = eps;
301     }
302 
303     mixin opCallNew;
304 
305     override void register(Linear layer)
306     {
307         super.register(layer);
308         if(layer.with_intercept)
309             _ind_start = 1; // don't regularize intercept
310     }
311 
312     override void proxy_step()
313     {
314         foreach(k; 0..W.length)
315             for(auto i = _ind_start; i < W[k].length; ++i)
316                 W[k][i] = fmax(_eps, W[k][i]);
317     }
318 
319     override ProxyLinearPrior dup()
320     {
321         auto cp = new PositivePrior();
322         cp._eps = _eps;
323         cp._ind_start = _ind_start;
324         return cp;
325     }
326 }
327 
328 
329 
330 class RotationPrior : AdditiveLinearPrior
331 {
332     float _lambda;
333     ulong _num_draws;
334 
335     this(float lambda, ulong num_draws = 1)
336     {
337         _lambda = lambda;
338         _num_draws = num_draws;
339     }
340 
341     mixin opCallNew;
342 
343     override void accumulate_grad()
344     {
345         foreach(_; 0.._num_draws)
346         {
347             size_t i = uniform(0, W.length, RAND_GEN);
348             size_t j = uniform(0, W.length, RAND_GEN);
349 
350             auto ri = W[i];
351             auto rj = W[j];
352             float g = dotProd(ri, rj);
353             if(i != j)
354             {
355                 foreach(u; 0..W[i].length)
356                 {
357                     grad[i][u] += _lambda * g * rj[u];
358                     grad[j][u] += _lambda * g * ri[u];
359                 }
360             }
361             else
362             {
363                 g -= 1.0;
364                 foreach(u; 0..W[i].length)
365                     grad[i][u] += 2 * _lambda * g * ri[u];
366             }
367         }
368     }
369 
370     override AdditiveLinearPrior dup()
371     {
372         return new RotationPrior(_lambda, _num_draws);
373     }
374 }
375