/**
 Implementation of different stochastic optimizers.

 The default parallelization strategy over the cores is
 $(LINK2 https://people.eecs.berkeley.edu/~brecht/papers/hogwildTR.pdf, Hogwild!).
 This is a lock-free strategy in which race conditions do occur, so training a
 network is non-deterministic as soon as more than one core is involved.
 Hogwild! works as long as the data access pattern is sparse enough: with too
 few dense parameters to learn and too many cores, the optimization can fail.
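
 For example, an illustrative multi-core training call (layer sizes, loss and
 data range are placeholders):
 -----------------
 auto nn = NeuralNet()
     .stack(SparseData(1000))
     .stack(Linear(1));

 // 4 cores means 4 Hogwild! workers sharing the same weights, so two runs
 // will generally not produce bit-identical models:
 nn.learn(data, "logistic", AdaGrad(10, 0.1, 500), true, 4);
 -----------------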

 * Copyright: 2017 Netflix, Inc.
 * License: $(LINK2 http://www.apache.org/licenses/LICENSE-2.0, Apache License Version 2.0)
 */
module vectorflow.optimizers;


private
{
import core.thread : Thread;
import core.time : MonoTime;
import std.algorithm : map, max, reduce;
import std.conv : text, to;
import std.format;
import std.meta : Filter, staticSort;
import std.parallelism : TaskPool;
import std.random;
import std.range : evenChunks;
import std.range.interfaces : ForwardRange;
import std.range.primitives : ElementType, isForwardRange;
import std.stdio;
import std.string : lastIndexOf;
import std.variant;
import std.traits : isNumeric;

import vectorflow.monitor;
import vectorflow.neurallayer;
import vectorflow.neuralnet;
import vectorflow.layers;
import vectorflow.regularizers;
import vectorflow.utils;
import vectorflow.math : fabs, dotProd, sqrt;
}


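/**
 Minimal interface a stochastic optimizer has to implement.

 `register` binds the optimizer to the parameters of a layer, `update` applies
 a gradient (dense or sparse) backpropagated to that layer, and `dup` returns
 a fresh copy (used when a net is duplicated, e.g. for multi-core training).
*/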
interface Optimizer {

    void register(NeuralLayer layer);

    void update(NeuralLayer layer, float[] ext_grad);
    void update(NeuralLayer layer, SparseF[] ext_grad);

    Optimizer dup();
}


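/**
 Base class for stochastic gradient optimizers.

 It holds the common hyper-parameters (number of epochs, mini-batch size,
 learning rate) and implements the `learn` driver which iterates over the
 dataset, single-core or multi-core (Hogwild!).
*/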
class SGDOptimizer : Optimizer {

    ulong num_epochs;
    ulong mini_batch_sz;
    float lr;
    ulong cnt;

    AdditiveLinearPrior[] priors;
    ProxyLinearPrior prox;

    this(ulong epochs, ulong mini_batch_sz_, float lr_)
    {
        num_epochs = epochs;
        mini_batch_sz = mini_batch_sz_;
        lr = lr_;
    }

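    /**
     Train the network over the data, for `num_epochs` passes.

     `grad_f` is the loss callback: given the net output (or the net itself)
     and the current example, it populates `grad` with the gradient of the
     loss with respect to the net output, and can return the loss value so
     that it gets reported by the monitor.

     A minimal sketch of such a callback for a squared loss, assuming a
     hypothetical observation type `Obs` with a `features` array and a
     `label` (in practice this function is driven through `NeuralNet.learn`):
     -----------------
     struct Obs { float label; float[] features; }

     auto sq_grad = delegate float(float[] net_out, ref Obs o, ref float[] grads)
     {
         auto err = net_out[0] - o.label;
         grads[0] = err;            // d(0.5 * err^2) / d(net_out[0])
         return 0.5f * err * err;   // loss value, aggregated by the monitor
     };
     -----------------
    */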
    void learn(D, T, V, R, S)(NeuralNet nn, D data,
                     S delegate(R net_out, ref T ex, ref V[] grad) grad_f,
                     bool verbose, uint num_cores)
        if(isForwardRange!D
                && (is(V == float) || is(V == SparseF))
                && (is(R == float[]) || is(R == NeuralNet))
                && (isNumeric!S || is(S == void)))
    {
        enum Comp(string F1, string F2) = F1 < F2;
        alias feats_fields = staticSort!(
            Comp, Filter!(isFeaturesField, __traits(allMembers, T)));

        auto start_time = MonoTime.currTime;

        auto monitor = new SGDMonitor(verbose, num_epochs, num_cores,
                start_time, isNumeric!S);

        void _learn(U)(NeuralNet net, U d, ulong n_passes, ulong core_id)
        {
            foreach(l; net.layers)
                l.pre_learning();

            // seed rng per thread with a different seed
            RAND_GEN.seed(42 + cast(uint)Thread.getThis.id);
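            // output-gradient buffer handed to the loss callback, dense or
            // sparse depending on V: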
            static if(is(V == float))
            {
                auto grads = new V[nn.output.length];
                grads[] = 0;
            }
            static if(is(V == SparseF))
            {
                V[] grads;
                grads.length = nn.output.length;
            }

            ulong sum_num_features = 0;
            ulong tmp_cnt = 0;
            double sum_loss = 0.0;
            foreach(p; 0..num_epochs)
            {
                cnt = 0;
                foreach(v; d)
                {
                    auto preds = net.predict(v);
                    foreach(root_id, field; feats_fields)
                        sum_num_features += mixin("v." ~ field ~ ".length");
                    tmp_cnt++;

                    static if(is(R == float[]))
                    {
                        static if(isNumeric!S)
                            sum_loss += grad_f(preds, v, grads);
                        else
                            grad_f(preds, v, grads);
                    }
                    else
                    {
                        static if(isNumeric!S)
                            sum_loss += grad_f(net, v, grads);
                        else
                            grad_f(net, v, grads);
                    }

                    cnt++;
                    net.backward_prop(grads);

                    if(cnt % 100_000 == 0)
                    {
                        synchronized(this)
                            monitor.progress_callback(
                                core_id, p, tmp_cnt, sum_num_features, sum_loss);
                        sum_num_features = 0;
                        tmp_cnt = 0;
                        static if(isNumeric!S)
                            sum_loss = 0;
                    }

                }
                // flush last mini batch
                if(cnt % mini_batch_sz != 0)
                {
                    cnt = mini_batch_sz;
                    net.backward_prop(grads);
                }
                synchronized(this)
                    monitor.progress_callback(
                        core_id, p+1, tmp_cnt, sum_num_features, sum_loss);
                sum_num_features = 0;
                tmp_cnt = 0;
                static if(isNumeric!S)
                    sum_loss = 0;
            }
            foreach(l; net.layers)
                l.post_learning();

        }

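        // single core: train in place on nn; otherwise spawn one replica per
        // core, all replicas sharing the same parameter storage (Hogwild!)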
        if(num_cores == 1)
            _learn(nn, data, num_epochs, 0);
        else
        {
            auto chunks = data.evenChunks(num_cores);
            NeuralNet[] nets;
            foreach(i; 0..num_cores)
            {
                auto cp = nn.dup(true);
                foreach(l; cp.layers)
                {
                    l.allocate_interface();
                    l.allocate_grad_params();
                }
                cp.share_params(nn);
                foreach(l; cp.layers)
                    if(l.optimizer)
                        l.optimizer.register(l);
                nets ~= cp;
            }

            auto pool = new TaskPool(num_cores);
            foreach(i, chunk; pool.parallel(chunks))
            {
                auto net = nets[i];
                _learn(net, chunk, num_epochs, i);
            }
            pool.stop();
        }
        monitor.wrap_up();
    }

    abstract float current_lr(ulong k, ulong j);

    abstract void register(NeuralLayer layer);

    abstract void update(NeuralLayer layer, float[] ext_grad);
    abstract void update(NeuralLayer layer, SparseF[] ext_grad);
    abstract Optimizer dup();

    override string toString()
    {
        auto fullname = this.classinfo.name;
        auto classname = fullname[fullname.lastIndexOf('.')+1..$];
        return "opt." ~ classname ~
            "[epochs:" ~ num_epochs.to!string ~
            ", mini_batch_sz:" ~ mini_batch_sz.to!string ~
            ", lr:" ~ lr.to!string ~
            "]";
    }
}


/**
 AdaGrad stochastic optimizer.

 See $(LINK2 http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf,
 Adaptive Subgradient Methods for Online Learning and Stochastic Optimization)
Examples:
-----------------

auto nn = NeuralNet()
    .stack(SparseData(1000))
    .stack(Linear(1));

// AdaGrad learning with 5 epochs, learning rate 0.1, mini-batch size of 100:
nn.learn(data, "square", AdaGrad(5, 0.1, 100));
-----------------
 *
 */
class AdaGrad : SGDOptimizer {

    // references
    float[][] W;
    float[][] grad;

    // local variables
    float eps;
    float[][] acc_grad;

    void delegate(NeuralLayer, float[]) _update;

    this(ulong num_epochs, float lr, ulong mini_batch_sz = 1, float eps_ = 1e-8)
    {
        super(num_epochs, mini_batch_sz, lr);
        eps = eps_;
        _update = &update_general!float;
    }
    mixin opCallNew;

    override void register(NeuralLayer layer)
    {
        if(auto l = cast(Linear)layer)
        {
            params_to_optimize(l.W, l.grad);
            priors = l.priors;
            foreach(p; priors)
                p.register(l);
            prox = l.prox;
            if(prox !is null)
                prox.register(l);
        }
        else if(layer.learnable)
            throw new Exception(
                "AdaGrad solver cannot optimize this layer type: " ~
                layer.to!string);
    }

    void params_to_optimize(float[][] W_, float[][] grad_)
    {
        W = W_;
        grad = grad_;
        acc_grad.length = W.length;
        foreach(k; 0..acc_grad.length)
        {
            acc_grad[k].length = W[k].length;
            acc_grad[k][] = 0;
            grad[k][] = 0;
        }
    }

    override void update(NeuralLayer layer, float[] ext_grad)
    {
        _update(layer, ext_grad);
    }

    override void update(NeuralLayer layer, SparseF[] ext_grad)
    {
        update_general!SparseF(layer, ext_grad);
    }

    void update_general(V)(NeuralLayer layer, V[] ext_grad)
        if ((is(V == float) || is(V == SparseF)))
    {
        layer.accumulate_grad(ext_grad);
        cnt++;
        if(cnt % mini_batch_sz == 0)
        {
            foreach(p; priors)
                p.accumulate_grad();
            update_matrix(W, grad, acc_grad, lr, eps);
            if(prox !is null)
                prox.proxy_step();
            foreach(k; 0..grad.length)
                grad[k][] = 0;
            cnt = 0;
        }
    }

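    // effective per-coordinate step size: lr / sqrt(eps + sum of the squared
    // gradients accumulated so far for that coordinate)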
    pragma(inline, true)
    override float current_lr(ulong k, ulong j)
    {
        return lr / sqrt(eps + acc_grad[k][j]);
    }

    version(LDC)
    {
        private import ldc.attributes;

        pragma(inline, true)
        @fastmath static void ada_op1(float[] row, float[] g) pure
        {
            for(int i = 0; i < row.length; ++i)
                row[i] += g[i] * g[i];
        }

        pragma(inline, true)
        @fastmath static void ada_op2(float[] row, float[] g, float[] sg,
                float lr_, float eps_) pure
        {
            for(int i = 0; i < row.length; ++i)
            {
                // reduces cache invalidation across cores
                // since we're writing back to the shared weights
                if(fabs(g[i]) > 0) // LLVM is smart and will vec this to ucomiss
                    row[i] -= lr_ * g[i] / sqrt(eps_ + sg[i]);
            }
        }
    }

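    // AdaGrad update, for every coordinate i of every row:
    //   sum_g_sq[i] += g[i]^2
    //   w[i]        -= lr * g[i] / sqrt(eps + sum_g_sq[i])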
    pragma(inline, true)
    final static void update_matrix(
            float[][] w, float[][] g, float[][] sum_g_sq,
            float lr_, float eps_) pure
    {
        foreach(k; 0..w.length)
        {
            auto row_W = w[k];
            auto row_g = g[k];
            auto row_sum_g_sq = sum_g_sq[k];

            version(LDC)
            {
                ada_op1(row_sum_g_sq, row_g);
                ada_op2(row_W, row_g, row_sum_g_sq, lr_, eps_);
            }
            else
            {
                foreach(i; 0..row_W.length)
                {
                    row_sum_g_sq[i] += row_g[i] * row_g[i];
                    row_W[i] -= lr_ * row_g[i] / sqrt(eps_ + row_sum_g_sq[i]);
                }
            }
        }
    }

    override AdaGrad dup()
    {
        return new AdaGrad(num_epochs, lr, mini_batch_sz, eps);
    }

    override string toString()
    {
        return super.toString ~ "[eps:" ~ eps.to!string ~ "]";
    }
}


/**
 ADAM stochastic optimizer.

 See $(LINK2 https://arxiv.org/pdf/1412.6980.pdf, ADAM: A method for stochastic optimization)
Examples:
-----------------

auto nn = NeuralNet()
    .stack(DenseData(200))
    .stack(Linear(10));

// ADAM learning with 5 epochs, learning rate 0.1, mini-batch size of 100:
nn.learn(data, "multinomial", ADAM(5, 0.1, 100));
-----------------
 *
 */
class ADAM : SGDOptimizer {

    // references
    float[][] W;
    float[][] grad;

    // local variables
    float eps;

    float beta1_0;
    float beta2_0;
    float beta1;
    float beta2;

    float[][] M;
    float[][] S;

    this(ulong num_epochs, float lr, ulong mini_batch_sz = 1,
         float eps_ = 1e-8, float beta1_0_ = 0.9, float beta2_0_ = 0.999)
    {
        super(num_epochs, mini_batch_sz, lr);
        // See original ADAM paper for definition of these parameters:
        eps = eps_;
        beta1_0 = beta1_0_;
        beta2_0 = beta2_0_;
    }
    mixin opCallNew;

    override void register(NeuralLayer layer)
    {
        if(auto l = cast(Linear)layer)
        {
            params_to_optimize(l.W, l.grad);
            priors = l.priors;
            foreach(p; priors)
                p.register(l);
            prox = l.prox;
            if(prox !is null)
                prox.register(l);
        }
    }

    void params_to_optimize(float[][] W_, float[][] grad_)
    {
        W = W_;
        grad = grad_;
        M.length = W.length;
        S.length = W.length;
        foreach(k; 0..W.length)
        {
            grad[k][] = 0;
            M[k].length = W[k].length;
            M[k][] = 0;
            S[k].length = W[k].length;
            S[k][] = 0;
        }
        beta1 = beta1_0;
        beta2 = beta2_0;
    }

    override void update(NeuralLayer layer, float[] ext_grad)
    {
        update_general!float(layer, ext_grad);
    }

    override void update(NeuralLayer layer, SparseF[] ext_grad)
    {
        update_general!SparseF(layer, ext_grad);
    }

    void update_general(V)(NeuralLayer layer, V[] ext_grad)
        if ((is(V == float) || is(V == SparseF)))
    {
        layer.accumulate_grad(ext_grad);
        cnt++;
        if(cnt % mini_batch_sz == 0)
        {
            foreach(p; priors)
                p.accumulate_grad();
            update_matrix();
            if(prox !is null)
                prox.proxy_step();
            beta1 *= beta1_0;
            beta2 *= beta2_0;
            foreach(k; 0..grad.length)
                grad[k][] = 0;
            cnt = 0;
        }
    }

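    // step size with the bias-corrected second-moment estimate; beta2 holds
    // beta2_0^t, with t the index of the current mini-batch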
    pragma(inline, true)
    override float current_lr(ulong k, ulong j)
    {
        return lr / (eps + sqrt(S[k][j] / (1.0 - beta2)));
    }

    version(LDC)
    {
        import ldc.attributes;
        pragma(inline, true)
        @fastmath static void adam_op(float[] row, float beta, float[] g) pure
        {
            for(int i = 0; i < row.length; ++i)
                row[i] = beta * row[i] + (1.0 - beta) * g[i];
        }

        pragma(inline, true)
        @fastmath static void adam_op2(float[] row, float beta, float[] g) pure
        {
            for(int i = 0; i < row.length; ++i)
                row[i] = beta * row[i] + (1.0 - beta) * g[i] * g[i];
        }

        pragma(inline, true)
        @fastmath static void adam_op3(
                float[] row_W, float[] row_S, float[] row_M,
                float beta1_, float beta2_, float eps_, float lr_) pure
        {
            float k1 = 1.0/(1.0 - beta1_);
            float k2 = 1.0/(1.0 - beta2_);
            for(int i = 0; i < row_W.length; ++i)
                row_W[i] -= lr_ * k1 * row_M[i] / (sqrt(k2 * row_S[i]) + eps_);
        }
    }

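    // ADAM update, per coordinate (beta1 == beta1_0^t, beta2 == beta2_0^t,
    // with t the index of the current mini-batch):
    //   m = beta1_0 * m + (1 - beta1_0) * g
    //   s = beta2_0 * s + (1 - beta2_0) * g^2
    //   w -= lr * (m / (1 - beta1)) / (sqrt(s / (1 - beta2)) + eps)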
    final void update_matrix()
    {
        foreach(k; 0..W.length)
        {
            auto row_grad = grad[k];
            auto row_W = W[k];
            auto row_M = M[k];
            auto row_S = S[k];

            version(LDC)
            {
                adam_op(row_M, beta1_0, row_grad);
                adam_op2(row_S, beta2_0, row_grad);
                adam_op3(row_W, row_S, row_M, beta1, beta2, eps, lr);
            }
            else
            {
                foreach(i; 0..row_W.length)
                {
                    auto g = row_grad[i];

                    row_M[i] = beta1_0 * row_M[i] + (1.0 - beta1_0) * g;
                    row_S[i] = beta2_0 * row_S[i] + (1.0 - beta2_0) * g * g;

                    auto gt = row_M[i] / (1.0 - beta1);
                    auto st = row_S[i] / (1.0 - beta2);

                    row_W[i] -= lr * gt / (sqrt(st) + eps);
                }
            }
        }
    }

    override ADAM dup()
    {
        return new ADAM(num_epochs, lr, mini_batch_sz, eps, beta1_0, beta2_0);
    }

    override string toString()
    {
        return text(
            super.toString,
            "[eps:", eps, " beta1:", beta1_0, ", beta2:", beta2_0, "]");
    }
}


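/**
 Pass-through optimizer used when the layers of a net carry their own SGD
 optimizers: it merely drives the epoch loop of `learn`, with `num_epochs` set
 to the maximum requested by the per-layer optimizers, and performs no
 parameter updates itself.
*/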
class ShadowSGDOptimizer : SGDOptimizer {

    NeuralNet _net;

    this(NeuralNet net)
    {
        _net = net;
        num_epochs = 0;
        foreach(l; net.layers)
        {
            if(!l.learnable)
                continue;
            if(auto o = cast(SGDOptimizer)(l.optimizer))
            {
                if(o.num_epochs > num_epochs)
                    num_epochs = o.num_epochs;
            }
        }
        super(num_epochs, 1, 0.0);
    }

    override float current_lr(ulong k, ulong j){return 0.0f;}
    override void register(NeuralLayer layer){}
    override void update(NeuralLayer layer, float[] ext_grad){}
    override void update(NeuralLayer layer, SparseF[] ext_grad){}

    override Optimizer dup()
    {
        return new ShadowSGDOptimizer(_net);
    }
}