/**
 Implementation of different stochastic optimizers.

 The default parallelization strategy over the cores is
 $(LINK2 https://people.eecs.berkeley.edu/~brecht/papers/hogwildTR.pdf, Hogwild!).
 This is a lock-free strategy in which race conditions do occur, so training a
 network is non-deterministic as soon as more than one core is involved.
 Hogwild! works as long as the data access pattern is sparse enough: if the
 model has only a few dense parameters to learn and too many cores are used,
 the updates collide too often and the optimization can fail.

 * Copyright: 2017 Netflix, Inc.
 * License: $(LINK2 http://www.apache.org/licenses/LICENSE-2.0, Apache License Version 2.0)
 */
module vectorflow.optimizers;


private
{
    import core.thread : Thread;
    import core.time : MonoTime;
    import std.algorithm : map, max, reduce;
    import std.conv : text, to;
    import std.format;
    import std.meta : Filter, staticSort;
    import std.parallelism : TaskPool;
    import std.random;
    import std.range : evenChunks;
    import std.range.interfaces : ForwardRange;
    import std.range.primitives : ElementType, isForwardRange;
    import std.stdio;
    import std.string : lastIndexOf;
    import std.variant;
    import std.traits : isNumeric;

    import vectorflow.monitor;
    import vectorflow.neurallayer;
    import vectorflow.neuralnet;
    import vectorflow.layers;
    import vectorflow.regularizers;
    import vectorflow.utils;
    import vectorflow.math : fabs, dotProd, sqrt;
}


interface Optimizer {

    void register(NeuralLayer layer);

    void update(NeuralLayer layer, float[] ext_grad);
    void update(NeuralLayer layer, SparseF[] ext_grad);

    Optimizer dup();
}


class SGDOptimizer : Optimizer {

    ulong num_epochs;
    ulong mini_batch_sz;
    float lr;
    ulong cnt;

    AdditiveLinearPrior[] priors;
    ProxyLinearPrior prox;

    this(ulong epochs, ulong mini_batch_sz_, float lr_)
    {
        num_epochs = epochs;
        mini_batch_sz = mini_batch_sz_;
        lr = lr_;
    }

    void learn(D, T, V, R, S)(NeuralNet nn, D data,
            S delegate(R net_out, ref T ex, ref V[] grad) grad_f,
            bool verbose, uint num_cores)
        if(isForwardRange!D
            && (is(V == float) || is(V == SparseF))
            && (is(R == float[]) || is(R == NeuralNet))
            && (isNumeric!S || is(S == void)))
    {
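        // Template parameters, deduced from the arguments:
        //   D: forward range of observations.
        //   T: observation type; its feature fields (those selected by
        //      isFeaturesField) are counted for progress reporting.
        //   V: gradient element type, dense (float) or sparse (SparseF).
        //   R: what grad_f receives, either the raw output activations
        //      (float[]) or the whole NeuralNet for callbacks that need
        //      more than the output.
        //   S: return type of grad_f; when numeric, the returned values are
        //      accumulated and reported by the monitor as the training loss.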
        enum Comp(string F1, string F2) = F1 < F2;
        alias feats_fields = staticSort!(
            Comp, Filter!(isFeaturesField, __traits(allMembers, T)));

        auto start_time = MonoTime.currTime;

        auto monitor = new SGDMonitor(verbose, num_epochs, num_cores,
                start_time, isNumeric!S);

        void _learn(U)(NeuralNet net, U d, ulong n_passes, ulong core_id)
        {
            foreach(l; net.layers)
                l.pre_learning();

            // seed rng per thread with a different seed
            RAND_GEN.seed(42 + cast(uint)Thread.getThis.id);
            static if(is(V == float))
            {
                auto grads = new V[nn.output.length];
                grads[] = 0;
            }
            static if(is(V == SparseF))
            {
                V[] grads;
                grads.length = nn.output.length;
            }

            ulong sum_num_features = 0;
            ulong tmp_cnt = 0;
            double sum_loss = 0.0;
            foreach(p; 0..num_epochs)
            {
                cnt = 0;
                foreach(v; d)
                {
                    auto preds = net.predict(v);
                    foreach(root_id, field; feats_fields)
                        sum_num_features += mixin("v." ~ field ~ ".length");
                    tmp_cnt++;

                    static if(is(R == float[]))
                    {
                        static if(isNumeric!S)
                            sum_loss += grad_f(preds, v, grads);
                        else
                            grad_f(preds, v, grads);
                    }
                    else
                    {
                        static if(isNumeric!S)
                            sum_loss += grad_f(net, v, grads);
                        else
                            grad_f(net, v, grads);
                    }

                    cnt++;
                    net.backward_prop(grads);

                    if(cnt % 100_000 == 0)
                    {
                        synchronized(this)
                            monitor.progress_callback(
                                core_id, p, tmp_cnt, sum_num_features, sum_loss);
                        sum_num_features = 0;
                        tmp_cnt = 0;
                        static if(isNumeric!S)
                            sum_loss = 0;
                    }
                }
                // flush last mini batch
                if(cnt % mini_batch_sz != 0)
                {
                    cnt = mini_batch_sz;
                    net.backward_prop(grads);
                }
                synchronized(this)
                    monitor.progress_callback(
                        core_id, p+1, tmp_cnt, sum_num_features, sum_loss);
                sum_num_features = 0;
                tmp_cnt = 0;
                static if(isNumeric!S)
                    sum_loss = 0;
            }
            foreach(l; nn.layers)
                l.post_learning();
        }

        if(num_cores == 1)
            _learn(nn, data, num_epochs, 0);
        else
        {
            auto chunks = data.evenChunks(num_cores);
            NeuralNet[] nets;
            foreach(i; 0..num_cores)
            {
                auto cp = nn.dup(true);
                foreach(l; cp.layers)
                {
                    l.allocate_interface();
                    l.allocate_grad_params();
                }
                cp.share_params(nn);
                foreach(l; cp.layers)
                    if(l.optimizer)
                        l.optimizer.register(l);
                nets ~= cp;
            }

            auto pool = new TaskPool(num_cores);
            foreach(i, chunk; pool.parallel(chunks))
            {
                auto net = nets[i];
                _learn(net, chunk, num_epochs, i);
            }
            pool.stop();
        }
        monitor.wrap_up();
    }

    abstract float current_lr(ulong k, ulong j);

    abstract void register(NeuralLayer layer);

    abstract void update(NeuralLayer layer, float[] ext_grad);
    abstract void update(NeuralLayer layer, SparseF[] ext_grad);
    abstract Optimizer dup();

    override string toString()
    {
        auto fullname = this.classinfo.name;
        auto classname = fullname[fullname.lastIndexOf('.')+1..$];
        return "opt." ~ classname ~
            "[epochs:" ~ num_epochs.to!string ~
            ", mini_batch_sz:" ~ mini_batch_sz.to!string ~
            ", lr:" ~ lr.to!string ~
            "]";
    }
}

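/*
 Sketch of a gradient callback for SGDOptimizer.learn above, for a squared
 loss on a single output. The observation type is illustrative only: any
 struct whose feature fields are recognized by isFeaturesField works, and the
 exact field layout shown here is an assumption, not something defined in this
 module.

     struct Obs
     {
         float label;
         SparseF[] features;
     }

     // Fills `grads` with dLoss/dOutput; since the return type is numeric,
     // the returned value is accumulated and shown as the training loss.
     auto square_grad =
         delegate float(float[] net_out, ref Obs o, ref float[] grads)
         {
             auto err = net_out[0] - o.label;
             grads[0] = err;
             return 0.5f * err * err;
         };

 With num_cores > 1, learn() splits the data range with evenChunks and runs
 one Hogwild! worker per core on a shallow copy of the net that shares its
 parameters with the original (see the module header).
*/
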
/**
 AdaGrad stochastic optimizer.

 See $(LINK2 http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf,
 Adaptive Subgradient Methods for Online Learning and Stochastic Optimization)
Examples:
-----------------

auto nn = NeuralNet()
    .stack(SparseData(1000))
    .stack(Linear(1));

// AdaGrad learning with 5 epochs, learning rate 0.1, mini-batch size of 100:
nn.learn(data, "square", AdaGrad(5, 0.1, 100));
-----------------
*
*/
class AdaGrad : SGDOptimizer {

    // references
    float[][] W;
    float[][] grad;

    // local variables
    float eps;
    float[][] acc_grad;

    void delegate(NeuralLayer, float[]) _update;

    this(ulong num_epochs, float lr, ulong mini_batch_sz = 1, float eps_ = 1e-8)
    {
        super(num_epochs, mini_batch_sz, lr);
        eps = eps_;
        _update = &update_general!float;
    }
    mixin opCallNew;

    override void register(NeuralLayer layer)
    {
        if(auto l = cast(Linear)layer)
        {
            params_to_optimize(l.W, l.grad);
            priors = l.priors;
            foreach(p; priors)
                p.register(l);
            prox = l.prox;
            if(prox !is null)
                prox.register(l);
        }
        else if(layer.learnable)
            throw new Exception(
                "AdaGrad solver cannot optimize this layer type: " ~
                layer.to!string);
    }

    void params_to_optimize(float[][] W_, float[][] grad_)
    {
        W = W_;
        grad = grad_;
        acc_grad.length = W.length;
        foreach(k; 0..acc_grad.length)
        {
            acc_grad[k].length = W[k].length;
            acc_grad[k][] = 0;
            grad[k][] = 0;
        }
    }

    override void update(NeuralLayer layer, float[] ext_grad)
    {
        _update(layer, ext_grad);
    }

    override void update(NeuralLayer layer, SparseF[] ext_grad)
    {
        update_general!SparseF(layer, ext_grad);
    }

    void update_general(V)(NeuralLayer layer, V[] ext_grad)
        if ((is(V == float) || is(V == SparseF)))
    {
        layer.accumulate_grad(ext_grad);
        cnt++;
        if(cnt % mini_batch_sz == 0)
        {
            foreach(p; priors)
                p.accumulate_grad();
            update_matrix(W, grad, acc_grad, lr, eps);
            if(prox !is null)
                prox.proxy_step();
            foreach(k; 0..grad.length)
                grad[k][] = 0;
            cnt = 0;
        }
    }

    pragma(inline, true)
    override float current_lr(ulong k, ulong j)
    {
        return lr / sqrt(eps + acc_grad[k][j]);
    }

    version(LDC)
    {
        private import ldc.attributes;

        pragma(inline, true)
        @fastmath static void ada_op1(float[] row, float[] g) pure
        {
            for(int i = 0; i < row.length; ++i)
                row[i] += g[i] * g[i];
        }

        pragma(inline, true)
        @fastmath static void ada_op2(float[] row, float[] g, float[] sg,
                float lr_, float eps_) pure
        {
            for(int i = 0; i < row.length; ++i)
            {
                // reduces cache invalidation across cores
                // since we're writing back to the shared weights
                if(fabs(g[i]) > 0) // LLVM is smart and will vec this to ucomiss
                    row[i] -= lr_ * g[i] / sqrt(eps_ + sg[i]);
            }
        }
    }

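    // update_matrix() below applies one AdaGrad step per mini-batch: every
    // coordinate accumulates its squared gradient and moves with a
    // per-coordinate decaying step size,
    //
    //   G <- G + g * g
    //   W <- W - lr * g / sqrt(eps + G)
    //
    // which is also what current_lr() reports as the effective learning rate.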
    pragma(inline, true)
    final static void update_matrix(
            float[][] w, float[][] g, float[][] sum_g_sq,
            float lr_, float eps_) pure
    {
        foreach(k; 0..w.length)
        {
            auto row_W = w[k];
            auto row_g = g[k];
            auto row_sum_g_sq = sum_g_sq[k];

            version(LDC)
            {
                ada_op1(row_sum_g_sq, row_g);
                ada_op2(row_W, row_g, row_sum_g_sq, lr_, eps_);
            }
            else
            {
                foreach(i; 0..row_W.length)
                {
                    row_sum_g_sq[i] += row_g[i] * row_g[i];
                    row_W[i] -= lr_ * row_g[i] / sqrt(eps_ + row_sum_g_sq[i]);
                }
            }
        }
    }

    override AdaGrad dup()
    {
        return new AdaGrad(num_epochs, lr, mini_batch_sz, eps);
    }

    override string toString()
    {
        return super.toString ~ "[eps:" ~ eps.to!string ~ "]";
    }
}


/**
 ADAM stochastic optimizer.

 See $(LINK2 https://arxiv.org/pdf/1412.6980.pdf, ADAM: A method for stochastic optimization)
Examples:
-----------------

auto nn = NeuralNet()
    .stack(DenseData(200))
    .stack(Linear(10));

// ADAM learning with 5 epochs, learning rate 0.1, mini-batch size of 100:
nn.learn(data, "multinomial", ADAM(5, 0.1, 100));
-----------------
*
*/
class ADAM : SGDOptimizer {

    // references
    float[][] W;
    float[][] grad;

    // local variables
    float eps;

    float beta1_0;
    float beta2_0;
    float beta1;
    float beta2;

    float[][] M;
    float[][] S;

    this(ulong num_epochs, float lr, ulong mini_batch_sz = 1,
         float eps_ = 1e-8, float beta1_0_ = 0.9, float beta2_0_ = 0.999)
    {
        super(num_epochs, mini_batch_sz, lr);
        // See original ADAM paper for definition of these parameters:
        eps = eps_;
        beta1_0 = beta1_0_;
        beta2_0 = beta2_0_;
    }
    mixin opCallNew;

    override void register(NeuralLayer layer)
    {
        if(auto l = cast(Linear)layer)
        {
            params_to_optimize(l.W, l.grad);
            priors = l.priors;
            foreach(p; priors)
                p.register(l);
            prox = l.prox;
            if(prox !is null)
                prox.register(l);
        }
    }

    void params_to_optimize(float[][] W_, float[][] grad_)
    {
        W = W_;
        grad = grad_;
        M.length = W.length;
        S.length = W.length;
        foreach(k; 0..W.length)
        {
            grad[k][] = 0;
            M[k].length = W[k].length;
            M[k][] = 0;
            S[k].length = W[k].length;
            S[k][] = 0;
        }
        beta1 = beta1_0;
        beta2 = beta2_0;
    }

    override void update(NeuralLayer layer, float[] ext_grad)
    {
        update_general!float(layer, ext_grad);
    }

    override void update(NeuralLayer layer, SparseF[] ext_grad)
    {
        update_general!SparseF(layer, ext_grad);
    }

    void update_general(V)(NeuralLayer layer, V[] ext_grad)
        if ((is(V == float) || is(V == SparseF)))
    {
        layer.accumulate_grad(ext_grad);
        cnt++;
        if(cnt % mini_batch_sz == 0)
        {
            foreach(p; priors)
                p.accumulate_grad();
            update_matrix();
            if(prox !is null)
                prox.proxy_step();
            beta1 *= beta1_0;
            beta2 *= beta2_0;
            foreach(k; 0..grad.length)
                grad[k][] = 0;
            cnt = 0;
        }
    }

    pragma(inline, true)
    override float current_lr(ulong k, ulong j)
    {
        return lr / (eps + sqrt(S[k][j] / (1.0 - beta2)));
    }

    version(LDC)
    {
        import ldc.attributes;

        pragma(inline, true)
        @fastmath static void adam_op(float[] row, float beta, float[] g) pure
        {
            for(int i = 0; i < row.length; ++i)
                row[i] = beta * row[i] + (1.0 - beta) * g[i];
        }

        pragma(inline, true)
        @fastmath static void adam_op2(float[] row, float beta, float[] g) pure
        {
            for(int i = 0; i < row.length; ++i)
                row[i] = beta * row[i] + (1.0 - beta) * g[i] * g[i];
        }

        pragma(inline, true)
        @fastmath static void adam_op3(
                float[] row_W, float[] row_S, float[] row_M,
                float beta1_, float beta2_, float eps_, float lr_) pure
        {
            float k1 = 1.0/(1.0 - beta1_);
            float k2 = 1.0/(1.0 - beta2_);
            for(int i = 0; i < row_W.length; ++i)
                row_W[i] -= lr_ * k1 * row_M[i] / (sqrt(k2 * row_S[i]) + eps_);
        }
    }

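    // update_matrix() below applies one ADAM step per mini-batch. With
    // beta1_0 / beta2_0 the decay rates and beta1 / beta2 holding their
    // t-th powers at step t (see update_general), every coordinate follows
    //
    //   m <- beta1_0 * m + (1 - beta1_0) * g
    //   s <- beta2_0 * s + (1 - beta2_0) * g * g
    //   W <- W - lr * (m / (1 - beta1)) / (sqrt(s / (1 - beta2)) + eps)
    //
    // i.e. the bias-corrected first and second moment estimates of the paper.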
    final void update_matrix()
    {
        foreach(k; 0..W.length)
        {
            auto row_grad = grad[k];
            auto row_W = W[k];
            auto row_M = M[k];
            auto row_S = S[k];

            version(LDC)
            {
                adam_op(row_M, beta1_0, row_grad);
                adam_op2(row_S, beta2_0, row_grad);
                adam_op3(row_W, row_S, row_M, beta1, beta2, eps, lr);
            }
            else
            {
                foreach(i; 0..row_W.length)
                {
                    auto g = row_grad[i];

                    row_M[i] = beta1_0 * row_M[i] + (1.0 - beta1_0) * g;
                    row_S[i] = beta2_0 * row_S[i] + (1.0 - beta2_0) * g * g;

                    auto gt = row_M[i] / (1.0 - beta1);
                    auto st = row_S[i] / (1.0 - beta2);

                    row_W[i] -= lr * gt / (sqrt(st) + eps);
                }
            }
        }
    }

    override ADAM dup()
    {
        return new ADAM(num_epochs, lr, mini_batch_sz, eps, beta1_0, beta2_0);
    }

    override string toString()
    {
        return text(
            super.toString,
            "[eps:", eps, ", beta1:", beta1_0, ", beta2:", beta2_0, "]");
    }
}


class ShadowSGDOptimizer : SGDOptimizer {

    NeuralNet _net;

    this(NeuralNet net)
    {
        _net = net;
        num_epochs = 0;
        foreach(l; net.layers)
        {
            if(!l.learnable)
                continue;
            if(auto o = cast(SGDOptimizer)(l.optimizer))
            {
                if(o.num_epochs > num_epochs)
                    num_epochs = o.num_epochs;
            }
        }
        super(num_epochs, 1, 0.0);
    }

    override float current_lr(ulong k, ulong j){return 0.0f;}
    override void register(NeuralLayer layer){}
    override void update(NeuralLayer layer, float[] ext_grad){}
    override void update(NeuralLayer layer, SparseF[] ext_grad){}

    override Optimizer dup()
    {
        return new ShadowSGDOptimizer(_net);
    }
}
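// Note on ShadowSGDOptimizer above: it never updates parameters itself
// (register and update are no-ops and current_lr is always 0). It only reuses
// the inherited learn() driver so that a net whose learnable layers each carry
// their own SGDOptimizer can be trained for max(num_epochs) passes, with the
// per-layer optimizers expected to perform the actual updates during
// back-propagation.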