/**
 * Implementation of standard regularizers for Linear layer.
 *
 * Copyright: 2017 Netflix, Inc.
 * License: $(LINK2 http://www.apache.org/licenses/LICENSE-2.0, Apache License Version 2.0)
 */
module vectorflow.regularizers;

private
{
import std.random : uniform;

import vectorflow.layers;
import vectorflow.math;
import vectorflow.neurallayer;
import vectorflow.optimizers;
import vectorflow.utils;
}


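// A prior attaches to a Linear layer through Linear.prior() and gets direct
// access to the layer weights W, gradients grad and SGD optimizer (see
// register() below). Two flavors are provided:
//   - AdditiveLinearPrior: accumulates an extra penalty term into the layer
//     gradient (accumulate_grad), e.g. L2Prior or RotationPrior;
//   - ProxyLinearPrior: applies a proximal operator directly to the weights
//     during SGD (proxy_step), e.g. L1Prior or PositivePrior.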
abstract class LinearPrior
{
    /// pointer to the layer weight matrix
    float[][] W;
    /// pointer to the layer gradient matrix
    float[][] grad;
    /// pointer to the optimizer for this layer
    SGDOptimizer opt;

    void register(Linear layer)
    {
        W = layer.W;
        grad = layer.grad;
        if(layer.optimizer is null)
            return;
        if(auto o = cast(SGDOptimizer)(layer.optimizer))
            opt = o;
        else
            throw new Exception("Only support SGDOptimizer type.");
    }

    LinearPrior dup();
}

abstract class ProxyLinearPrior : LinearPrior
{
    abstract void proxy_step();

    override ProxyLinearPrior dup();
}

abstract class AdditiveLinearPrior : LinearPrior
{
    abstract void accumulate_grad();

    override AdditiveLinearPrior dup();
}


/***
Adds L2 regularization to a Linear layer.
Example:
-------------
// basic L2 regularization: loss = (0.03 / 2) * || W ||^2
auto l1 = Linear(5).prior(L2Prior(0.03));

// same, but centered around a non-zero matrix: loss = (0.03 / 2) * || W - W_p ||^2
auto l2 = Linear(5).prior(L2Prior(0.03, W_p));

// L2 regularization with 1 lambda per feature (diagonal Hessian 0-centered prior).
// Example when input dim is 5:
auto l3 = Linear(10).prior(L2Prior([0.01f, 0.02f, 0.01f, 0.015f, 0.005f]));

// L2 regularization with 1 lambda per feature, centered around a non-zero matrix.
// Example when input dim is 5:
auto l4 = Linear(10).prior(L2Prior([0.01f, 0.02f, 0.01f, 0.015f, 0.005f], W_p));
-------------
*/
class L2Prior : AdditiveLinearPrior
{
    float _lambda;
    float[] _lambdas;
    float[][] W_prior;

    protected size_t _ind_start;

    void delegate() _accumulate_grad;

    this(float lambda)
    {
        _lambda = lambda;
        _ind_start = 0;
    }

    this(float lambda, float[][] W_prior_)
    {
        _lambda = lambda;
        W_prior = W_prior_;
        _ind_start = 0;
    }

    this(float[] lambdas)
    {
        _lambdas = lambdas;
        _ind_start = 0;
    }

    this(float[] lambdas, float[][] W_prior_)
    {
        _lambdas = lambdas;
        W_prior = W_prior_;
        _ind_start = 0;
    }

    mixin opCallNew;

    override void register(Linear layer)
    {
        super.register(layer);
        if(W.length > 0 && W_prior.length == 0)
        {
            // default to prior centered on 0
            W_prior = allocate_matrix_zero!float(W.length, W[0].length);
        }
        if(layer.with_intercept)
            _ind_start = 1; // don't regularize intercept

        if(_lambdas.length == 0)
            _accumulate_grad = &_acc_grad_scal;
        else
            _accumulate_grad = &_acc_grad_vec;
    }

    version(LDC)
    {
        private import ldc.attributes;

        pragma(inline, true)
        @fastmath static void l2op_scal(float a, float[] x, float[] x2,
                float[] y, size_t start) pure
        {
            for(auto i = start; i < x.length; ++i)
                y[i] += a * (x[i] - x2[i]);
        }

        pragma(inline, true)
        @fastmath static void l2op_vec(float[] a, float[] x, float[] x2,
                float[] y, size_t start) pure
        {
            for(auto i = start; i < x.length; ++i)
                y[i] += a[i] * (x[i] - x2[i]);
        }

        void _acc_grad_scal()
        {
            foreach(k; 0..W.length)
                l2op_scal(_lambda, W[k], W_prior[k], grad[k], _ind_start);
        }

        void _acc_grad_vec()
        {
            foreach(k; 0..W.length)
                l2op_vec(_lambdas, W[k], W_prior[k], grad[k], _ind_start);
        }
    }
    else
    {
        void _acc_grad_scal()
        {
            foreach(k; 0..W.length)
                for(auto i = _ind_start; i < W[k].length; ++i)
                    grad[k][i] += _lambda * (W[k][i] - W_prior[k][i]);
        }

        void _acc_grad_vec()
        {
            foreach(k; 0..W.length)
                for(auto i = _ind_start; i < W[k].length; ++i)
                    grad[k][i] += _lambdas[i] * (W[k][i] - W_prior[k][i]);
        }
    }

    override void accumulate_grad()
    {
        _accumulate_grad();
    }

    override AdditiveLinearPrior dup()
    {
        L2Prior cp;
        if(_lambdas.length == 0)
            cp = new L2Prior(_lambda, W_prior);
        else
            cp = new L2Prior(_lambdas, W_prior);
        cp._ind_start = _ind_start;
        return cp;
    }
}

/***
Adds L1 regularization to a Linear layer.

This is implemented as a proximal operator during SGD.
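For the default 0-centered prior, the proximal step reduces to the usual
soft-thresholding update w <- sign(w) * max(0, |w| - lambda * learning_rate),
applied coordinate-wise with the optimizer's current learning rate.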

Example:
-------------
// basic L1 regularization: loss = 0.03 * | W |
auto l1 = Linear(5).prior(L1Prior(0.03));

// same, but centered around a non-zero matrix: loss = 0.03 * | W - W_p |
auto l2 = Linear(5).prior(L1Prior(0.03, W_p));
-------------
*/
class L1Prior : ProxyLinearPrior
{
    float _lambda;
    float[][] W_prior;

    protected size_t _ind_start;

    this(float lambda)
    {
        _lambda = lambda;
        _ind_start = 0;
    }

    this(float lambda, float[][] W_prior_)
    {
        _lambda = lambda;
        W_prior = W_prior_;
        _ind_start = 0;
    }

    mixin opCallNew;

    override void register(Linear layer)
    {
        super.register(layer);
        if(W.length > 0 && W_prior.length == 0)
        {
            // default to prior centered on 0
            W_prior = allocate_matrix_zero!float(W.length, W[0].length);
        }
        if(layer.with_intercept)
            _ind_start = 1; // don't regularize intercept
    }

    override void proxy_step()
    {
        foreach(k; 0..W.length)
        {
            for(auto i = _ind_start; i < W[k].length; ++i)
            {
                if(W[k][i] > W_prior[k][i])
                    W[k][i] = fmax(0.0f, W[k][i] - W_prior[k][i] - _lambda * opt.current_lr(k,i));
                else
                    W[k][i] = -fmax(0.0f, -W[k][i] + W_prior[k][i] - _lambda * opt.current_lr(k,i));
            }
        }
    }

    override ProxyLinearPrior dup()
    {
        auto cp = new L1Prior(_lambda, W_prior);
        cp._ind_start = _ind_start;
        return cp;
    }
}


/***
Adds a positive constraint on the weights of a Linear layer.

If the Linear layer has intercepts, the constraint won't apply to them.

This is implemented as a proximal operator during SGD.
Example:
-------------
// force weights to be positive:
auto l1 = Linear(5).prior(PositivePrior());

// force weights to be above 1e-3:
auto l2 = Linear(10).prior(PositivePrior(1e-3));
-------------
*/
class PositivePrior : ProxyLinearPrior
{
    protected size_t _ind_start;
    protected float _eps;

    this()
    {
        _ind_start = 0;
        _eps = 0;
    }

    this(float eps)
    {
        _ind_start = 0;
        _eps = eps;
    }

    mixin opCallNew;

    override void register(Linear layer)
    {
        super.register(layer);
        if(layer.with_intercept)
            _ind_start = 1; // don't regularize intercept
    }

    override void proxy_step()
    {
        foreach(k; 0..W.length)
            for(auto i = _ind_start; i < W[k].length; ++i)
                W[k][i] = fmax(_eps, W[k][i]);
    }

    override ProxyLinearPrior dup()
    {
        auto cp = new PositivePrior();
        cp._eps = _eps;
        cp._ind_start = _ind_start;
        return cp;
    }
}


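/***
Adds a stochastic orthonormality penalty to the weight matrix of a Linear
layer: at each gradient accumulation, num_draws pairs of rows of W are drawn
at random; distinct rows are pushed towards being orthogonal and each row
towards unit norm, with strength lambda.

Example (usage mirrors the other priors above; lambda chosen arbitrarily):
-------------
// encourage the rows of W to form an approximately orthonormal family:
auto l1 = Linear(10).prior(RotationPrior(0.01));
-------------
*/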
class RotationPrior : AdditiveLinearPrior
{
    float _lambda;
    ulong _num_draws;

    this(float lambda, ulong num_draws = 1)
    {
        _lambda = lambda;
        _num_draws = num_draws;
    }

    mixin opCallNew;

    override void accumulate_grad()
    {
        foreach(_; 0.._num_draws)
        {
            size_t i = uniform(0, W.length, RAND_GEN);
            size_t j = uniform(0, W.length, RAND_GEN);

            auto ri = W[i];
            auto rj = W[j];
            float g = dotProd(ri, rj);
            if(i != j)
            {
                foreach(u; 0..W[i].length)
                {
                    grad[i][u] += _lambda * g * rj[u];
                    grad[j][u] += _lambda * g * ri[u];
                }
            }
            else
            {
                g -= 1.0;
                foreach(u; 0..W[i].length)
                    grad[i][u] += 2 * _lambda * g * ri[u];
            }
        }
    }

    override AdditiveLinearPrior dup()
    {
        return new RotationPrior(_lambda, _num_draws);
    }
}