1 /**
2  * NeuralNet is the main abstraction of vectorflow.
3  *
4  * Copyright: 2017 Netflix, Inc.
5  * License: $(LINK2 http://www.apache.org/licenses/LICENSE-2.0, Apache License Version 2.0)
6  */
7 module vectorflow.neuralnet;
8 
9 private
10 {
11 import std.algorithm : canFind, countUntil, map, startsWith, sum;
12 import std.array : split;
13 import std.conv : text, to;
14 import std.file : exists, FileException, remove;
15 import std.format : format;
16 import std.meta : anySatisfy, Filter, staticSort;
17 import std.stdio : File, writeln;
18 
19 import std.range.primitives : isForwardRange, isInputRange, ElementType;
20 import std.traits : isAggregateType, isNumeric;
21 import std.variant;
22 
23 import vectorflow.layers;
24 import vectorflow.neurallayer;
25 import vectorflow.serde;
26 import vectorflow.optimizers;
27 import vectorflow.losses;
28 import vectorflow.utils : ct_msg, opCallNew;
29 }
30 
31 
32 /***
33  * Neural-network abstraction.
34 Example:
35 -----------------
36 auto nn = NeuralNet()
37     .stack(DenseData(400))
38     .stack(Linear(10));
// nn is a network working on 400-dimensional dense vectors and predicting
// a 10-dimensional vector
41 -----------------
42 */
class NeuralNet {

    /// array of all the roots (input layers), in the order they were added
    InputLayer[] roots;
    /// all nodes of the computational graph, in the order they were added
    NeuralLayer[] layers;
    /// map: name --> layer
    NeuralLayer[string] layers_map;
    /// edges of the graph: src -> [dst1, ..., dstk]
    string[][string] edges;
    /// array of all the leaves (layers which are not the parent of any edge)
    NeuralLayer[] leaves;
    /// reference to the leaf of the net (the first one if there are several)
    @property NeuralLayer out_layer(){return leaves[0];}

    /// set to true the first time `initialize` is called on this net
    private bool _ever_initialized;

    /// Create an empty, un-initialized net.
    this()
    {
        _ever_initialized = false;
    }
    mixin opCallNew;

    /**
    * Name and add a root to the net.
    *
    * Params:
    *   name_ = name to give to the layer.
    *   layer = input layer to add as root to the net.
    *
    * Returns: current neural network with the newly added layer.
    *
    * Throws: Exception if the name is empty or contains a comma.
    */
    NeuralNet add_root(string name_, InputLayer layer)
    {
        check_name(name_);
        layer.name = name_;
        return add_root(layer);
    }

    /**
    * Add a root to the net.
    *
    * Params:
    *   root_ = input layer to add as root to the net.
    *
    * Returns: current neural network with the newly added layer.
    */
    NeuralNet add_root(InputLayer root_)
    {
        // register as root first so that `add` takes its regular branch
        roots ~= root_;
        add(root_);
        return this;
    }

    /**
    * Name and add a layer to the net, without wiring it.
    *
    * Params:
    *   name_ = name to give to the layer.
    *   layer = which layer to add to the net.
    *   opt = optional optimizer to use for this layer.
    *
    * Returns: current neural network with the newly added layer.
    *
    * Throws: Exception if the name is empty or contains a comma.
    */
    NeuralNet add(string name_, NeuralLayer layer, Optimizer opt = null)
    {
        check_name(name_);
        layer.name = name_;
        return add(layer, opt);
    }

    /**
    * Add a layer to the net, without wiring it.
    *
    * If the net has no root yet, the layer must be an InputLayer and is
    * added as a root. Layers without a name get an auto-generated one.
    *
    * Params:
    *   layer = which layer to add to the net.
    *   opt = optional optimizer to use for this layer.
    *
    * Returns: current neural network with the newly added layer.
    *
    * Throws: Exception if the first layer added is not an InputLayer, if
    *         an optimizer is given for a root, or if a layer with the same
    *         name is already part of the net.
    */
    NeuralNet add(NeuralLayer layer, Optimizer opt = null)
    {
        if(roots.length == 0)
        {
            if((cast(InputLayer)layer) is null)
                throw new Exception(
                    "First layer added has to be an InputLayer.");
            if(opt !is null)
                throw new Exception(
                    "A root is not learnable, it cannot have an optimizer.");
            // add_root calls back into add(), which then takes the
            // `else` branch below since `roots` is no longer empty
            add_root(layer.to!InputLayer);
        }
        else
        {
            if(layer.name == "")
                layer.name = generate_name();
            if(layer.name in layers_map)
                throw new Exception("A layer with the name `" ~ 
                        layer.name ~ "` already exist.");
            layers_map[layer.name] = layer;
            layers ~= layer;
            // a freshly added layer has no child yet: it is a leaf
            leaves ~= layer;
        }
        if(opt !is null)
            layer.set_optimizer(opt);

        return this;
    }

    /**
    * Stack a layer on top of the former leaf of the net.
    *
    * Params:
    *   layer = which layer to add to the net. It will be wired to the
    *           previous leaf.
    *   opt = optional optimizer to use for this layer.
    *
    * Returns: current neural network with the newly added layer.
    *
    * Throws: Exception if the net currently has more than one leaf.
    */
    NeuralNet stack(NeuralLayer layer, Optimizer opt = null)
    {
        if(leaves.length > 1)
            throw new Exception("Your current net is not a stack.");
        add(layer, opt);
        if(layers.length >= 2)
        {
            // wire to previous
            auto previous = layers[$-2];
            wire(previous, layer);
        }
        return this;
    }

    /**
    * Stack a layer on top of the former leaf of the net.
    *
    * Params:
    *   name_ = name to give to the layer
    *   layer = which layer to add to the net. It will be wired to the
    *           previous leaf.
    *   opt = optional optimizer to use for this layer.
    *
    * Returns: current neural network with the newly added layer.
    *
    * Throws: Exception if the name is empty or contains a comma, or if the
    *         net currently has more than one leaf.
    */
    NeuralNet stack(string name_, NeuralLayer layer, Optimizer opt = null)
    {
        check_name(name_);
        layer.name = name_;
        return stack(layer, opt);
    }

    /**
    * Compute the prediction of the net for $(PARAM v).
    * Runs forward-propagation and outputs the predicted vector.
    *
    * Params:
    *    v = observation with one or multiple `features*` attributes
    *        which have the types expected by the roots in proper order
    *        (i.e: float[], SparseF[], SparseFG[], custom roots types...)
    *
    * Returns: array of last layer neurons values 
    *
    * Example:
    * ---
    * struct O {
    *   float[] features_foo;
    * }
    * net.predict(O([1.2f, 0.7f]));
    * ---
    */
    float[] predict(T)(T v) if(isAggregateType!T && isLearnableRow!T)
    {
        // `features*` members sorted lexicographically: the i-th one is
        // fed to the i-th root
        enum Comp(string F1, string F2) = F1 < F2;
        alias feats_fields = staticSort!(
            Comp, Filter!(isFeaturesField, __traits(allMembers, T)));
        assert(feats_fields.length == roots.length,
            "Number of `features*` fields should match number of roots.");
        reset();
        foreach(root_id, field; feats_fields)
            roots[root_id].forward_prop(mixin("v." ~ field));
        return output;
    }

    /**
    * Compute the prediction of the net when passing the arguments to the
    * root(s) of the net.
    *
    * Params: the data to feed to the roots in proper order
    *
    * Returns: array of last layer neurons values
    *
    * Examples:
    * ---
    * // net with a single DenseData(2) root:
    * net.predict([3.2f, -1.5f]);
    * // net with a single SparseData(dim >= 34) root:
    * net.predict([SparseF(34, -0.7f), SparseF(3, 0.2f)]);
    * // net with one DenseData(1), one SparseData(dim >= 16) root:
    * net.predict([0.2f], [SparseF(16, -0.15f)]);
    * ---
    */
    float[] predict(T...)(T args)
    {
        assert(args.length == roots.length,
            "The number of arguments should match the number of roots.");
        reset();
        foreach(i, v; args)
            roots[i].forward_prop(v);
        return output;
    }

    /**
    * Create a directed edge between `parent` and `child` nodes.
    *
    * Params:
    *    parent = name of origin layer
    *    child = name of destination layer
    *    with_alloc = whether or not both layers should allocate internal
    *                 parameters
    *
    * Throws: Exception if one of the layers is not part of the net or if
    *         the edge already exists.
    */
    void wire(string parent, string child, bool with_alloc = true)
    {
        check_layer_here(parent);
        check_layer_here(child);

        auto p = layers_map[parent];
        auto c = layers_map[child];
        wire(p, c, with_alloc);
    }

    /**
    * Create a directed edge between `parent` and `child` nodes.
    *
    * Params:
    *    parent = origin layer
    *    child = destination layer
    *    with_alloc = whether or not both layers should allocate internal
    *                 parameters
    *
    * Throws: Exception if one of the layers is not part of the net or if
    *         the edge already exists.
    */    
    void wire(NeuralLayer parent, NeuralLayer child, bool with_alloc = true)
    {
        check_layer_here(parent.name);
        check_layer_here(child.name);
        if(parent.name in edges && edges[parent.name].canFind(child.name))
            throw new Exception(
                    "The edge `" ~
                    parent.name ~ "` -> `" ~ child.name ~
                    "` has already been added to the graph.");
        parent.children ~= child;
        child.parents ~= parent;
        foreach(l; layers)
            l.recompute_topology();
        if(with_alloc)
        {
            parent.allocate_interface();
            parent.allocate_params();
            parent.allocate_grad_params();
            child.allocate_interface();
            child.allocate_params();
            child.allocate_grad_params();
        }
        edges[parent.name] ~= child.name;

        // remove parent from the leaves array if it was already there.
        // A single slice-and-concatenate handles every position (only
        // element, first, last or middle), so there is no special case that
        // could slice an already emptied array.
        auto ind_leaf = leaves.countUntil!(l => l.name == parent.name);
        if(ind_leaf != -1)
            leaves = leaves[0..ind_leaf] ~ leaves[ind_leaf+1..$];
        
        optimize_graph(this);
    }

    /**
    * Make sure that a layer with the given name is part of the net.
    *
    * Params:
    *    name = name of the layer to look up
    *
    * Throws: Exception if no layer with this name was added to the net.
    */
    protected void check_layer_here(string name)
    {
        if(name in layers_map)
            return;
        throw new Exception(text(
           "Layer `", name,
           "` is unknown. Add it to the net first if you ",
           "want to wire it.\nCurrent net: ", this));
    }

    /**
    * Initialize at random all the parameters of the net.
    *
    * Params:
    *    rand_scale = parameters values drawn in ]-rand_scale, rand_scale[
    */ 
    void initialize(double rand_scale)
    {
        _ever_initialized = true;
        foreach(l; layers)
            l.init(rand_scale);
    }

    /**
    * Return a reference to the dense output vector of the leaf of the net.
    */
    @property float[] output(){ return out_layer.out_d; }

    /**
    * Run backward-propagation from the leaf of the net.
    *
    * Params:
    *    output_grad = gradient with respect to the output of the net,
    *                  either dense (float[]) or sparse (SparseF[])
    */
    void backward_prop(V)(V[] output_grad)
        if ((is(V == float) || is(V == SparseF)))
    {
        out_layer.backward_prop(output_grad); // backpropagation
    }

    /**
    * Return the total number of learnable parameters in the net.
    */
    @property ulong num_params()
    {
        return layers.map!(l => l.num_params).sum;
    }

    /**
    * Reset any internal state variables of the net.
    */
    void reset()
    {
        foreach(l; layers)
            l.reset();
    }

    /**
    * Remove any optimizer defined on layers of the net.
    */
    void clear_opt()
    {
        foreach(l; layers)
            l.set_optimizer(null);
    }

    /**
    * Discard local weights and use those of the target net instead.
    * However, the net keeps its own internal state.
    * Useful for hogwild SGD implementation.
    *
    * Layers are matched by their position in `layers`, so both nets are
    * expected to hold their layers in the same order.
    *
    * Params:
    *    net = NeuralNet whose parameters should be used.
    */
    void share_params(NeuralNet net)
    {
        foreach(i, ref l; layers)
            if((cast(InputLayer)l) is null) // roots are skipped
                l.share_params(net.layers[i]);
    }

    /**
    * Train neural network on some data, using specified gradient callback and
    * optimizer.
    *
    * Params:
    *    data = forward range of rows
    *    grad_f = gradient callback (see losses.d for details)
    *    opt = optimizer to use on all learnable layers for training
    *    verbose = whether or not to show progress during training
    *    num_cores = degree of Hogwild parallelism
    */
    void learn(D, T, V, R, S, O : Optimizer)(
            D data,
            S delegate(R net_out, ref T ex, ref V[] grad) grad_f,
            O opt, bool verbose = false, uint num_cores = 1)
        if(isForwardRange!D && is(ElementType!D == T) // dataset constraints
                && (is(V == float) || is(V == SparseF))
                && (is(R == float[]) || is(R == NeuralNet))
                && (isNumeric!S || is(S == void)))
    {
        static if(!isAggregateType!T || !isLearnableRow!T)
        {
            static assert(0, text(
                "Your rows are invalid. Rows should be of an aggregate type (",
                "struct, class, union or interface) and have at least one ",
                "attribute or property whose name starts with `features`: ",
                "that's the data that will be forward-propagated into the ",
                "computational graph. If your graph has multiple roots, the ",
                "lexicographic order of the attributes starting with ",
                "`features` will be used to map them to the roots of ",
                "the graph, in the original order these roots were added to ",
                "the graph."));
        }

        if(!_ever_initialized)
        {
            writeln("Net not initialized. Initializing all weights to 0.");
            initialize(0.0);
        }
        {
            foreach(l; layers)
            {
                if(!l.learnable)
                    continue;
                if(!l.optimizer_set)
                {
                    // no layer-specific optimizer: use a private copy of
                    // the one given for the whole net
                    auto opt_cp = opt.dup;
                    l.set_optimizer(opt_cp);
                    opt_cp.register(l);
                }
                else
                {
                    l.set_optimizer(l.optimizer);
                    l.optimizer.register(l);
                }
            }
            // this is just to drive the learning, but each node
            // has its own copy and optimization variables in a SGD setting
        }

        auto cores_str = (
                num_cores == 1 ? "1 core." : "%d cores.".format(num_cores));
        writeln("Training net with ", num_params, " parameters on ", cores_str);
        foreach(l; layers)
            l.pre_learning();
        opt.learn(this, data, grad_f, verbose, num_cores);
        foreach(l; layers)
            l.post_learning();
    }

    /**
    * Train neural network on a dataset, using a predefined loss.
    *
    * Params:
    *    data = forward range of rows
    *    loss = one of the predefined loss functions
    *    opt = optimizer to use on all learnable layers for training
    *    verbose = whether or not to show progress during training
    *    num_cores = degree of Hogwild parallelism
    *    monitor_loss = whether or not loss value should be tracked during
    *    training for monitoring (slightly slower)
    */
    void learn(D, O : Optimizer)(D data, string loss,
            O opt, bool verbose = false, uint num_cores = 1,
            bool monitor_loss = true)
        if(isForwardRange!D)
    {
        // `monitor_loss` is a compile-time argument of get_grad, hence the
        // two explicit branches
        if(monitor_loss)
        {
            learn(data, get_grad!(ElementType!D, true)(loss),
                    opt, verbose, num_cores);
        }
        else
        {
            learn(data, get_grad!(ElementType!D, false)(loss),
                    opt, verbose, num_cores);
        }
    }

    /**
    * Train neural network on some data, using a gradient callback.
    *
    * Assumes that an optimizer has already been specified on all learnable
    * layers.
    *
    * Params:
    *    data = forward range of rows
    *    grad_f = gradient callback (see losses.d for details)
    *    verbose = whether or not to show progress during training
    *    num_cores = degree of Hogwild parallelism
    *
    * Throws: Exception if a learnable layer has no optimizer set.
    */
    void learn(D, T, V, R, S)(
            D data,
            // the return type has to be `S` (and not a fixed type):
            // otherwise `S` cannot be deduced from the arguments and this
            // overload cannot be called without explicit instantiation
            S delegate(R net_out, ref T ex, ref V[] grad) grad_f,
            bool verbose = false, uint num_cores = 1)
    {
        check_all_layers_have_optimizer();
        auto driver = new ShadowSGDOptimizer(this);

        learn(data, grad_f, driver, verbose, num_cores);
    }

    /**
    * Train neural network on some data, using a predefined loss.
    *
    * Assumes that an optimizer has already been specified on all learnable
    * layers.
    *
    * Params:
    *    data = forward range of rows
    *    loss = one of the predefined loss functions
    *    verbose = whether or not to show progress during training
    *    num_cores = degree of Hogwild parallelism
    *
    * Throws: Exception if a learnable layer has no optimizer set.
    */
    void learn(D)(D data, string loss, bool verbose = false, uint num_cores = 1)
    {
        check_all_layers_have_optimizer();
        auto driver = new ShadowSGDOptimizer(this);

        learn(data, loss, driver, verbose, num_cores);
    }

    /**
    * Textual description of the net: a header with the number of
    * parameters, followed by one `name|layer` line per layer.
    */
    override string toString()
    {
        string s = "NeuralNet[" ~ this.num_params.to!string ~ " parameters]\n";
        foreach(l; layers)
            s ~= (l.name ~ "|" ~ l.to!string ~ "\n");
        return s[0..$-1]; // drop the trailing newline
    }

    /**
    * Validate a user-provided layer name.
    *
    * Commas are rejected because edges are serialized as `parent,child`.
    *
    * Throws: Exception if the name is empty or contains a comma.
    */
    private void check_name(string name_)
    {
        if(name_.length == 0)
            throw new Exception("You must specify a non-empty name");
        else if(name_.canFind(','))
            throw new Exception(
                "Name of layers cannot contain commas: `" ~ name_ ~ "`.");
    }

    /**
    * Tell whether `layer` and all of its ancestors have at most one parent
    * and at most one child each.
    */
    static bool is_upstream_stack(NeuralLayer layer)
    {
        bool is_stack = layer.parents.length <= 1 && layer.children.length <= 1;
        foreach(p; layer.parents)
            is_stack &= is_upstream_stack(p);
        return is_stack;
    }

    /**
    * Make sure every learnable layer has an optimizer.
    *
    * Throws: Exception listing the names of the learnable layers without
    *         an optimizer, if any.
    */
    private void check_all_layers_have_optimizer()
    {
        string not_set;
        foreach(l; layers)
        {
            if(l.learnable && !l.optimizer_set())
                not_set ~= l.name ~ ",";
        }
        if(not_set != "")
            throw new Exception(
                "You haven't specified an optimizer for the following " ~
                "learnable layers: " ~ not_set[0..$-1]);
    }

    /// Default name for the next layer: `layer` followed by its 1-based rank.
    private string generate_name()
    {
        return "layer" ~ to!string(layers.length + 1);
    }

    /**
    * Dump the neural net (topology and weight values) to the specified path.
    *
    * On failure, the partially written file is removed (best effort).
    *
    * Params:
    *    path = where to dump the neural net.
    */
    void serialize(string path)
    {
        auto f = File(path, "w");
        scope(exit) f.close();
        scope(failure)
        {
            // close before removing the partially written file
            f.close();
            try
            {
                writeln("Serialization failed.");
                remove(path);
            }
            catch(FileException e)
            {
                writeln("Couldn't cleanup `", path,
                        "` after serialization failure: ", e);
            }
        }

        auto ser = new Serializer(&f);

        // serialize root names
        ser.write(roots.length.to!ulong);
        foreach(r; roots)
            ser.write(r.name);

        // serialize edges
        ser.write(edges.length.to!ulong);
        foreach(p; edges.byKeyValue())
        {
            ser.write(p.value.length.to!ulong);
            foreach(child; p.value)
            {
                ser.write(p.key ~ "," ~ child); // parent,child
            }
        }

        // now serialize layers
        foreach(l; layers)
            l.ser(ser);
    }

    /**
    * Deserialize the neural net from the specified path.
    *
    * Params:
    *    path = file path of the neural net to read.
    *
    * Returns: the net read from the file.
    *
    * Throws: Exception if the file does not exist.
    */
    static NeuralNet deserialize(string path)
    {
        if(!exists(path))
            throw new Exception("File does not exists: " ~ path);
        auto f = File(path, "r");
        // scope(exit) runs on success as well as on failure
        scope(exit) f.close();

        auto nn = new NeuralNet();

        auto deser = new Serializer(&f);

        // deserialize root names
        bool[string] root_names;
        auto num_roots = deser.read!ulong();
        foreach(_; 0..num_roots)
            root_names[deser.read!string()] = true;

        // deserialize edges
        string[][string] edges;
        auto num_parents = deser.read!ulong();
        foreach(_; 0..num_parents)
        {
            auto num_children = deser.read!ulong();
            foreach(__; 0..num_children)
            {
                auto edge = deser.read!string();
                auto toks = edge.split(','); // parent,child
                edges[toks[0]] ~= toks[1];
            }
        }

        // deserialize all layers
        auto layers = deser.deserialize_layers();
        foreach(l; layers)
        {
            if(l.name in root_names)
                nn.add_root(l.to!InputLayer);
            else
                nn.add(l);
        }

        // rebuild the topology without allocating parameters
        foreach(p; edges.byKeyValue())
            foreach(child; p.value)
                nn.wire(p.key, child, false);
        foreach(l; nn.layers)
        {
            if(l.type ==LayerT.DENSE)
                l.out_d.length = l.dim_out;
            l.allocate_interface();
        }

        return nn;
    }

    /**
    * Return a copy of the net.
    *
    * Params:
    *    topology_only = whether or not the copy should be shallow
    */
    NeuralNet dup(bool topology_only = false)
    {
        auto cp = new NeuralNet();

        // roots first, then every other layer
        bool[string] root_names;
        foreach(r; roots)
        {
            root_names[r.name] = true;
            cp.add_root(r.name, cast(InputLayer)r.dup);
        }
        foreach(l; layers)
        {
            if(l.name !in root_names)
            {
                auto lcp = l.dup;
                if(l.optimizer)
                    lcp.set_optimizer(l.optimizer.dup);
                cp.add(l.name, lcp);
            }
        }

        foreach(p; edges.byKeyValue())
            foreach(child; p.value)
                cp.wire(p.key, child, !topology_only);
        if(!topology_only)
            foreach(l; cp.layers)
            {
                l.allocate_interface();
                l.allocate_params();
            }
        return cp;
    }
}
732 
/// True when the member name `s` starts with `features`.
package template isFeaturesField(string s)
{
    enum isFeaturesField = s.startsWith("features");
}

/// True when at least one member of `T` has a name starting with `features`.
package template isLearnableRow(T)
{
    enum isLearnableRow = anySatisfy!(
        isFeaturesField, __traits(allMembers, T));
}
735 
736 
// For every Linear layer of the net, register the layer with each of its
// priors and, when it is set, with its `prox` member.
private void optimize_graph(NeuralNet net)
{
    foreach(node; net.layers)
    {
        auto lin = cast(Linear)node;
        if(lin is null)
            continue; // only Linear layers are concerned

        foreach(prior; lin.priors)
            prior.register(lin);

        if(lin.prox !is null)
            lin.prox.register(lin);
    }
}
750 
// Only compiled when assertions are enabled (i.e. not with -release):
// the module constructor instantiates ct_msg with a notice about the build.
version(assert) static this()
{
    ct_msg!("Non-release build.");
}