vectorflow.ml source code

1 /**
2  * Utility functions for ML-related tasks.
3  *
4  * Copyright: 2017 Netflix, Inc.
5  * License: $(LINK2 http://www.apache.org/licenses/LICENSE-2.0, Apache License Version 2.0)
6  */
7 module vectorflow.ml;
8 
9 private{
10 import std.algorithm : map, reverse, sort;
11 import std.array;
12 import std.conv : to;
13 import std.range : evenChunks;
14 import std.range.primitives;
15 import std.traits;
16 import std.typecons : Tuple, tuple;
17 }
18 
19 
20 /**
21 * Compute ROC curve and AUC. Returns the AUC
22 *
23 * Params:
24 *    data = forward range of (true label, prediction)
25 *    roc_curve = will populate the ROC curve in this array
26 *    num_cutoff = number of points in the ROC curve
27 */
28 double computeROCandAUC(R)(R data, out Tuple!(float, float)[] roc_curve,
29                            ulong num_cutoff = 400)
30     if(isForwardRange!R)
31 {
32     // sort by predictions
33     auto sorted = data.sort!((a, b) => a[1] < b[1]).map!(x => x[0]);
34     float all_pos = 0;
35     float all_neg = 0;
36     foreach(l; sorted)
37     {
38         if(l > 0)
39             all_pos++;
40         else
41             all_neg++;
42     }
43 
44     auto chunks = evenChunks(sorted, num_cutoff);
45 
46     roc_curve.length = 0;
47 
48     float sum_pos = 0;
49     float sum_neg = 0;
50     foreach(chunk; chunks)
51     {
52         foreach(l; chunk)
53         {
54             if(l > 0)
55             {
56                 sum_pos++;
57                 all_pos--;
58             }
59             else
60             {
61                 sum_neg++;
62                 all_neg--;
63             }
64         }
65         float tpr = all_pos / (all_pos + sum_pos);
66         float tnr = sum_neg / (all_neg + sum_neg);
67         roc_curve ~= tuple(1.0f - tnr, tpr);
68     }
69     reverse(roc_curve);
70 
71     double auc = 0.0;
72     // origin triangle first
73     float h = roc_curve[0][1] - 0;
74     float w = roc_curve[0][0] - 0;
75     auc += h * w / 2;
76 
77     // then all other triangles
78     foreach(i; 0..roc_curve.length - 1)
79     {
80         float height = roc_curve[i + 1][1] - roc_curve[i][1];
81         float width = roc_curve[i + 1][0] - roc_curve[i][0];
82         auc += height * width / 2 + roc_curve[i][1] * width;
83     }
84     // then endpoint triangle
85     h = 1.0 - roc_curve[$-1][1];
86     w = 1.0 - roc_curve[$-1][0];
87     auc += h * w / 2 + roc_curve[$-1][1] * w;
88 
89     return auc;
90 }
91 
92 
93 /**
94 * Compute the histogram of an InputRange using uniform bins
95 *
96 * Params:
97 *    vals = InputRange of numerical values
98 *    num_bins = number of bins in the histogram returned
99 *    bin_min = lower bound of the histogram range
100 *    bin_max = higher bound of the histogram range
101 *    bins = intervals [a, b[ used for binning
102 *    normalized = whether or not the histogram is normalized by its sum
103 */
104 float[] histogram(D)(D vals, ushort num_bins, float bin_min, float bin_max,
105                      out float[] bins, bool normalized = false)
106     if(isInputRange!D && isNumeric!(ElementType!D))
107 {
108     auto hist = new float[num_bins];
109     hist[] = 0;
110 
111     auto bins_left_bound = new float[num_bins];
112     foreach(i; 0..num_bins)
113         bins_left_bound[i] = bin_min + (bin_max - bin_min) * i.to!float / (num_bins - 1);
114     bins = bins_left_bound;
115 
116     foreach(ref v; vals)
117     {
118         ulong bin_ind = 0;
119         while(bin_ind < bins_left_bound.length && v >= bins_left_bound[bin_ind])
120             ++bin_ind;
121         if(bin_ind < bins_left_bound.length)
122             bin_ind--;
123         if(bin_ind >= 0 && bin_ind < bins_left_bound.length)
124             hist[bin_ind] += 1;
125     }
126     if(normalized)
127         hist[] /= hist.sum();
128 
129     return hist;
130 }
131