1 /** 2 * Utility functions for ML-related tasks. 3 * 4 * Copyright: 2017 Netflix, Inc. 5 * License: $(LINK2 http://www.apache.org/licenses/LICENSE-2.0, Apache License Version 2.0) 6 */ 7 module vectorflow.ml; 8 9 private{ 10 import std.algorithm : map, reverse, sort; 11 import std.array; 12 import std.conv : to; 13 import std.range : evenChunks; 14 import std.range.primitives; 15 import std.traits; 16 import std.typecons : Tuple, tuple; 17 } 18 19 20 /** 21 * Compute ROC curve and AUC. Returns the AUC 22 * 23 * Params: 24 * data = forward range of (true label, prediction) 25 * roc_curve = will populate the ROC curve in this array 26 * num_cutoff = number of points in the ROC curve 27 */ 28 double computeROCandAUC(R)(R data, out Tuple!(float, float)[] roc_curve, 29 ulong num_cutoff = 400) 30 if(isForwardRange!R) 31 { 32 // sort by predictions 33 auto sorted = data.sort!((a, b) => a[1] < b[1]).map!(x => x[0]); 34 float all_pos = 0; 35 float all_neg = 0; 36 foreach(l; sorted) 37 { 38 if(l > 0) 39 all_pos++; 40 else 41 all_neg++; 42 } 43 44 auto chunks = evenChunks(sorted, num_cutoff); 45 46 roc_curve.length = 0; 47 48 float sum_pos = 0; 49 float sum_neg = 0; 50 foreach(chunk; chunks) 51 { 52 foreach(l; chunk) 53 { 54 if(l > 0) 55 { 56 sum_pos++; 57 all_pos--; 58 } 59 else 60 { 61 sum_neg++; 62 all_neg--; 63 } 64 } 65 float tpr = all_pos / (all_pos + sum_pos); 66 float tnr = sum_neg / (all_neg + sum_neg); 67 roc_curve ~= tuple(1.0f - tnr, tpr); 68 } 69 reverse(roc_curve); 70 71 double auc = 0.0; 72 // origin triangle first 73 float h = roc_curve[0][1] - 0; 74 float w = roc_curve[0][0] - 0; 75 auc += h * w / 2; 76 77 // then all other triangles 78 foreach(i; 0..roc_curve.length - 1) 79 { 80 float height = roc_curve[i + 1][1] - roc_curve[i][1]; 81 float width = roc_curve[i + 1][0] - roc_curve[i][0]; 82 auc += height * width / 2 + roc_curve[i][1] * width; 83 } 84 // then endpoint triangle 85 h = 1.0 - roc_curve[$-1][1]; 86 w = 1.0 - roc_curve[$-1][0]; 87 auc += h * w / 2 + roc_curve[$-1][1] * w; 88 89 return auc; 90 } 91 92 93 /** 94 * Compute the histogram of an InputRange using uniform bins 95 * 96 * Params: 97 * vals = InputRange of numerical values 98 * num_bins = number of bins in the histogram returned 99 * bin_min = lower bound of the histogram range 100 * bin_max = higher bound of the histogram range 101 * bins = intervals [a, b[ used for binning 102 * normalized = whether or not the histogram is normalized by its sum 103 */ 104 float[] histogram(D)(D vals, ushort num_bins, float bin_min, float bin_max, 105 out float[] bins, bool normalized = false) 106 if(isInputRange!D && isNumeric!(ElementType!D)) 107 { 108 auto hist = new float[num_bins]; 109 hist[] = 0; 110 111 auto bins_left_bound = new float[num_bins]; 112 foreach(i; 0..num_bins) 113 bins_left_bound[i] = bin_min + (bin_max - bin_min) * i.to!float / (num_bins - 1); 114 bins = bins_left_bound; 115 116 foreach(ref v; vals) 117 { 118 ulong bin_ind = 0; 119 while(bin_ind < bins_left_bound.length && v >= bins_left_bound[bin_ind]) 120 ++bin_ind; 121 if(bin_ind < bins_left_bound.length) 122 bin_ind--; 123 if(bin_ind >= 0 && bin_ind < bins_left_bound.length) 124 hist[bin_ind] += 1; 125 } 126 if(normalized) 127 hist[] /= hist.sum(); 128 129 return hist; 130 } 131