1 module ggplotd.stat;
2
3 version( unittest )
4 {
5 import dunit.toolkit;
6 }
7
8 import std.traits : isCallable;
9
10 /**
11 Create Aes based on a function
12
13 Params:
14 func = Function
15 min = x coordinate to start from
16 max = x coordinate to end at
17 precision = Number of points to calculate
18
19 Examples:
20 --------------
21 /// http://blackedder.github.io/ggplotd/images/function.png
22 import std.random : uniform;
23 import std.typecons : Tuple; import ggplotd.stat : statFunction;
24 import ggplotd.ggplotd : GGPlotD;
25 import ggplotd.geom : geomLine, geomPoint;
26 import ggplotd.range : mergeRange;
27
28 auto f = (double x) { return x / (1 + x); };
29
30 auto aes = statFunction(f, 0.0, 10);
31 auto gg = GGPlotD().put(geomLine(aes));
32
33 // Generate some noisy points
34 auto f2 = (double x) { return x / (1 + x) * uniform(0.75, 1.25); };
35 auto aes2 = f2.statFunction(0.0, 10, 25);
36
37 // Show points in different colour
38 auto withColour = Tuple!(string, "colour")("aquamarine").mergeRange(aes2);
39 gg = gg.put(withColour.geomPoint);
40
41 gg.save("function.png");
42 --------------
43
44 */
45 auto statFunction( FUNC, T )( FUNC func, T min,
46 T max, size_t precision = 50 ) if (isCallable!FUNC)
47 {
48 import std.array : array;
49 import std.algorithm : map;
50 import std.range : iota;
51
52 import ggplotd.aes : Aes;
53 auto stepSize = (max-min)/(precision-1);
54 auto xs = [min];
55 if (stepSize > 0)
56 xs = iota( min, max, stepSize ).array ~ max;
57 auto ys = xs.map!((x) => func(x));
58
59 return Aes!(typeof(xs), "x", typeof(ys), "y")( xs, ys );
60 }
61
62 unittest
63 {
64 import std.array : array;
65 import std.algorithm : map;
66
67 auto aes = statFunction( (double x) { return 2*x; }, 0.0, 1, 2 );
68 assertEqual( aes.map!("a.x").array, [0.0, 1.0] );
69 assertEqual( aes.map!("a.y").array, [0.0, 2.0] );
70
71 aes = statFunction( (double x) { return 3*x; }, -1.0, 1, 3 );
72 assertEqual( aes.map!("a.x").array, [-1.0, 0.0, 1.0] );
73 assertEqual( aes.map!("a.y").array, [-3.0, 0.0, 3.0] );
74 }
75
76 /+++++++++++
77 Binning
78 +++++++++++/
79
80 private auto binID(T)( T value, T min, T max, T width )
81 {
82 import std.conv : to;
83 import std.math : floor;
84 assert( min != max, "Minimum value of bin should differ from maximum" );
85 assert( width != 0, "Bin width can't be 0" );
86
87 import std.stdio : writeln;
88 if ( !(value <= max && value >= min) )
89 writeln( value, " ", min, " ", max );
90
91 assert( value <= max && value >= min, "Value must be within given range" );
92 if (value==max) // Corner case for highest value
93 value -= 0.1*width; // Might not work correctly for integers
94 auto id = floor((value - min) / width);
95 return id.to!int;
96 }
97
98 unittest
99 {
100 import std.algorithm : map;
101 import std.array : array;
102 assertEqual(
103 [0.1,0.3,0.6,0.2,0.4].map!((a) => a.binID( 0, 0.6, 0.2 )).array,
104 [0,1,2,1,2]
105 );
106 // TODO: add tests for integers etc.
107 }
108
109 private auto idToRange(T)( size_t value, T min, T width )
110 {
111 auto base = min + value*width;
112 return [ base, base + width ];
113 }
114
115 // Multidimensional bin. Data either is range of range, with each subrange
116 // a set of data. TODO: should we treat one dimensional data differently?
117 private template bin(DATA)
118 {
119 struct Bin
120 {
121 double[][] range;
122 size_t count;
123 }
124
125 auto bin(DATA data, double[] mins, double[] maxs,
126 size_t[] noBins)
127 {
128 import std.range : zip;
129 import std.algorithm : filter, all, group, map;
130 import ggplotd.range : groupBy;
131 assert( noBins.all!((a) => a > 0), "noBins must be larger than 0" );
132
133 auto widths = zip(mins, maxs, noBins).map!((t) => (t[1]-t[0])/(t[2]));
134
135 auto binIDs = data
136 .filter!((sample)
137 {
138 return zip(sample, mins, maxs).
139 all!( (args) => (args[0] >= args[1] && args[0] <= args[2]));
140 })
141 .map!((sample)
142 {
143 import std.array : array; // Needed for groupBy to work correctly
144 return zip(sample, mins, maxs, widths)
145 .map!( (args) => binID( args[0], args[1], args[2], args[3] ) ).array;
146 } );
147
148 import std.typecons : tuple;
149
150 return binIDs.groupBy!((a) => tuple(a)).values.map!((g)
151 {
152 import std.array : array;
153 Bin bin;
154 bin.count = g.length;
155 bin.range = zip( g[0], mins, widths )
156 .map!((args) => idToRange( args[0], args[1], args[2] ) )
157 .array;
158 return bin;
159 });
160 }
161 }
162
163 unittest {
164 import std.algorithm : reduce;
165 auto bins = bin( [[0.0],[0.1],[0.2],[0.2],[0.01],[0.3]], [0.0], [0.3], [10] );
166 assertEqual( 6, reduce!((a,b) => a += b.count )( 0, bins ) );
167 auto bins2 = bin( [[0.0,100],[0.1,101],[0.2,109],[0.01,110],[0.3,103.1]], [0.0,100], [0.3,110], [10,10] );
168 assertEqual( 5, reduce!((a,b) => a += b.count )( 0, bins2 ) );
169 }
170
171 private auto bin(DATA)(DATA data, double min, double max,
172 size_t noBins)
173 {
174 import std.algorithm : map;
175 return bin( data.map!((d) => [d]), [min], [max], [noBins] );
176 }
177
178 unittest {
179 import std.algorithm : reduce;
180 auto bins = bin( [0.0,0.1,0.2,0.2,0.01,0.3], 0.0, 0.3, 10 );
181 assertEqual( 6, reduce!((a,b) => a += b.count )( 0, bins ) );
182
183 import std.algorithm : map;
184 import std.array : array;
185 import std.random : uniform;
186 import std.range : iota, walkLength;
187 auto xs = iota(0,10,1).map!((i)=>uniform(0,0.75)+uniform(0.25,1)).array;
188 auto binsR = bin( xs, 0.0, 2.0, 5 );
189 assertEqual( xs.walkLength, reduce!((a,b) => a += b.count )( 0, binsR ) );
190
191 binsR = bin( xs, 0.0, 1.0, 5 );
192 assertLessThanOrEqual( binsR.walkLength, 5 );
193 }
194
195 private template statHistND(int dim, AES)
196 {
197 import std.algorithm : map;
198 import ggplotd.aes : Aes;
199 import ggplotd.range : mergeRange;
200
201 struct VolderMort(AESV)
202 {
203 private import std.range : ElementType;
204 private import std.typecons : Tuple;
205 private import ggplotd.aes : group;
206
207 typeof(group(AESV.init)) grouped;
208 ElementType!(ElementType!(typeof(grouped))) defaults;
209 typeof(bin!(double[][])((double[][]).init, (double[]).init,
210 (double[]).init, (size_t[]).init))
211 binned;
212
213 size_t[] _noBins;
214 double[] mins;
215 double[] maxs;
216
217 this( AESV aes, size_t[] noBins )
218 {
219 import std.algorithm : min, max, reduce;
220 import std.array : array;
221 import std.conv : to;
222 import std.range : empty, popFront, front, take, walkLength;
223 import std.typecons : tuple;
224
225 import ggplotd.algorithm : safeMin, safeMax;
226 import ggplotd.range : uniquer;
227
228 static assert( dim == 1 || dim == 2, "Only dimension of 1 or 2 i supported" );
229
230 _noBins = noBins;
231 static if (dim == 1) {
232 if (_noBins[0] < 1)
233 _noBins[0] = min(aes.map!((a) => a.x.to!double)
234 .uniquer.take(30).walkLength,
235 min(30,max(11, aes.walkLength/10)));
236 auto seed = tuple(
237 aes.front.x.to!double, aes.front.x.to!double );
238 auto minmax = reduce!((a,b) => safeMin(a,b.x.to!double),
239 (a,b) => safeMax(a,b.x.to!double))(seed,aes);
240 mins = [minmax[0]];
241 maxs = [minmax[1]];
242 }
243 else
244 {
245 if (_noBins[0] < 1)
246 _noBins[0] = min(aes.map!((a) => a.x.to!double)
247 .uniquer.take(30).walkLength,
248 min(30,max(11, aes.walkLength/25)));
249 if (_noBins[1] < 1)
250 _noBins[1] = min(aes.map!((a) => a.y.to!double)
251 .uniquer.take(30).walkLength,
252 min(30,max(11, aes.walkLength/25)));
253 auto seed = tuple(
254 aes.front.x.to!double, aes.front.x.to!double,
255 aes.front.y.to!double, aes.front.y.to!double );
256 auto minmax = reduce!(
257 (a,b) => safeMin(a,b.x.to!double),
258 (a,b) => safeMax(a,b.x.to!double),
259 (a,b) => safeMin(a,b.y.to!double),
260 (a,b) => safeMax(a,b.y.to!double)
261 )(seed,aes);
262 mins = [minmax[0],minmax[2]];
263 maxs = [minmax[1],minmax[3]];
264 }
265
266 grouped = group(aes);
267
268 foreach( i; 0..mins.length ) {
269 if (mins[i] == maxs[i]) {
270 mins[i] -= 0.5;
271 maxs[i] += 0.5;
272 }
273 }
274
275 defaults = grouped.front.front;
276 static if (dim == 1)
277 auto data = grouped.front.map!((t) => [t.x.to!double]); // Extract the x coordinates
278 else
279 auto data = grouped.front.map!((t) => [t.x.to!double,t.y.to!double]); // Extract the x coordinates
280 binned = bin(
281 data.array,
282 mins, maxs,
283 _noBins);
284 assert( !grouped.empty, "Groups should not be empty" );
285 grouped.popFront;
286 }
287
288 @property bool empty() {
289 import std.range : empty;
290 return (grouped.empty && binned.empty); }
291
292 @property auto front()
293 {
294 import ggplotd.aes : merge;
295 import std.conv : to;
296 static if (dim == 1)
297 {
298 auto w = binned.front.range[0][1]-binned.front.range[0][0];
299 return defaults.merge(
300 Tuple!(double, "x", double, "y", double, "width", double, "height")(
301 binned.front.range[0][0] + 0.5*w, 0.5*binned.front.count,
302 w, binned.front.count) );
303 }
304 else
305 {
306 // Returning polygons. In theory could also return rectangle, but their lines
307 // will overlap which will make it look wrong. Ideally would be able to set
308 // linewidth to 0 for the rectangles and that would solve it.
309 return [
310 defaults.merge(
311 Tuple!(double, "x", double, "y", double, "colour")(
312 binned.front.range[0][0], binned.front.range[1][0],
313 binned.front.count.to!double)),
314
315 defaults.merge(
316 Tuple!(double, "x", double, "y", double, "colour")(
317 binned.front.range[0][1], binned.front.range[1][0],
318 binned.front.count.to!double)),
319
320 defaults.merge(
321 Tuple!(double, "x", double, "y", double, "colour")(
322 binned.front.range[0][1], binned.front.range[1][1],
323 binned.front.count.to!double)),
324
325 defaults.merge(
326 Tuple!(double, "x", double, "y", double, "colour")(
327 binned.front.range[0][0], binned.front.range[1][1],
328 binned.front.count.to!double)),
329
330 ];
331 }
332 }
333
334 void popFront()
335 {
336 import std.array : array;
337 import std.conv : to;
338 import std.range : empty, front, popFront;
339 if (!binned.empty)
340 binned.popFront;
341 if (binned.empty && !grouped.empty)
342 {
343 defaults = grouped.front.front;
344 static if (dim == 1) // Extract the coordinates
345 auto data = grouped.front.map!((t) => [t.x.to!double]);
346 else
347 auto data = grouped.front.map!((t) => [t.x.to!double,t.y.to!double]);
348 binned = bin(data.array, mins, maxs, _noBins);
349 grouped.popFront;
350 }
351 }
352 }
353
354 auto statHistND(AES aesRange, size_t[] noBins)
355 {
356 // Get maxs, mins and noBins
357 return VolderMort!(typeof(aesRange))( aesRange, noBins );
358 }
359 }
360
361 /**
362 Create Aes that specifies the bins to draw an histogram
363
364 Params:
365 aesRaw = Data that the histogram will be based on
366 noBins = Optional number of bins. On a value of 0 the number of bins will be chosen automatically.
367
368 Returns: Range that holds rectangles representing different bins
369 */
370 auto statHist(AES)(AES aesRaw, size_t noBins = 0)
371 {
372 return statHistND!(1,AES)( aesRaw, [noBins] );
373 }
374
375 unittest
376 {
377 import std.algorithm : each, map, sort;
378 import std.array : array;
379 import std.random : uniform;
380 import std.range : iota, walkLength;
381
382 import ggplotd.aes : Aes;
383
384 auto xs = iota(0,100,1).map!((a) => uniform(0,6) ).array;
385 auto sh = statHist( Aes!(int[], "x")(xs) );
386 auto binXs = sh.map!((b) => b.x).array.sort().array;
387 assertEqual( binXs.length, 6 );
388
389 // Test single point
390 xs = [1];
391 sh = statHist( Aes!(int[], "x")(xs) );
392 assertEqual(sh.walkLength, 1);
393 }
394
395 /**
396 Create Aes that specifies the bins to draw an histogram
397
398 Params:
399 aesRange = Data that the histogram will be based on
400 noBinsX = Optional number of bins for x axis. On a value of 0 the number of bins will be chosen automatically.
401 noBinsY = Optional number of bins for y axis. On a value of 0 the number of bins will be chosen automatically.
402
403 Returns: Range that holds rectangles representing different bins
404 */
405 auto statHist2D(AES)(AES aesRange, size_t noBinsX = 0, size_t noBinsY = 0)
406 {
407 return statHistND!(2,AES)( aesRange, [noBinsX, noBinsY]);
408 }
409
410 /**
411 Calculate kernel density for given data
412
413 Params:
414 aesRange = Data that the histogram will be based on
415
416 Returns: InputRange that holds x and y coordinates for the kernel
417 */
418 auto statDensity(AES)( AES aesRange )
419 {
420 import std.algorithm : joiner, map, min, max, reduce;
421 import std.conv : to;
422 import std.range : chain, front;
423 import std.typecons : tuple, Tuple;
424 import ggplotd.aes : Aes, group;
425 import ggplotd.range : mergeRange;
426 import ggplotd.algorithm : safeMin, safeMax;
427
428 auto minmax = reduce!(
429 (a,b) => safeMin(a,b.x.to!double),
430 (a,b) => safeMax(a,b.x.to!double))(tuple(double.init,double.init), aesRange);
431 auto margin = (minmax[1] - minmax[0])/10.0;
432 minmax[0] -= margin;
433 minmax[1] += margin;
434
435
436
437 return aesRange.group.map!((g) {
438 auto xs = g.map!((t) => t.x.to!double);
439
440 // Calculate the kernel width (using scott thing in dstats)
441 // Initialize kernel with normal distribution.
442 import std.math : isFinite;
443 import dstats.kerneldensity : scottBandwidth, KernelDensity;
444 import dstats.distrib : normalPDF;
445 auto sigma = scottBandwidth(xs);
446 if (!isFinite(sigma) || sigma <= 0)
447 sigma = 0.5;
448 auto kernel = (double x) { return normalPDF(x, 0.0, sigma); };
449 auto density = KernelDensity.fromCallable(kernel, xs);
450
451 // Use statFunction with the kernel to produce a line
452 // Also add points to close the path down to zero
453 auto coords = chain(
454 [Tuple!(double, "x", double, "y")( minmax[0], 0.0 )],
455 statFunction( density, minmax[0], minmax[1] ),
456 [Tuple!(double, "x", double, "y")( minmax[1], 0.0 )]);
457
458
459 return g.front.mergeRange( coords );
460 }).joiner;
461 }
462
463 ///
464 unittest
465 {
466 import std.stdio : writeln;
467 import std.algorithm : map;
468 import std.array : array;
469 import std.random : uniform;
470 import std.range : chain, iota, repeat, walkLength;
471
472 import ggplotd.aes : Aes;
473
474 auto xs = iota(0,100,1)
475 .map!((i)=>uniform(0,0.75)+uniform(0.25,1))
476 .array;
477
478 auto dens = statDensity( Aes!( typeof(xs), "x")( xs ) );
479 auto dim = dens.walkLength;
480 assertGreaterThan( dim, 1 );
481
482 // Test that grouping leads to longer (twice as long) result
483 auto cols = chain("a".repeat(50),"b".repeat(50) );
484 auto dens2 = statDensity( Aes!(typeof(cols), "colour", typeof(xs), "x")( cols, xs ) );
485 assertGreaterThan( dens2.walkLength, 1.9*dim );
486 assertLessThan( dens2.walkLength, 2.1*dim );
487
488 // Test that colour is passed through (merged)
489 assertEqual( dens2.front.colour.length, 1 );
490
491 // Test single point
492 xs = [1];
493 dens = statDensity( Aes!( typeof(xs), "x")( xs ) );
494 assertEqual(dens.walkLength, 3);
495 }
496
497 /**
498 Calculate kernel density for given x and y data
499
500 Params:
501 aesRange = Data that the histogram will be based on
502
503 Returns: Range of ranges that holds polygons for the kernel
504 */
505 auto statDensity2D(AES)( AES aesRange )
506 {
507 import std.algorithm : joiner, map, min, max, reduce;
508 import std.array : array;
509 import std.conv : to;
510 import std.range : chain, front, iota, zip;
511 import std.meta : Erase;
512 import std.typecons : tuple, Tuple;
513 import ggplotd.aes : Aes, group, DefaultGroupFields;
514 import ggplotd.algorithm : safeMin, safeMax;
515 import ggplotd.range : mergeRange;
516
517 auto minmax = reduce!(
518 (a,b) => safeMin(a,b.x.to!double),
519 (a,b) => safeMax(a,b.x.to!double),
520 (a,b) => safeMin(a,b.y.to!double),
521 (a,b) => safeMax(a,b.y.to!double),
522 )(tuple(double.init,double.init,double.init,double.init), aesRange);
523
524 if (minmax[0] == minmax[1]) {
525 minmax[0] -= 0.5;
526 minmax[1] += 0.5;
527 }
528 if (minmax[2] == minmax[3]) {
529 minmax[2] -= 0.5;
530 minmax[3] += 0.5;
531 }
532
533 return aesRange.group!(Erase!("colour",DefaultGroupFields)).map!((g) {
534 auto xs = g.map!((t) => t.x.to!double);
535 auto ys = g.map!((t) => t.y.to!double);
536
537 import std.math : isFinite;
538 // Calculate the kernel width (using scott thing in dstats)
539 // Initialize kernel with normal distribution.
540 import dstats.kerneldensity : scottBandwidth, KernelDensity;
541 import dstats.distrib : normalPDF;
542 auto sigmaX = scottBandwidth(xs);
543 if (!isFinite(sigmaX) || sigmaX <= 0)
544 sigmaX = 1e-5;
545 auto sigmaY = scottBandwidth(ys);
546 if (!isFinite(sigmaY) || sigmaY <= 0)
547 sigmaY = 1e-5;
548
549 auto kernel = (double x, double y) { return normalPDF(x, 0.0, sigmaX)*
550 normalPDF(y, 0.0, sigmaY); };
551 auto density = KernelDensity.fromCallable(kernel, xs, ys);
552
553 auto marginX = (minmax[1] - minmax[0])/10.0;
554 auto marginY = (minmax[3] - minmax[2])/10.0;
555
556 minmax[0] -= marginX;
557 minmax[1] += marginX;
558 minmax[2] -= marginY;
559 minmax[3] += marginY;
560
561
562 // Use statFunction with the kernel to produce a line
563 // Also add points to close the path down to zero
564 auto stepX = (minmax[1] - minmax[0])/25.0;
565 auto stepY = (minmax[3] - minmax[2])/25.0;
566 auto xCoords = iota( minmax[0], minmax[1], stepX ).array ~ minmax[1];
567 auto yCoords = iota( minmax[2], minmax[3], stepY ).array ~ minmax[3];
568 auto coords = zip(xCoords, xCoords[1..$]).map!( (xr) {
569 return zip(yCoords, yCoords[1..$]).map!( (yr) {
570 // Two polygons
571 return [
572 g.front.mergeRange(Aes!( double[], "x", double[], "y", double[], "colour" )
573 ([xr[0], xr[0], xr[1]],
574 [yr[0], yr[1], yr[1]],
575 [density(xr[0],yr[0]),density(xr[0],yr[1]),density(xr[1],yr[1])])),
576 g.front.mergeRange(
577 Aes!( double[], "x", double[], "y", double[], "colour" )
578 ([xr[0], xr[1], xr[1]],
579 [yr[0], yr[0], yr[1]],
580 [density(xr[0],yr[0]),density(xr[1],yr[0]),density(xr[1],yr[1])]))
581 ];
582 }).joiner;
583 }).joiner;
584 return coords;
585 //return g;
586 }).joiner;
587 }
588
589 ///
590 unittest
591 {
592 import std.algorithm : map;
593 import std.array : array;
594 import std.random : uniform;
595 import std.range : iota, walkLength;
596
597 import ggplotd.aes : Aes;
598 auto xs = iota(0,500,1).map!((x) => uniform(0.0,5)+uniform(0.0,5))
599 .array;
600 auto ys = iota(0,500,1).map!((y) => uniform(1.0,1.5)+uniform(1.0,1.5))
601 .array;
602 auto aes = Aes!(typeof(xs), "x", typeof(ys), "y")( xs, ys);
603
604 auto sD = statDensity2D( aes );
605 assertGreaterThan( sD.walkLength, 1150 );
606 assertLessThan( sD.walkLength, 1450 );
607 assertEqual( sD.front.walkLength, 3 );
608
609 // One value
610 xs = [1];
611 ys = [2];
612 aes = Aes!(typeof(xs), "x", typeof(ys), "y")( xs, ys);
613
614 sD = statDensity2D( aes );
615 assertGreaterThan(sD.walkLength, 0);
616 }