Skip to content

Commit 6a442e3

Browse files
mbostockFil
andauthored
linear regression with confidence bands (#945)
* linear regression with confidence bands * handle invalid p; adopt marks * fix fill, stroke, z * fix tests * fit regression extent to data * configurable precision * combine regression marks * fix combined mark order * add penguins test * rename tests * remove unnecessary sort * linearRegressionX * make band optional * tentative documentation for Plot.linearRegression * fix band-line order * ci, not p Co-authored-by: Philippe Rivière <[email protected]>
1 parent 90d2238 commit 6a442e3

15 files changed

+1428
-2
lines changed

README.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1132,6 +1132,39 @@ Plot.image(presidents, {x: "inauguration", y: "favorability", src: "portrait"})
11321132
11331133
Returns a new image with the given *data* and *options*. If neither the **x** nor **y** nor **frameAnchor** options are specified, *data* is assumed to be an array of pairs [[*x₀*, *y₀*], [*x₁*, *y₁*], [*x₂*, *y₂*], …] such that **x** = [*x₀*, *x₁*, *x₂*, …] and **y** = [*y₀*, *y₁*, *y₂*, …].
11341134
1135+
1136+
### Linear regression
1137+
1138+
[<img src="./img/linear-regression.png" width="600" alt="a scatterplot of penguin culmens, showing the length and depth of several species, with linear regression models by species and for the whole population, illustrating Simpson’s paradox">](https://observablehq.com/@observablehq/plot-linear-regression)
1139+
1140+
[Source](./src/marks/linearRegression.js) · [Examples](https://observablehq.com/@observablehq/plot-linear-regression) · Draws [linear regression](https://en.wikipedia.org/wiki/Linear_regression) lines with confidence bands, representing the estimated relation of a dependent variable (typically *y*) on an independent variable (typically *x*). The linear regression line is fit using the [least squares](https://en.wikipedia.org/wiki/Least_squares) approach. See Torben Jansen’s [“Linear regression with confidence bands”](https://observablehq.com/@toja/linear-regression-with-confidence-bands) and [this StatExchange question](https://stats.stackexchange.com/questions/101318/understanding-shape-and-calculation-of-confidence-bands-in-linear-regression) for details on the confidence interval calculation.
1141+
1142+
The given *options* are passed through to these underlying marks, with the exception of the following options:
1143+
1144+
* **stroke** - the stroke color of the regression line; defaults to *currentColor*
1145+
* **fill** - the fill color of the confidence band; defaults to the line’s *stroke*
1146+
* **fillOpacity** - the fill opacity of the confidence band; defaults to 0.1
1147+
* **ci** - the confidence interval in [0, 1), or 0 to hide bands; defaults to 0.95
1148+
* **precision** - the distance (in pixels) between samples of the confidence band; defaults to 4
1149+
1150+
Multiple regressions can be defined by specifying the *z*, *fill*, or *stroke* channel.
1151+
1152+
#### Plot.linearRegressionX(*data*, *options*)
1153+
1154+
```js
1155+
Plot.linearRegressionX(mtcars, {y: "wt", x: "hp"})
1156+
```
1157+
1158+
Returns a linear regression mark where *x* is the dependent variable and *y* is the independent variable.
1159+
1160+
#### Plot.linearRegressionY(*data*, *options*)
1161+
1162+
```js
1163+
Plot.linearRegressionY(mtcars, {x: "wt", y: "hp"})
1164+
```
1165+
1166+
Returns a linear regression mark where *y* is the dependent variable and *x* is the independent variable.
1167+
11351168
### Line
11361169
11371170
[<img src="./img/line.png" width="320" height="198" alt="a line chart">](https://observablehq.com/@observablehq/plot-line)

img/linear-regression.png

37.1 KB
Loading

src/index.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ export {Frame, frame} from "./marks/frame.js";
1010
export {Hexgrid, hexgrid} from "./marks/hexgrid.js";
1111
export {Image, image} from "./marks/image.js";
1212
export {Line, line, lineX, lineY} from "./marks/line.js";
13+
export {linearRegressionX, linearRegressionY} from "./marks/linearRegression.js";
1314
export {Link, link} from "./marks/link.js";
1415
export {Rect, rect, rectX, rectY} from "./marks/rect.js";
1516
export {RuleX, RuleY, ruleX, ruleY} from "./marks/rule.js";

src/marks/linearRegression.js

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
import {create, extent, range, sum, area as shapeArea, namespaces} from "d3";
2+
import {identity, indexOf, isNone, isNoneish, maybeZ} from "../options.js";
3+
import {Mark} from "../plot.js";
4+
import {qt} from "../stats.js";
5+
import {applyDirectStyles, applyGroupedChannelStyles, applyIndirectStyles, applyTransform, groupZ, offset} from "../style.js";
6+
import {maybeDenseIntervalX, maybeDenseIntervalY} from "../transforms/bin.js";
7+
8+
const defaults = {
9+
ariaLabel: "linear-regression",
10+
fill: "currentColor",
11+
fillOpacity: 0.1,
12+
stroke: "currentColor",
13+
strokeWidth: 1.5,
14+
strokeLinecap: "round",
15+
strokeLinejoin: "round",
16+
strokeMiterlimit: 1
17+
};
18+
19+
class LinearRegression extends Mark {
20+
constructor(data, options = {}) {
21+
const {x, y, z, ci = 0.95, precision = 4} = options;
22+
super(
23+
data,
24+
[
25+
{name: "x", value: x, scale: "x"},
26+
{name: "y", value: y, scale: "y"},
27+
{name: "z", value: maybeZ(options), optional: true}
28+
],
29+
options,
30+
defaults
31+
);
32+
this.z = z;
33+
this.ci = +ci;
34+
this.precision = +precision;
35+
if (!(0 <= this.ci && this.ci < 1)) throw new Error(`invalid ci; not in [0, 1): ${ci}`);
36+
if (!(this.precision > 0)) throw new Error(`invalid precision: ${precision}`);
37+
}
38+
render(I, {x, y}, channels, dimensions) {
39+
const {x: X, y: Y, z: Z} = channels;
40+
const {dx, dy, ci} = this;
41+
return create("svg:g")
42+
.call(applyIndirectStyles, this, dimensions)
43+
.call(applyTransform, x, y, offset + dx, offset + dy)
44+
.call(g => g.selectAll()
45+
.data(Z ? groupZ(I, Z, this.z) : [I])
46+
.enter()
47+
.call(enter => enter.append("path")
48+
.attr("fill", "none")
49+
.call(applyDirectStyles, this)
50+
.call(applyGroupedChannelStyles, this, {...channels, fill: null, fillOpacity: null})
51+
.attr("d", I => this._renderLine(I, X, Y))
52+
.call(ci && !isNone(this.fill) ? path => path.select(pathBefore)
53+
.attr("stroke", "none")
54+
.call(applyDirectStyles, this)
55+
.call(applyGroupedChannelStyles, this, {...channels, stroke: null, strokeOpacity: null, strokeWidth: null})
56+
.attr("d", I => this._renderBand(I, X, Y)) : () => {})))
57+
.node();
58+
}
59+
}
60+
61+
function pathBefore() {
62+
return this.parentNode.insertBefore(this.ownerDocument.createElementNS(namespaces.svg, "path"), this);
63+
}
64+
65+
class LinearRegressionX extends LinearRegression {
66+
constructor(data, options) {
67+
super(data, options);
68+
}
69+
_renderBand(I, X, Y) {
70+
const {ci, precision} = this;
71+
const [y1, y2] = extent(I, i => Y[i]);
72+
const f = linearRegressionF(I, Y, X);
73+
const g = confidenceIntervalF(I, Y, X, (1 - ci) / 2, f);
74+
return shapeArea()
75+
.y(y => y)
76+
.x0(y => g(y, -1))
77+
.x1(y => g(y, +1))
78+
(range(y1, y2 - precision / 2, precision).concat(y2));
79+
}
80+
_renderLine(I, X, Y) {
81+
const [y1, y2] = extent(I, i => Y[i]);
82+
const f = linearRegressionF(I, Y, X);
83+
return `M${f(y1)},${y1}L${f(y2)},${y2}`;
84+
}
85+
}
86+
87+
class LinearRegressionY extends LinearRegression {
88+
constructor(data, options) {
89+
super(data, options);
90+
}
91+
_renderBand(I, X, Y) {
92+
const {ci, precision} = this;
93+
const [x1, x2] = extent(I, i => X[i]);
94+
const f = linearRegressionF(I, X, Y);
95+
const g = confidenceIntervalF(I, X, Y, (1 - ci) / 2, f);
96+
return shapeArea()
97+
.x(x => x)
98+
.y0(x => g(x, -1))
99+
.y1(x => g(x, +1))
100+
(range(x1, x2 - precision / 2, precision).concat(x2));
101+
}
102+
_renderLine(I, X, Y) {
103+
const [x1, x2] = extent(I, i => X[i]);
104+
const f = linearRegressionF(I, X, Y);
105+
return `M${x1},${f(x1)}L${x2},${f(x2)}`;
106+
}
107+
}
108+
109+
export function linearRegressionX(data, {y = indexOf, x = identity, stroke, fill = isNoneish(stroke) ? "currentColor" : stroke, ...options} = {}) {
110+
return new LinearRegressionX(data, maybeDenseIntervalY({...options, x, y, fill, stroke}));
111+
}
112+
113+
export function linearRegressionY(data, {x = indexOf, y = identity, stroke, fill = isNoneish(stroke) ? "currentColor" : stroke, ...options} = {}) {
114+
return new LinearRegressionY(data, maybeDenseIntervalX({...options, x, y, fill, stroke}));
115+
}
116+
117+
function linearRegressionF(I, X, Y) {
118+
let sumX = 0, sumY = 0, sumXY = 0, sumX2 = 0;
119+
for (const i of I) {
120+
const xi = X[i];
121+
const yi = Y[i];
122+
sumX += xi;
123+
sumY += yi;
124+
sumXY += xi * yi;
125+
sumX2 += xi * xi;
126+
}
127+
const n = I.length;
128+
const slope = (n * sumXY - sumX * sumY) / (n * sumX2 - sumX * sumX);
129+
const intercept = (sumY - slope * sumX) / n;
130+
return x => slope * x + intercept;
131+
}
132+
133+
function confidenceIntervalF(I, X, Y, p, f) {
134+
const mean = sum(I, i => X[i]) / I.length;
135+
let a = 0, b = 0;
136+
for (const i of I) {
137+
a += (X[i] - mean) ** 2;
138+
b += (Y[i] - f(X[i])) ** 2;
139+
}
140+
const sy = Math.sqrt(b / (I.length - 2));
141+
const t = qt(p, I.length - 2);
142+
return (x, k) => {
143+
const Y = f(x);
144+
const se = sy * Math.sqrt(1 / I.length + (x - mean) ** 2 / a);
145+
return Y + k * t * se;
146+
};
147+
}

src/stats.js

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
// https://github.com/jstat/jstat
2+
//
3+
// Copyright (c) 2013 jStat
4+
//
5+
// Permission is hereby granted, free of charge, to any person obtaining a copy
6+
// of this software and associated documentation files (the "Software"), to deal
7+
// in the Software without restriction, including without limitation the rights
8+
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
// copies of the Software, and to permit persons to whom the Software is
10+
// furnished to do so, subject to the following conditions:
11+
//
12+
// The above copyright notice and this permission notice shall be included in
13+
// all copies or substantial portions of the Software.
14+
//
15+
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
// SOFTWARE.
22+
23+
export function ibetainv(p, a, b) {
24+
var EPS = 1e-8;
25+
var a1 = a - 1;
26+
var b1 = b - 1;
27+
var j = 0;
28+
var lna, lnb, pp, t, u, err, x, al, h, w, afac;
29+
if (p <= 0) return 0;
30+
if (p >= 1) return 1;
31+
if (a >= 1 && b >= 1) {
32+
pp = p < 0.5 ? p : 1 - p;
33+
t = Math.sqrt(-2 * Math.log(pp));
34+
x = (2.30753 + t * 0.27061) / (1 + t * (0.99229 + t * 0.04481)) - t;
35+
if (p < 0.5) x = -x;
36+
al = (x * x - 3) / 6;
37+
h = 2 / (1 / (2 * a - 1) + 1 / (2 * b - 1));
38+
w =
39+
(x * Math.sqrt(al + h)) / h -
40+
(1 / (2 * b - 1) - 1 / (2 * a - 1)) * (al + 5 / 6 - 2 / (3 * h));
41+
x = a / (a + b * Math.exp(2 * w));
42+
} else {
43+
lna = Math.log(a / (a + b));
44+
lnb = Math.log(b / (a + b));
45+
t = Math.exp(a * lna) / a;
46+
u = Math.exp(b * lnb) / b;
47+
w = t + u;
48+
if (p < t / w) x = Math.pow(a * w * p, 1 / a);
49+
else x = 1 - Math.pow(b * w * (1 - p), 1 / b);
50+
}
51+
afac = -gammaln(a) - gammaln(b) + gammaln(a + b);
52+
for (; j < 10; j++) {
53+
if (x === 0 || x === 1) return x;
54+
err = ibeta(x, a, b) - p;
55+
t = Math.exp(a1 * Math.log(x) + b1 * Math.log(1 - x) + afac);
56+
u = err / t;
57+
x -= t = u / (1 - 0.5 * Math.min(1, u * (a1 / x - b1 / (1 - x))));
58+
if (x <= 0) x = 0.5 * (x + t);
59+
if (x >= 1) x = 0.5 * (x + t + 1);
60+
if (Math.abs(t) < EPS * x && j > 0) break;
61+
}
62+
return x;
63+
}
64+
65+
export function ibeta(x, a, b) {
66+
// Factors in front of the continued fraction.
67+
var bt =
68+
x === 0 || x === 1
69+
? 0
70+
: Math.exp(
71+
gammaln(a + b) -
72+
gammaln(a) -
73+
gammaln(b) +
74+
a * Math.log(x) +
75+
b * Math.log(1 - x)
76+
);
77+
if (x < 0 || x > 1) return false;
78+
if (x < (a + 1) / (a + b + 2))
79+
// Use continued fraction directly.
80+
return (bt * betacf(x, a, b)) / a;
81+
// else use continued fraction after making the symmetry transformation.
82+
return 1 - (bt * betacf(1 - x, b, a)) / b;
83+
}
84+
85+
export function betacf(x, a, b) {
86+
var fpmin = 1e-30;
87+
var m = 1;
88+
var qab = a + b;
89+
var qap = a + 1;
90+
var qam = a - 1;
91+
var c = 1;
92+
var d = 1 - (qab * x) / qap;
93+
var m2, aa, del, h;
94+
95+
// These q's will be used in factors that occur in the coefficients
96+
if (Math.abs(d) < fpmin) d = fpmin;
97+
d = 1 / d;
98+
h = d;
99+
100+
for (; m <= 100; m++) {
101+
m2 = 2 * m;
102+
aa = (m * (b - m) * x) / ((qam + m2) * (a + m2));
103+
// One step (the even one) of the recurrence
104+
d = 1 + aa * d;
105+
if (Math.abs(d) < fpmin) d = fpmin;
106+
c = 1 + aa / c;
107+
if (Math.abs(c) < fpmin) c = fpmin;
108+
d = 1 / d;
109+
h *= d * c;
110+
aa = (-(a + m) * (qab + m) * x) / ((a + m2) * (qap + m2));
111+
// Next step of the recurrence (the odd one)
112+
d = 1 + aa * d;
113+
if (Math.abs(d) < fpmin) d = fpmin;
114+
c = 1 + aa / c;
115+
if (Math.abs(c) < fpmin) c = fpmin;
116+
d = 1 / d;
117+
del = d * c;
118+
h *= del;
119+
if (Math.abs(del - 1.0) < 3e-7) break;
120+
}
121+
122+
return h;
123+
}
124+
125+
export function gammaln(x) {
126+
var j = 0;
127+
var cof = [
128+
76.18009172947146, -86.5053203294167, 24.01409824083091, -1.231739572450155,
129+
0.1208650973866179e-2, -0.5395239384953e-5
130+
];
131+
var ser = 1.000000000190015;
132+
var xx, y, tmp;
133+
tmp = (y = xx = x) + 5.5;
134+
tmp -= (xx + 0.5) * Math.log(tmp);
135+
for (; j < 6; j++) ser += cof[j] / ++y;
136+
return Math.log((2.506628274631 * ser) / xx) - tmp;
137+
}
138+
139+
export function qt(p, dof) {
140+
var x = ibetainv(2 * Math.min(p, 1 - p), 0.5 * dof, 0.5);
141+
x = Math.sqrt((dof * (1 - x)) / x);
142+
return p > 0.5 ? x : -x;
143+
}

src/style.js

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ function groupAesthetics({ariaLabel: AL, title: T, fill: F, fillOpacity: FO, str
184184
return [AL, T, F, FO, S, SO, SW, O, H].filter(c => c !== undefined);
185185
}
186186

187-
function groupZ(I, Z, z) {
187+
export function groupZ(I, Z, z) {
188188
const G = group(I, i => Z[i]);
189189
if (z === undefined && G.size > I.length >> 1) {
190190
warn(`Warning: the implicit z channel has high cardinality. This may occur when the fill or stroke channel is associated with quantitative data rather than ordinal or categorical data. You can suppress this warning by setting the z option explicitly; if this data represents a single series, set z to null.`);
@@ -247,7 +247,7 @@ export function maybeClip(clip) {
247247
throw new Error(`invalid clip method: ${clip}`);
248248
}
249249

250-
export function applyIndirectStyles(selection, mark, {width, height, marginLeft, marginRight, marginTop, marginBottom}) {
250+
export function applyIndirectStyles(selection, mark, dimensions) {
251251
applyAttr(selection, "aria-label", mark.ariaLabel);
252252
applyAttr(selection, "aria-description", mark.ariaDescription);
253253
applyAttr(selection, "aria-hidden", mark.ariaHidden);
@@ -265,6 +265,7 @@ export function applyIndirectStyles(selection, mark, {width, height, marginLeft,
265265
applyAttr(selection, "paint-order", mark.paintOrder);
266266
applyAttr(selection, "pointer-events", mark.pointerEvents);
267267
if (mark.clip === "frame") {
268+
const {width, height, marginLeft, marginRight, marginTop, marginBottom} = dimensions;
268269
const id = `plot-clip-${++nextClipId}`;
269270
selection
270271
.attr("clip-path", `url(#${id})`)

test/data/README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,10 @@ https://www.metoffice.gov.uk/hadobs/hadcrut4/data/current/series_format.html
7171
The New York Times
7272
https://www.nytimes.com/2019/12/02/upshot/wealth-poverty-divide-american-cities.html
7373

74+
## mtcars.csv
75+
1974 *Motor Trend* US magazine
76+
https://www.rdocumentation.org/packages/datasets/versions/3.6.2/topics/mtcars
77+
7478
## moby-dick-chapter-1.txt
7579
*Moby Dick; or The Whale* by Herman Melville
7680
https://www.gutenberg.org/files/2701/2701-h/2701-h.htm

0 commit comments

Comments
 (0)