diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 72a329951..22db382f1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,7 +23,7 @@ jobs: run: | pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128 pip install pyro-ppl - pip install --upgrade "jax[cuda12-local]" + pip install --upgrade "jax[cuda12-local]==0.6.2" pip install numpyro pyro-ppl python scripts/test-jax-install.py - name: Check nvidia Drivers diff --git a/lectures/finite_markov.md b/lectures/finite_markov.md index 3c3c921c9..924358069 100644 --- a/lectures/finite_markov.md +++ b/lectures/finite_markov.md @@ -1077,7 +1077,7 @@ for x0, col in ((0, 'blue'), (1, 'green')): X_bar = (X == 0).cumsum() / (1 + np.arange(N, dtype=float)) # Plot ax.fill_between(range(N), np.zeros(N), X_bar - p, color=col, alpha=0.1) - ax.plot(X_bar - p, color=col, label=f'$X_0 = \, {x0} $') + ax.plot(X_bar - p, color=col, label=fr'$X_0 = \, {x0} $') # Overlay in black--make lines clearer ax.plot(X_bar - p, 'k-', alpha=0.6) @@ -1280,7 +1280,7 @@ Q = np.zeros((n, n), dtype=int) with open(infile) as f: edges = f.readlines() for edge in edges: - from_node, to_node = re.findall('\w', edge) + from_node, to_node = re.findall(r'\w', edge) i, j = alphabet.index(from_node), alphabet.index(to_node) Q[i, j] = 1 # Create the corresponding Markov matrix P diff --git a/lectures/imp_sample.md b/lectures/imp_sample.md index 5effb8d2d..11c7258f9 100644 --- a/lectures/imp_sample.md +++ b/lectures/imp_sample.md @@ -90,7 +90,7 @@ w_range = np.linspace(1e-2, 1-1e-5, 1000) plt.plot(w_range, g(w_range), label='g') plt.plot(w_range, f(w_range), label='f') -plt.xlabel('$\omega$') +plt.xlabel(r'$\omega$') plt.legend() plt.title('density functions $f$ and $g$') plt.show() @@ -104,8 +104,8 @@ l = jit(lambda w: f(w) / g(w)) ```{code-cell} ipython3 plt.plot(w_range, l(w_range)) -plt.title('$\ell(\omega)$') -plt.xlabel('$\omega$') +plt.title(r'$\ell(\omega)$') +plt.xlabel(r'$\omega$') plt.show() ``` @@ -314,7 +314,7 @@ for i, t in enumerate([1, 5, 10, 20]): for n, bins, μ_hat, σ_hat in [[n_p, bins_p, μ_hat_p, σ_hat_p], [n_q, bins_q, μ_hat_q, σ_hat_q]]: idx = np.argmax(n) - axs[row, col].text(bins[idx], n[idx], '$\hat{μ}$='+f'{μ_hat:.4g}'+', $\hat{σ}=$'+f'{σ_hat:.4g}') + axs[row, col].text(bins[idx], n[idx], r'$\hat{μ}$='+f'{μ_hat:.4g}'+r', $\hat{σ}=$'+f'{σ_hat:.4g}') plt.show() ``` @@ -418,7 +418,7 @@ for i, t in enumerate([1, 20]): for n, bins, μ_hat, σ_hat in [[n_p, bins_p, μ_hat_p, σ_hat_p], [n_q, bins_q, μ_hat_q, σ_hat_q]]: idx = np.argmax(n) - axs[i].text(bins[idx], n[idx], '$\hat{μ}$='+f'{μ_hat:.4g}'+', $\hat{σ}=$'+f'{σ_hat:.4g}') + axs[i].text(bins[idx], n[idx], r'$\hat{μ}$='+f'{μ_hat:.4g}'+r', $\hat{σ}=$'+f'{σ_hat:.4g}') plt.show() ``` @@ -452,7 +452,7 @@ for i, t in enumerate([1, 20]): for n, bins, μ_hat, σ_hat in [[n_p, bins_p, μ_hat_p, σ_hat_p], [n_q, bins_q, μ_hat_q, σ_hat_q]]: idx = np.argmax(n) - axs[i].text(bins[idx], n[idx], '$\hat{μ}$='+f'{μ_hat:.4g}'+', $\hat{σ}=$'+f'{σ_hat:.4g}') + axs[i].text(bins[idx], n[idx], r'$\hat{μ}$='+f'{μ_hat:.4g}'+r', $\hat{σ}=$'+f'{σ_hat:.4g}') plt.show() ``` diff --git a/lectures/likelihood_bayes.md b/lectures/likelihood_bayes.md index f3dd2f6c9..0f2211f06 100644 --- a/lectures/likelihood_bayes.md +++ b/lectures/likelihood_bayes.md @@ -3,8 +3,10 @@ jupytext: text_representation: extension: .md format_name: myst + format_version: 0.13 + jupytext_version: 1.17.2 kernelspec: - display_name: Python 3 + display_name: Python 3 (ipykernel) language: python name: python3 --- @@ -55,6 +57,8 @@ import numpy as np from numba import vectorize, jit, prange from math import gamma import pandas as pd +from scipy.integrate import quad + import seaborn as sns colors = sns.color_palette() @@ -117,7 +121,14 @@ $$ where $w^t=\{ w_1,\dots,w_t\}$ is a history of observations up to and including time $t$. -Sometimes for shorthand we'll write $L_t = L(w^t)$. +Sometimes for shorthand we'll write + +$$ +L_t = L(w^t) = \frac{f(w^t)}{g(w^t)} +$$ + +where we use the conventions +that $f(w^t) = f(w_1) f(w_2) \ldots f(w_t)$ and $g(w^t) = g(w_1) g(w_2) \ldots g(w_t)$. Notice that the likelihood process satisfies the *recursion* or *multiplicative decomposition* @@ -136,7 +147,7 @@ beta distributions, then computes and simulates an associated likelihood ratio process by generating a sequence $w^t$ from *some* probability distribution, for example, a sequence of IID draws from $g$. -```{code-cell} python3 +```{code-cell} ipython3 # Parameters in the two beta distributions. F_a, F_b = 1, 1 G_a, G_b = 3, 1.2 @@ -151,7 +162,7 @@ f = jit(lambda x: p(x, F_a, F_b)) g = jit(lambda x: p(x, G_a, G_b)) ``` -```{code-cell} python3 +```{code-cell} ipython3 @jit def simulate(a, b, T=50, N=500): ''' @@ -173,33 +184,52 @@ def simulate(a, b, T=50, N=500): We'll also use the following Python code to prepare some informative simulations -```{code-cell} python3 +```{code-cell} ipython3 l_arr_g = simulate(G_a, G_b, N=50000) l_seq_g = np.cumprod(l_arr_g, axis=1) ``` -```{code-cell} python3 +```{code-cell} ipython3 l_arr_f = simulate(F_a, F_b, N=50000) l_seq_f = np.cumprod(l_arr_f, axis=1) ``` ## Likelihood Ratio Process and Bayes’ Law -Let $\pi_t$ be a Bayesian posterior defined as +Let $\pi_{t+1}$ be a Bayesian posterior probability defined as $$ -\pi_t = {\rm Prob}(q=f|w^t) -$$ +\pi_{t+1} = {\rm Prob}(q=f|w^{t+1}) +$$ (eq:defbayesposterior) The likelihood ratio process is a principal actor in the formula that governs the evolution of the posterior probability $\pi_t$, an instance of **Bayes' Law**. -Bayes’ law implies that $\{\pi_t\}$ obeys the recursion +Bayes' law is just the following application of the standardformula for conditional probability: + +$$ +{\rm Prob}(q=f|w^{t+1}) = \frac { {\rm Prob}(q=f|w^{t} ) f(w_{t+1})}{ {\rm Prob}(q=f|w^{t} ) f(w_{t+1}) + (1 - {\rm Prob}(q=f|w^{t} )) g(w_{t+1})} +$$ + +or + +$$ +\pi_{t+1} = \frac { \pi_t f(w_{t+1})}{ \pi_t f(w_{t+1}) + (1 - \pi_t) g(w_{t+1})} +$$ (eq:bayes150) + +Evidently, the above equation asserts that + +$$ +{\rm Prob}(q=f|w^{t+1}) = \frac{{\rm Prob}(q=f|w^{t}) f(w_{t+1} )} {{\rm Prob}(w_{t+1})} +$$ + + +Dividing both the numerator and the denominator on the right side of the equation {eq}`eq:bayes150` by $g(w_{t+1})$ implies the recursion ```{math} :label: eq_recur1 -\pi_t=\frac{\pi_{t-1} l_t(w_t)}{\pi_{t-1} l_t(w_t)+1-\pi_{t-1}} +\pi_{t+1}=\frac{\pi_{t} l_t(w_{t+1})}{\pi_{t} l_t(w_t)+1-\pi_{t}} ``` with $\pi_{0}$ being a Bayesian prior probability that $q = f$, @@ -208,7 +238,7 @@ i.e., a personal or subjective belief about $q$ based on our having seen no data Below we define a Python function that updates belief $\pi$ using likelihood ratio $\ell$ according to recursion {eq}`eq_recur1` -```{code-cell} python3 +```{code-cell} ipython3 @jit def update(π, l): "Update π using likelihood l" @@ -277,6 +307,16 @@ and the initial prior $\pi_{0}$ Formula {eq}`eq_Bayeslaw103` generalizes formula {eq}`eq_recur1`. +```{note} +Fomula {eq}`eq_Bayeslaw103` can also be derived by starting from the formula for conditional probability + +$$ +\pi_{t+1} \equiv {\rm Prob}(q=f|w^{t+1}) = \frac { \pi_0 f(w^{t+1})}{ \pi_0 f(w^{t+1}) + (1 - \pi_0) g(w^{t+1})} +$$ + +and then dividing the numerator and the denominator on the right side by $g(w^{t+1})$. +``` + Formula {eq}`eq_Bayeslaw103` can be regarded as a one step revision of prior probability $\pi_0$ after seeing the batch of data $\left\{ w_{i}\right\} _{i=1}^{t+1}$. @@ -293,14 +333,14 @@ $\pi_t$ that are associated with the *same* realization of the likelihood ratio First, we tell Python two values of $\pi_0$. -```{code-cell} python3 +```{code-cell} ipython3 π1, π2 = 0.2, 0.8 ``` Next we generate paths of the likelihood ratio process $L_t$ and the posterior $\pi_t$ for a history of IID draws from density $f$. -```{code-cell} python3 +```{code-cell} ipython3 T = l_arr_f.shape[1] π_seq_f = np.empty((2, T+1)) π_seq_f[:, 0] = π1, π2 @@ -310,13 +350,13 @@ for t in range(T): π_seq_f[i, t+1] = update(π_seq_f[i, t], l_arr_f[0, t]) ``` -```{code-cell} python3 +```{code-cell} ipython3 fig, ax1 = plt.subplots() for i in range(2): ax1.plot(range(T+1), π_seq_f[i, :], label=fr"$\pi_0$={π_seq_f[i, 0]}") -ax1.set_ylabel("$\pi_t$") +ax1.set_ylabel(r"$\pi_t$") ax1.set_xlabel("t") ax1.legend() ax1.set_title("when f governs data") @@ -334,7 +374,7 @@ Please note that there are two different scales on the $y$ axis. Now let's study what happens when the history consists of IID draws from density $g$ -```{code-cell} python3 +```{code-cell} ipython3 T = l_arr_g.shape[1] π_seq_g = np.empty((2, T+1)) π_seq_g[:, 0] = π1, π2 @@ -344,13 +384,13 @@ for t in range(T): π_seq_g[i, t+1] = update(π_seq_g[i, t], l_arr_g[0, t]) ``` -```{code-cell} python3 +```{code-cell} ipython3 fig, ax1 = plt.subplots() for i in range(2): ax1.plot(range(T+1), π_seq_g[i, :], label=fr"$\pi_0$={π_seq_g[i, 0]}") -ax1.set_ylabel("$\pi_t$") +ax1.set_ylabel(r"$\pi_t$") ax1.set_xlabel("t") ax1.legend() ax1.set_title("when g governs data") @@ -364,7 +404,7 @@ plt.show() Below we offer Python code that verifies that nature chose permanently to draw from density $f$. -```{code-cell} python3 +```{code-cell} ipython3 π_seq = np.empty((2, T+1)) π_seq[:, 0] = π1, π2 @@ -373,20 +413,311 @@ for i in range(2): π_seq[i, 1:] = πL / (πL + 1 - π_seq[i, 0]) ``` -```{code-cell} python3 +```{code-cell} ipython3 np.abs(π_seq - π_seq_f).max() < 1e-10 ``` We thus conclude that the likelihood ratio process is a key ingredient of the formula {eq}`eq_Bayeslaw103` for -a Bayesian's posteior probabilty that nature has drawn history $w^t$ as repeated draws from density -$g$. +a Bayesian's posterior probabilty that nature has drawn history $w^t$ as repeated draws from density +$f$. +## Another timing protocol +Let's study how the posterior probability $\pi_t = {\rm Prob}(q=f|w^{t}) $ behaves when nature generates the +history $w^t = w_1, w_2, \ldots, w_t$ under a different timing protocol. -## Behavior of posterior probability $\{\pi_t\}$ under the subjective probability distribution +Until now we assumed that before time $1$ nature somehow chose to draw $w^t$ as an iid sequence from **either** $f$ **or** $g$. + +Nature's decision about whether to draw from $f$ or $g$ was thus **permanent**. + +We now assume a different timing protocol in which before **each period** $t =1, 2, \ldots$ nature flips an $x$-weighted coin and with probability +$x \in (0,1)$ draws from $f$ in period $t$ and with probability $1 - x $ draws from $g$. + +Under this timing protocol, nature draws permanently from **neither** $f$ **nor** $g$, so a statistician who thinks that nature is drawing +i.i.d. draws **permanently** from one of them is mistaken. + +* in truth, nature actually draws **permanently** from an $x$-mixture of $f$ and $g$ -- a distribution that is neither $f$ nor $g$ when +$x \in (0,1)$ + + + +Thus, the Bayesian prior $\pi_0$ and the sequence of posterior probabilities described by equation {eq}`eq_Bayeslaw103` should **not** be interpreted as the statistician's opinion about the mixing parameter $x$ under the alternative timing protocol in which nature draws from an $x$-mixture of $f$ and $g$. + +This is clear when we remember the definition of $\pi_t$ in equation {eq}`eq:defbayesposterior`, which for convenience we repeat here: + +$$ +\pi_{t+1} = {\rm Prob}(q=f|w^{t+1}) +$$ + + + +Let's write some Python code to study how $\pi_t$ behaves when nature actually generates data as i.i.d. draws from neither $f$ nor from $g$ +but instead as i.i.d. draws from an $x$-mixture of two beta distributions. + +```{note} +This is a situation in which the statistician's model is misspecified, so we should anticipate that a Kullback-Liebler divergence with respect to an $x$-mixture distribution will shape outcomes. +``` + +We can study how $\pi_t$ would behave for various values of nature's mixing probability $x$. + + + +First, let's create a function to simulate data under the mixture timing protocol: + +```{code-cell} ipython3 +@jit +def simulate_mixture_path(x_true, T): + """ + Simulate T observations under mixture timing protocol. + """ + w = np.empty(T) + for t in range(T): + if np.random.rand() < x_true: + w[t] = np.random.beta(F_a, F_b) + else: + w[t] = np.random.beta(G_a, G_b) + return w +``` + +Let's generate a sequence of observations from this mixture model with a true mixing probability of $x=0.5$. + +We will first use this sequence to study how $\pi_t$ behaves. + +```{note} +Later, we can use it to study how a statistician who knows that an $x$-mixture of $f$ and $g$ could construct maximum likelihood or Bayesian estimators of $x$ along with the free parameters of $f$ and $g$. +``` + +```{code-cell} ipython3 +x_true = 0.5 +T_mix = 200 + +# Three different priors with means 0.25, 0.5, 0.75 +prior_params = [(1, 3), (1, 1), (3, 1)] +prior_means = [a/(a+b) for a, b in prior_params] + +# Generate one path of observations from the mixture +set_seed() +w_mix = simulate_mixture_path(x_true, T_mix) +``` + +### Behavior of $\pi_t$ under wrong model + +Let's study how the posterior probability $\pi_t$ that nature permanently draws from $f$ behaves when data are actually generated by +an $x$-mixture of $f$ and $g$. + +```{code-cell} ipython3 +fig, ax = plt.subplots(figsize=(10, 6)) +T_plot = 200 + +for i, mean0 in enumerate(prior_means): + π_wrong = np.empty(T_plot + 1) + π_wrong[0] = mean0 + + # Compute likelihood ratios for the mixture data + for t in range(T_plot): + l_t = f(w_mix[t]) / g(w_mix[t]) + π_wrong[t + 1] = update(π_wrong[t], l_t) + + ax.plot(range(T_plot + 1), π_wrong, + label=fr'$\pi_0 = ${mean0:.2f}', + color=colors[i], linewidth=2) + +ax.axhline(y=x_true, color='black', linestyle='--', + label=f'True x = {x_true}', linewidth=2) +ax.set_xlabel('t') +ax.set_ylabel(r'$\pi_t$') +ax.legend() +plt.show() +``` + +Evidently, $\pi_t$ converges to 1. + +This indicates that the model concludes that the data is generated by $f$. + +Why does this happen? + +Given $x = 0.5$, the data generating process is a mixture of $f$ and $g$: $m(w) = \frac{1}{2}f(w) + \frac{1}{2}g(w)$. + +A widely used measure of "closeness" between two distributions is the Kullback-Leibler (KL) divergence. + + +Let's check the KL divergence of the mixture distribution $m$ from both $f$ and $g$. + +```{code-cell} ipython3 +def compute_KL(f, g): + """ + Compute KL divergence KL(f, g) + """ + integrand = lambda w: f(w) * np.log(f(w) / g(w)) + val, _ = quad(integrand, 1e-5, 1-1e-5) + return val + + +def compute_div_m(f, g): + """ + Compute Jensen-Shannon divergence + """ + def m(w): + return 0.5 * (f(w) + g(w)) + + return compute_KL(m, f), compute_KL(m, g) + + +KL_f, KL_g = compute_div_m(f, g) + +print(f'KL(m, f) = {KL_f:.3f}\nKL(m, g) = {KL_g:.3f}') +``` + +Since $KL(m, f) < KL(m, g)$, $f$ is "closer" to the mixture distribution $m$. + +Hence by our discussion on KL divergence and likelihood ratio process in +{doc}`likelihood_ratio_process`, $log(L_t) \to \infty$ as $t \to \infty$. + +Now looking back to the key equation {eq}`eq_Bayeslaw103`. + +Consider the function + +$$ +h(z) = \frac{\pi_0 z}{\pi_0 z + 1 - \pi_0}. +$$ + +The limit $\lim_{z \to \infty} h(z)$ is 1. + +Hence $\pi_t \to 1$ as $t \to \infty$ for any $\pi_0 \in (0,1)$. + +This explains what we observed in the plot above. + + + +But how can we learn the true mixing parameter $x$? + +This topic is taken up in {doc}`this lecture `. + +We also explore this topic in the exrecise below. + +```{exercise} +:label: likelihood_bayes_ex1 +The true data generating process is a mixture, and one of the parameters to be learned is the mixing proportion $x$. + +A correct Bayesian approach should directly model the uncertainty about $x$ and update beliefs about it as new data arrives. + +Here is the algorithm: + +First we specify a prior distribution for $x$ given by $x \sim \text{Beta}(\alpha_0, \beta_0)$ with sexpectation $\mathbb{E}[x] = \frac{\alpha_0}{\alpha_0 + \beta_0}$. + +The likelihood for a single observation $w_t$ is $p(w_t|x) = x f(w_t) + (1-x) g(w_t)$. + +For a sequence $w^t = (w_1, \dots, w_t)$, the likelihood is $p(w^t|x) = \prod_{i=1}^t p(w_i|x)$. + +The posterior distribution is updated using $p(x|w^t) \propto p(w^t|x) p(x)$. + +Recursively, the posterior after $w_t$ is $p(x|w^t) \propto p(w_t|x) p(x|w^{t-1})$. + +Without a conjugate prior, we can approximate the posterior by discretizing $x$ into a grid. + +Your task is to implement this algorithm in Python. + +You can verify your implementation by checking that the posterior mean converges to the true value of $x$ +as $t$ increases. +``` + +```{solution-start} likelihood_bayes_ex1 +:class: dropdown +``` + +Here is one solution: + +```{code-cell} ipython3 +@jit +def learn_x_bayesian(observations, α0, β0, grid_size=2000): + """ + Sequential Bayesian learning of the mixing probability x + using a grid approximation. + """ + w = np.asarray(observations) + T = w.size + + x_grid = np.linspace(1e-3, 1 - 1e-3, grid_size) + + # Log prior + log_prior = (α0 - 1) * np.log(x_grid) + (β0 - 1) * np.log1p(-x_grid) + + μ_path = np.empty(T + 1) + μ_path[0] = α0 / (α0 + β0) + + log_post = log_prior.copy() + + for t in range(T): + wt = w[t] + # P(w_t | x) = x f(w_t) + (1 - x) g(w_t) + like = x_grid * f(wt) + (1 - x_grid) * g(wt) + log_post += np.log(like) + + # normalize + log_post -= log_post.max() + post = np.exp(log_post) + post /= post.sum() + + μ_path[t + 1] = np.sum(x_grid * post) + + return μ_path + +x_posterior_means = [learn_x_bayesian(w_mix, α0, β0) for α0, β0 in prior_params] +``` + +Let's visualize how the posterior mean of $x$ evolves over time, starting from three different prior beliefs. + +```{code-cell} ipython3 +fig, ax = plt.subplots(figsize=(10, 6)) + +for i, (x_means, mean0) in enumerate(zip(x_posterior_means, prior_means)): + ax.plot(range(T_mix + 1), x_means, + label=fr'Prior mean = ${mean0:.2f}$', + color=colors[i], linewidth=2) + +ax.axhline(y=x_true, color='black', linestyle='--', + label=f'True x = {x_true}', linewidth=2) +ax.set_xlabel('$t$') +ax.set_ylabel('Posterior mean of $x$') +ax.legend() +plt.show() +``` + +The plot shows that regardless of the initial prior belief, all three posterior means eventually converge towards the true value of $x=0.5$. + +Next, let's look at multiple simulations with a longer time horizon, all starting from a uniform prior. + +```{code-cell} ipython3 +set_seed() +n_paths = 20 +T_long = 10_000 + +fig, ax = plt.subplots(figsize=(10, 5)) + +for j in range(n_paths): + w_path = simulate_mixture_path(x_true, T_long) + x_means = learn_x_bayesian(w_path, 1, 1) # Uniform prior + ax.plot(range(T_long + 1), x_means, alpha=0.5, linewidth=1) + +ax.axhline(y=x_true, color='red', linestyle='--', + label=f'True x = {x_true}', linewidth=2) +ax.set_ylabel('Posterior mean of $x$') +ax.set_xlabel('$t$') +ax.legend() +plt.tight_layout() +plt.show() +``` + +We can see that the posterior mean of $x$ converges to the true value $x=0.5$. + +```{solution-end} +``` + +## Behavior of posterior probability $\{\pi_t\}$ under the subjective probability distribution + We'll end this lecture by briefly studying what our Baysian learner expects to learn under the subjective beliefs $\pi_t$ cranked out by Bayes' law. @@ -431,7 +762,7 @@ $$ Let $a \in \{ f, g\} $ be an index that indicates whether nature chose permanently to draw from distribution $f$ or from distribution $g$. After drawing $w_0$, the worker uses Bayes' law to deduce that -the posterior probability $\pi_0 = {\rm Prob}{a = f | w_0} $ +the posterior probability $\pi_0 = {\rm Prob} ({a = f | w_0}) $ that the density is $f(w)$ is $$ @@ -629,8 +960,6 @@ def create_table(π0s, N=10000, T=500, decimals=2): table.index = π0s return table - - # simulate T = 200 π0 = .5 @@ -638,8 +967,6 @@ T = 200 π_path, w_path = martingale_simulate(π0=π0, T=T, N=10000) ``` - - ```{code-cell} ipython3 fig, ax = plt.subplots() for i in range(100): @@ -650,8 +977,6 @@ ax.set_ylabel(r'$\pi_t$') plt.show() ``` - - The above graph indicates that * each of paths converges @@ -664,8 +989,6 @@ The above graph indicates that Convergence actually occurs pretty fast, as the following graph of the cross-ensemble distribution of $\pi_t$ for various small $t$'s indicates. - - ```{code-cell} ipython3 fig, ax = plt.subplots() for t in [1, 10, T-1]: @@ -710,9 +1033,6 @@ ax.legend(loc='upper right') plt.show() ``` - - - For the preceding ensemble that assumed $\pi_0 = .5$, the following graph shows two paths of $w_t$'s and the $\pi_t$ sequences that gave rise to them. @@ -721,8 +1041,6 @@ Notice that one of the paths involves systematically higher $w_t$'s, outcomes th The luck of the draw early in a simulation push the subjective distribution to draw from $F$ more frequently along a sample path, and this pushes $\pi_t$ toward $0$. - - ```{code-cell} ipython3 fig, ax = plt.subplots() for i, j in enumerate([10, 100]): @@ -757,6 +1075,7 @@ The third column reports the fraction of $N = 10000$ simulations for which $\pi_ table = create_table(list(np.linspace(0,1,11)), N=10000, T=500) table ``` + The fraction of simulations for which $\pi_{t}$ had converged to $1$ is indeed always close to $\pi_{-1}$, as anticipated. @@ -811,7 +1130,166 @@ Notice how the conditional variance approaches $0$ for $\pi_{t-1}$ near either The conditional variance is nearly zero only when the agent is almost sure that $w_t$ is drawn from $F$, or is almost sure it is drawn from $G$. -## Sequels +## Related Lectures This lecture has been devoted to building some useful infrastructure that will help us understand inferences that are the foundations of results described in {doc}`this lecture ` and {doc}`this lecture ` and {doc}`this lecture `. + +```{exercise} +:label: likelihood_bayes_ex2 + +In the [first exercise](likelihood_bayes_ex1), we implemented a Bayesian learning algorithm to estimate the mixing parameter $x$ in a mixture model using a grid approximation method. + +In this exercise, we will explore sequential Bayesian updating using NumPyro. + +Please follow these steps: + +1. Generate a dataset from a mixture model with known mixing parameter $x_{true} = 0.5$. +2. Process the data in chunks of 100 observations, updating the posterior sequentially. +3. For each chunk, use the posterior from the previous chunk as the prior for the next chunk (using moment matching to fit a Beta distribution). +4. Create a visualization showing how the posterior distribution evolves, using gradually darker shades to represent later time periods. + +In the exercise, set $\alpha_0 = 1$ and $\beta_0 = 2$. +``` + +```{solution-start} likelihood_bayes_ex2 +:class: dropdown +``` + +First, let's install and import the necessary packages: + +```{code-cell} ipython3 +:tags: [hide-output] + +!pip install numpyro jax +``` + +```{code-cell} ipython3 +import jax +import jax.numpy as jnp +import numpyro +import numpyro.distributions as dist +from numpyro.infer import NUTS, MCMC +import seaborn as sns +``` + +Define the mixture model and helper functions that helps us +to fit a Beta distribution to the posterior and obtain the parameter for the next chunk + +```{code-cell} ipython3 +def mixture_model(w, α0=1.0, β0=1.0): + x = numpyro.sample("x", dist.Beta(α0, β0)) + f_dist = dist.Beta(F_a, F_b) + g_dist = dist.Beta(G_a, G_b) + mix = dist.Mixture( + dist.Categorical(probs=jnp.array([x, 1 - x])), + [f_dist, g_dist] + ) + with numpyro.plate("data", w.shape[0]): + numpyro.sample("w", mix, obs=w) + +def β_moment_match(samples, eps=1e-12): + m = float(samples.mean()) + v = float(samples.var()) + v = max(v, eps) + t = m * (1 - m) / v - 1.0 + α = max(m * t, eps) + β = max((1 - m) * t, eps) + return α, β +``` + +Now we implement sequential Bayesian updating + +```{code-cell} ipython3 +def sequential_update(w_all, α0=1.0, β0=1.0, + chunk_size=100, + num_warmup=1000, + num_samples=2000, + seed=0): + n = len(w_all) + + # Create chunks + chunks = [slice(i, min(i + chunk_size, n)) + for i in range(0, n, chunk_size)] + α, β = α0, β0 + + keys = jax.random.split( + jax.random.PRNGKey(seed), len(chunks)) + means = [α / (α + β)] + posts = [] + + # Run MCMC for each chunk + for i, sl in enumerate(chunks): + kernel = NUTS(mixture_model) + mcmc = MCMC(kernel, + num_warmup=num_warmup, + num_samples=num_samples) + mcmc.run(keys[i], w_all[sl], α, β) + xs = mcmc.get_samples()["x"] + + posts.append(xs) + + # Posterior becomes prior for next chunk + α, β = β_moment_match(xs) + means.append(xs.mean()) + + return np.array(means), posts, chunks +``` + +Generate data and run sequential updates: + +```{code-cell} ipython3 +:tags: [hide-output] + +x_true = 0.5 +T_total = 2000 + +set_seed() +w_mix = simulate_mixture_path(x_true, T_total) + +means, posts, chunks = sequential_update( + w_mix, α0=1.0, β0=2.0, chunk_size=200, + num_warmup=2000, num_samples=1000, seed=123) +``` + +Create visualization with gradually darker lines: + +```{code-cell} ipython3 +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5)) + +# Plot posterior means over time +n_seen = np.cumsum([0] + [c.stop - c.start for c in chunks]) + +# Posterior mean trajectory +ax1.plot(n_seen, means, 'o-', + color='darkblue', label='Posterior mean of $x$') +ax1.axhline(x_true, ls="--", + color="red", alpha=0.7, label=f'True $x$ = {x_true}') +ax1.set_xlabel('$t$') +ax1.set_ylabel('Posterior mean of $x$') +ax1.legend() + +# Posterior densities at each chunk +n_chunks = len(posts) +colors = plt.cm.Blues(np.linspace(0.01, 0.99, n_chunks)) + +for i, (xs, color) in enumerate(zip(posts, colors)): + sns.kdeplot(xs, color=color, ax=ax2, + alpha=0.7, label=f'n={chunks[i].stop}') + +ax2.axvline(x_true, ls="--", color="red", + alpha=0.7, label=f'True $x$ = {x_true}') +ax2.set_xlabel('$x$') +ax2.set_ylabel('density') +ax2.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + +plt.tight_layout() +plt.show() +``` + +The left panel shows how the posterior mean converges to the true value as more data is observed. + +The right panel shows that the distribution of $x$ becomes more concentrated around the true value with more observations. + +```{solution-end} +``` diff --git a/lectures/likelihood_ratio_process.md b/lectures/likelihood_ratio_process.md index 2daa27157..624bb8b3c 100644 --- a/lectures/likelihood_ratio_process.md +++ b/lectures/likelihood_ratio_process.md @@ -4,7 +4,7 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.17.1 + jupytext_version: 1.17.2 kernelspec: display_name: Python 3 (ipykernel) language: python @@ -33,23 +33,22 @@ This lecture describes likelihood ratio processes and some of their uses. We'll study the same setting that is also used in {doc}`this lecture on exchangeability `. -Among things that we'll learn are +Among the things that we'll learn are * How a likelihood ratio process is a key ingredient in frequentist hypothesis testing * How a **receiver operator characteristic curve** summarizes information about a false alarm probability and power in frequentist hypothesis testing -* How a statistician can combine frequentist probabilities of type I and type II errors to form posterior probabilities of mistakes in a model selection or a classification problem -* How likelihood ratios helped Lawrence Blume and David Easley formulate an answer to the question ''If you're so smart, why aren't you rich?'' {cite}`blume2006if` -* How to use a Kullback-Leibler divergence to quantify the difference between two probability distributions with the same support +* How a statistician can combine frequentist probabilities of type I and type II errors to form posterior probabilities of mistakes in a model selection or in an individual-classification problem +* How likelihood ratios helped Lawrence Blume and David Easley formulate an answer to ''If you're so smart, why aren't you rich?'' {cite}`blume2006if` +* How to use a Kullback-Leibler divergence to quantify the difference between two probability distributions with the same support * How during World War II the United States Navy devised a decision rule for doing quality control on lots of ammunition, a topic that sets the stage for {doc}`this lecture ` * A peculiar property of likelihood ratio processes -Let's start by importing some Python tools. +Let's start by importing some Python tools. ```{code-cell} ipython3 import matplotlib.pyplot as plt -plt.rcParams["figure.figsize"] = (11, 5) #set default figure size import numpy as np from numba import vectorize, jit from math import gamma @@ -57,6 +56,7 @@ from scipy.integrate import quad from scipy.optimize import brentq, minimize_scalar from scipy.stats import beta as beta_dist import pandas as pd +from IPython.display import display, Math ``` ## Likelihood Ratio Process @@ -85,7 +85,7 @@ We want to use these observations to infer whether nature chose $f$ or $g$. A **likelihood ratio process** is a useful tool for this task. -To begin, we define key component of a likelihood ratio process, namely, the time $t$ likelihood ratio as the random variable +To begin, we define a key component of a likelihood ratio process, namely, the time $t$ likelihood ratio as the random variable $$ \ell (w_t)=\frac{f\left(w_t\right)}{g\left(w_t\right)},\quad t\geq1. @@ -94,9 +94,9 @@ $$ We assume that $f$ and $g$ both put positive probabilities on the same intervals of possible realizations of the random variable $W$. -That means that under the $g$ density, $\ell (w_t)= +That means that under the $g$ density, $\ell (w_t)= \frac{f\left(w_{t}\right)}{g\left(w_{t}\right)}$ -is a nonnegative random variable with mean $1$. +is a nonnegative random variable with mean $1$. A **likelihood ratio process** for sequence $\left\{ w_{t}\right\} _{t=1}^{\infty}$ is defined as @@ -108,7 +108,7 @@ $$ where $w^t=\{ w_1,\dots,w_t\}$ is a history of observations up to and including time $t$. -Sometimes for shorthand we'll write $L_t = L(w^t)$. +Sometimes for shorthand we'll write $L_t = L(w^t)$. Notice that the likelihood process satisfies the *recursion* @@ -123,7 +123,7 @@ Pearson {cite}`Neyman_Pearson`. To help us appreciate how things work, the following Python code evaluates $f$ and $g$ as two different Beta distributions, then computes and simulates an associated likelihood ratio process by generating a sequence $w^t$ from one of the two -probability distributions, for example, a sequence of IID draws from $g$. +probability distributions, for example, a sequence of IID draws from $g$. ```{code-cell} ipython3 # Parameters in the two Beta distributions. @@ -146,13 +146,11 @@ def simulate(a, b, T=50, N=500): ''' Generate N sets of T observations of the likelihood ratio, return as N x T matrix. - ''' l_arr = np.empty((N, T)) for i in range(N): - for j in range(T): w = np.random.beta(a, b) l_arr[i, j] = f(w) / g(w) @@ -185,7 +183,7 @@ plt.title("$L(w^{t})$ paths"); Evidently, as sample length $T$ grows, most probability mass shifts toward zero -To see it this more clearly clearly, we plot over time the fraction of +To see this more clearly, we plot over time the fraction of paths $L\left(w^{t}\right)$ that fall in the interval $\left[0, 0.01\right]$. @@ -195,7 +193,7 @@ plt.show() ``` Despite the evident convergence of most probability mass to a -very small interval near $0$, the unconditional mean of +very small interval near $0$, the unconditional mean of $L\left(w^t\right)$ under probability density $g$ is identically $1$ for all $t$. @@ -240,13 +238,13 @@ $t \geq 1$. ## Peculiar Property -How can $E\left[L\left(w^{t}\right)\bigm|q=g\right]=1$ possibly be true when most probability mass of the likelihood +How can $E\left[L\left(w^{t}\right)\bigm|q=g\right]=1$ possibly be true when most probability mass of the likelihood ratio process is piling up near $0$ as $t \rightarrow + \infty$? The answer is that as $t \rightarrow + \infty$, the distribution of $L_t$ becomes more and more fat-tailed: -enough mass shifts to larger and larger values of $L_t$ to make +enough mass shifts to larger and larger values of $L_t$ to make the mean of $L_t$ continue to be one despite most of the probability mass piling up near $0$. @@ -259,19 +257,19 @@ l_arr_g = simulate(G_a, G_b, N=50000) l_seq_g = np.cumprod(l_arr_g, axis=1) ``` -It would be useful to use simulations to verify that unconditional means +It would be useful to use simulations to verify that unconditional means $E\left[L\left(w^{t}\right)\right]$ equal unity by averaging across sample paths. -But it would be too computer-time-consuming for us to that here simply by applying a standard Monte Carlo simulation approach. +But it would be too computer-time-consuming for us to do that here simply by applying a standard Monte Carlo simulation approach. -The reason is that the distribution of $L\left(w^{t}\right)$ is extremely skewed for large values of $t$. +The reason is that the distribution of $L\left(w^{t}\right)$ is extremely skewed for large values of $t$. -Because the probability density in the right tail is close to $0$, it just takes too much computer time to sample enough points from the right tail. +Because the probability density in the right tail is close to $0$, it just takes too much computer time to sample enough points from the right tail. -We explain the problem in more detail in {doc}`this lecture `. +We explain the problem in more detail in {doc}`this lecture `. -There we describe a way to an alternative way to compute the mean of a likelihood ratio by computing the mean of a _different_ random variable by sampling from a _different_ probability distribution. +There we describe an alternative way to compute the mean of a likelihood ratio by computing the mean of a _different_ random variable by sampling from a _different_ probability distribution. ## Nature Permanently Draws from Density f @@ -314,7 +312,7 @@ plt.show() We also plot the probability that $L\left(w^t\right)$ falls into the interval $[10000, \infty)$ as a function of time and watch how -fast probability mass diverges to $+\infty$. +fast probability mass diverges to $+\infty$. ```{code-cell} ipython3 plt.plot(range(T), np.sum(l_seq_f > 10000, axis=0) / N) @@ -324,14 +322,14 @@ plt.show() ## Likelihood Ratio Test We now describe how to employ the machinery -of Neyman and Pearson {cite}`Neyman_Pearson` to test the hypothesis that history $w^t$ is generated by repeated +of Neyman and Pearson {cite}`Neyman_Pearson` to test the hypothesis that history $w^t$ is generated by repeated IID draws from density $f$. Denote $q$ as the data generating process, so that $q=f \text{ or } g$. Upon observing a sample $\{W_i\}_{i=1}^t$, we want to decide -whether nature is drawing from $g$ or from $f$ by performing a (frequentist) +whether nature is drawing from $g$ or from $f$ by performing a (frequentist) hypothesis test. We specify @@ -346,13 +344,13 @@ form: - reject $H_0$ if $L(W^t) < c$, -where $c$ is a given discrimination threshold. +where $c$ is a given discrimination threshold. Setting $c =1$ is a common choice. We'll discuss consequences of other choices of $c$ below. -This test is *best* in the sense that it is **uniformly most powerful**. +This test is *best* in the sense that it is **uniformly most powerful**. To understand what this means, we have to define probabilities of two important events that allow us to characterize a test associated with a given @@ -373,7 +371,7 @@ The two probabilities are: \beta \equiv \Pr\left\{ L\left(w^{t}\right)>c\mid q=g\right\} $$ -These two probabilities underly the following two concepts: +These two probabilities underlie the following two concepts: - Probability of false alarm (= significance level = probability of @@ -398,7 +396,7 @@ states that among all possible tests, a likelihood ratio test maximizes the probability of detection for a given probability of false alarm. -Another way to say the same thing is that among all possible tests, a likelihood ratio test +Another way to say the same thing is that among all possible tests, a likelihood ratio test maximizes **power** for a given **significance level**. We want a small probability of @@ -407,11 +405,11 @@ false alarm and a large probability of detection. With sample size $t$ fixed, we can change our two probabilities by adjusting $c$. -A troublesome "that's life" fact is that these two probabilities move in the same direction as we vary the critical value +A troublesome "that's life" fact is that these two probabilities move in the same direction as we vary the critical value $c$. Without specifying quantitative losses from making Type I and Type II errors, there is little that we can say -about how we *should* trade off probabilities of the two types of mistakes. +about how we *should* trade off probabilities of the two types of mistakes. We do know that increasing sample size $t$ improves statistical inference. @@ -433,17 +431,17 @@ likelihood ratios simulated above, which are generated by either $f$ or $g$. Taking logarithms has no effect on calculating the probabilities because -the log is a monotonic transformation. +the log is a monotonic transformation. As $t$ increases, the probabilities of making Type I and Type II errors both decrease, which is good. This is because most of the probability mass of log$(L(w^t))$ moves toward $-\infty$ when $g$ is the data generating -process, while log$(L(w^t))$ goes to +process, while log$(L(w^t))$ goes to $\infty$ when data are generated by $f$. -That disparate behavior of log$(L(w^t))$ under $f$ and $q$ +That disparate behavior of log$(L(w^t))$ under $f$ and $q$ is what makes it possible eventually to distinguish $q=f$ from $q=g$. @@ -475,15 +473,15 @@ plt.show() In the above graphs, * the blue areas are related to but not equal to probabilities $\alpha $ of a type I error because -they are integrals of $\log L_t$, not integrals of $L_t$, over rejection region $L_t < 1$ +they are integrals of $\log L_t$, not integrals of $L_t$, over rejection region $L_t < 1$ * the orange areas are related to but not equal to probabilities $\beta $ of a type II error because -they are integrals of $\log L_t$, not integrals of $L_t$, over acceptance region $L_t > 1$ +they are integrals of $\log L_t$, not integrals of $L_t$, over acceptance region $L_t > 1$ -When we hold $c$ fixed at $c=1$, the following graph shows that - * the probability of detection monotonically increases with increases in +When we hold $c$ fixed at $c=1$, the following graph shows that + * the probability of detection monotonically increases with increases in $t$ - * the probability of a false alarm monotonically decreases with increases in $t$. + * the probability of a false alarm monotonically decreases with increases in $t$. ```{code-cell} ipython3 PD = np.empty(T) @@ -501,13 +499,13 @@ plt.legend() plt.show() ``` -For a given sample size $t$, the threshold $c$ uniquely pins down probabilities +For a given sample size $t$, the threshold $c$ uniquely pins down probabilities of both types of error. If for a fixed $t$ we now free up and move $c$, we will sweep out the probability of detection as a function of the probability of false alarm. -This produces a [receiver operating characteristic +This produces a [receiver operating characteristic curve (ROC curve)](https://en.wikipedia.org/wiki/Receiver_operating_characteristic). Below, we plot receiver operating characteristic curves for different @@ -538,7 +536,7 @@ Notice that as $t$ increases, we are assured a larger probability of detection and a smaller probability of false alarm associated with a given discrimination threshold $c$. -For a given sample size $t$, both $\alpha$ and $\beta$ change as we vary $c$. +For a given sample size $t$, both $\alpha$ and $\beta$ change as we vary $c$. As we increase $c$ @@ -588,7 +586,8 @@ The United States Navy evidently used a procedure like this to select a sample s control tests during World War II. A Navy Captain who had been ordered to perform tests of this kind had doubts about it that he -presented to Milton Friedman, as we describe in {doc}`this lecture `. +presented to Milton Friedman, as we describe in {doc}`this lecture `. + (rel_entropy)= ## Kullback–Leibler Divergence @@ -598,7 +597,7 @@ generates the data. Instead, a third distribution $h$ does. -Let's study how accumulated likelihood ratios $L$ behave +Let's study how accumulated likelihood ratios $L$ behave when $h$ governs the data. A key tool here is called **Kullback–Leibler divergence**. @@ -635,25 +634,30 @@ Let's compute the Kullback–Leibler discrepancies by quadrature integration. ```{code-cell} ipython3 -def KL_integrand(w, q, h): - - m = h(w) / q(w) - - return np.log(m) * h(w) +def compute_KL(f, g): + """ + Compute KL divergence KL(f, g) + """ + integrand = lambda w: f(w) * np.log(f(w) / g(w)) + val, _ = quad(integrand, 1e-5, 1-1e-5) + return val ``` +Next we create a helper function to compute KL divergence with respect to a reference distribution $h$ + ```{code-cell} ipython3 -def compute_KL(h, f, g): +def compute_KL_h(h, f, g): """ Compute KL divergence with reference distribution h """ - Kf, _ = quad(KL_integrand, 0, 1, args=(f, h)) - Kg, _ = quad(KL_integrand, 0, 1, args=(g, h)) + Kf = compute_KL(h, f) + Kg = compute_KL(h, g) return Kf, Kg ``` +(KL_link)= ### A helpful formula There is a mathematical relationship between likelihood ratios and KL divergence. @@ -668,7 +672,7 @@ where $L_t=\prod_{j=1}^{t}\frac{f(w_j)}{g(w_j)}$ is the likelihood ratio process (For the proof, see [this note](https://nowak.ece.wisc.edu/ece830/ece830_fall11_lecture7.pdf).) -{eq}`eq:kl_likelihood_link` tells us that: +Equation {eq}`eq:kl_likelihood_link` tells us that: - When $K_g < K_f$ (i.e., $g$ is closer to $h$ than $f$ is), the expected log likelihood ratio is negative, so $L\left(w^t\right) \rightarrow 0$. - When $K_g > K_f$ (i.e., $f$ is closer to $h$ than $g$ is), the expected log likelihood ratio is positive, so $L\left(w^t\right) \rightarrow + \infty$. @@ -676,6 +680,16 @@ Let's verify this using simulation. In the simulation, we generate multiple paths using Beta distributions $f$, $g$, and $h$, and compute the paths of $\log(L(w^t))$. +First, we write a function to compute the likelihood ratio process + +```{code-cell} ipython3 +def compute_likelihood_ratios(sequences, f, g): + """Compute likelihood ratios and cumulative products.""" + l_ratios = f(sequences) / g(sequences) + L_cumulative = np.cumprod(l_ratios, axis=1) + return l_ratios, L_cumulative +``` + We consider three cases: (1) $h$ is closer to $f$, (2) $f$ and $g$ are approximately equidistant from $h$, and (3) $h$ is closer to $g$. ```{code-cell} ipython3 @@ -708,7 +722,7 @@ for i, scenario in enumerate(scenarios): scenario["h_params"][1]) # Compute KL divergences - Kf, Kg = compute_KL(h, f, g) + Kf, Kg = compute_KL_h(h, f, g) kl_diff = Kg - Kf # Simulate paths @@ -718,8 +732,7 @@ for i, scenario in enumerate(scenarios): # Generate data from h h_data = np.random.beta(scenario["h_params"][0], scenario["h_params"][1], (N_paths, T)) - l_ratios = f(h_data) / g(h_data) - l_cumulative = np.cumprod(l_ratios, axis=1) + l_ratios, l_cumulative = compute_likelihood_ratios(h_data, f, g) log_l_cumulative = np.log(l_cumulative) # Plot distributions @@ -748,7 +761,7 @@ for i, scenario in enumerate(scenarios): ax.set_xlabel('t') ax.set_ylabel('$log L_t$') ax.set_title(f'KL(h,f)={Kf:.3f}, KL(h,g)={Kg:.3f}\n{scenario["expected"]}', - fontsize=16) + fontsize=16) ax.legend(fontsize=16) plt.tight_layout() @@ -764,1368 +777,1359 @@ Note that These observations align with the theory. -In a [later section](hetero_agent), we will see an application of these ideas. +In the [next section](hetero_agent), we will see an application of these ideas. -+++ -## Hypothesis Testing and Classification +(hetero_agent)= +## Heterogeneous Beliefs and Financial Markets -We now describe how a statistician can combine frequentist probabilities of type I and type II errors in order to +A likelihood ratio process lies behind Lawrence Blume and David Easley's answer to their question +''If you're so smart, why aren't you rich?'' {cite}`blume2006if`. -* compute an anticipated frequency of selecting a wrong model based on a sample length $T$ -* compute an anticipated error rate in a classification problem +Blume and Easley constructed formal models to study how differences of opinions about probabilities governing risky income processes would influence outcomes and be reflected in prices of stocks, bonds, and insurance policies that individuals use to share and hedge risks. -We consider a situation in which nature generates data by mixing known densities $f$ and $g$ with known mixing -parameter $\pi_{-1} \in (0,1)$ so that the random variable $w$ is drawn from the density +```{note} +{cite}`alchian1950uncertainty` and {cite}`friedman1953essays` can conjectured that, by rewarding traders with more realistic probability models, competitive markets in financial securities put wealth in the hands of better informed traders and help +make prices of risky assets reflect realistic probability assessments. +``` + + +Here we'll provide an example that illustrates basic components of Blume and Easley's analysis. + +We'll focus only on their analysis of an environment with complete markets in which trades in all conceivable risky securities are possible. + +We'll study two alternative arrangements: + +* perfect socialism in which individuals surrender their endowments of consumption goods each period to a central planner who then dictatorially allocates those goods +* a decentralized system of competitive markets in which selfish price-taking individuals voluntarily trade with each other in competitive markets + +The fundamental theorems of welfare economics will apply and assure us that these two arrangements end up producing exactly the same allocation of consumption goods to individuals **provided** that the social planner assigns an appropriate set of **Pareto weights**. + + + +Let the random variable $s_t \in (0,1)$ at time $t =0, 1, 2, \ldots$ be distributed according to the same Beta distribution with parameters +$\theta = \{\theta_1, \theta_2\}$. + +We'll denote this probability density as $$ -h (w) = \pi_{-1} f(w) + (1-\pi_{-1}) g(w) +\pi(s_t|\theta) $$ -We assume that the statistician knows the densities $f$ and $g$ and also the mixing parameter $\pi_{-1}$. - -Below, we'll set $\pi_{-1} = .5$, although much of the analysis would follow through with other settings of $\pi_{-1} \in (0,1)$. +Below, we'll often just write $\pi(s_t)$ instead of $\pi(s_t|\theta)$ to save space. -We assume that $f$ and $g$ both put positive probabilities on the same intervals of possible realizations of the random variable $W$. +Let $s_t \equiv y_t^1$ be the endowment of a nonstorable consumption good that a person we'll call "agent 1" receives at time $t$. - +Let a history $s^t = [s_t, s_{t-1}, \ldots, s_0]$ be a sequence of i.i.d. random variables with joint distribution -In the simulations below, we specify that $f$ is a $\text{Beta}(1, 1)$ distribution and that $g$ is $\text{Beta}(3, 1.2)$ distribution. +$$ +\pi_t(s^t) = \pi(s_t) \pi(s_{t-1}) \cdots \pi(s_0) +$$ -We consider two alternative timing protocols. +So in our example, the history $s^t$ is a comprehensive record of agent $1$'s endowments of the consumption good from time $0$ up to time $t$. - * Timing protocol 1 is for the model selection problem - * Timing protocol 2 is for the individual classification problem +If agent $1$ were to live on an island by himself, agent $1$'s consumption $c^1(s_t)$ at time $t$ is -**Timing Protocol 1:** Nature flips a coin once at time $t=-1$ and with probability $\pi_{-1}$ generates a sequence $\{w_t\}_{t=1}^T$ -of IID draws from $f$ and with probability $1-\pi_{-1}$ generates a sequence $\{w_t\}_{t=1}^T$ -of IID draws from $g$. +$$c^1(s_t) = y_t^1 = s_t. $$ -Let's write some Python code that implements timing protocol 1. +But in our model, agent 1 is not alone. -```{code-cell} ipython3 -def protocol_1(π_minus_1, T, N=1000): - """ - Simulate Protocol 1: - Nature decides once at t=-1 which model to use. - """ - - # On-off coin flip for the true model - true_models_F = np.random.rand(N) < π_minus_1 - - sequences = np.empty((N, T)) - - n_f = np.sum(true_models_F) - n_g = N - n_f - if n_f > 0: - sequences[true_models_F, :] = np.random.beta(F_a, F_b, (n_f, T)) - if n_g > 0: - sequences[~true_models_F, :] = np.random.beta(G_a, G_b, (n_g, T)) - - return sequences, true_models_F -``` +### Nature and agents' beliefs -**Timing Protocol 2.** At each time $t \geq 0$, nature flips a coin and with probability $\pi_{-1}$ draws $w_t$ from $f$ and with probability $1-\pi_{-1}$ draws $w_t$ from $g$. +Nature draws i.i.d. sequences $\{s_t\}_{t=0}^\infty$ from $\pi_t(s^t)$. -Here is Python code that we'll use to implement timing protocol 2. +* so $\pi$ without a superscript is nature's model +* but in addition to nature, there are other entities inside our model -- artificial people that we call "agents" +* each agent has a sequence of probability distributions over $s^t$ for $t=0, \ldots$ +* agent $i$ thinks that nature draws i.i.d. sequences $\{s_t\}_{t=0}^\infty$ from $\pi_t^i(s^t)$ + * agent $i$ is mistaken unless $\pi_t^i(s^t) = \pi_t(s^t)$ -```{code-cell} ipython3 -def protocol_2(π_minus_1, T, N=1000): - """ - Simulate Protocol 2: - Nature decides at each time step which model to use. - """ - - # Coin flips for each time t upto T - true_models_F = np.random.rand(N, T) < π_minus_1 - - sequences = np.empty((N, T)) - - n_f = np.sum(true_models_F) - n_g = N * T - n_f - if n_f > 0: - sequences[true_models_F] = np.random.beta(F_a, F_b, n_f) - if n_g > 0: - sequences[~true_models_F] = np.random.beta(G_a, G_b, n_g) - - return sequences, true_models_F +```{note} +A **rational expectations** model would set $\pi_t^i(s^t) = \pi_t(s^t)$ for all agents $i$. ``` -**Remark:** Under timing protocol 2, the $\{w_t\}_{t=1}^T$ is a sequence of IID draws from $h(w)$. Under timing protocol 1, the the $\{w_t\}_{t=1}^T$ is -not IID. It is **conditionally IID** -- meaning that with probability $\pi_{-1}$ it is a sequence of IID draws from $f(w)$ and with probability $1-\pi_{-1}$ it is a sequence of IID draws from $g(w)$. For more about this, see {doc}`this lecture about exchangeability `. +There are two agents named $i=1$ and $i=2$. -We again deploy a **likelihood ratio process** with time $t$ component being the likelihood ratio +At time $t$, agent $1$ receives an endowment $$ -\ell (w_t)=\frac{f\left(w_t\right)}{g\left(w_t\right)},\quad t\geq1. +y_t^1 = s_t $$ -The **likelihood ratio process** for sequence $\left\{ w_{t}\right\} _{t=1}^{\infty}$ is +of a nonstorable consumption good, while agent $2$ receives an endowment of $$ -L\left(w^{t}\right)=\prod_{i=1}^{t} \ell (w_i), +y_t^2 = 1 - s_t $$ -For shorthand we'll write $L_t = L(w^t)$. +The aggregate endowment of the consumption good is -In the next cell, we write the likelihood ratio calculation that we have done [previously](nature_likeli) into a function +$$ +y_t^1 + y_t^2 = 1 +$$ -```{code-cell} ipython3 -def compute_likelihood_ratios(sequences): - """ - Compute likelihood ratios for given sequences. - """ - - l_ratios = f(sequences) / g(sequences) - L_cumulative = np.cumprod(l_ratios, axis=1) - return l_ratios, L_cumulative -``` +at each date $t \geq 0$. -## Model Selection Mistake Probability +At date $t$ agent $i$ consumes $c_t^i(s^t)$ of the good. -We first study a problem that assumes timing protocol 1. +A (non wasteful) feasible allocation of the aggregate endowment of $1$ each period satisfies -Consider a decision maker who wants to know whether model $f$ or model $g$ governs a data set of length $T$ observations. +$$ +c_t^1 + c_t^2 = 1 . +$$ -The decision makers has observed a sequence $\{w_t\}_{t=1}^T$. +### A social risk-sharing arrangement -On the basis of that observed sequence, a likelihood ratio test selects model $f$ when - $L_T \geq 1 $ and model $g$ when $L_T < 1$. - -When model $f$ generates the data, the probability that the likelihood ratio test selects the wrong model is +In order to share risks, a benevolent social planner will dictate a history-dependent consumption allocation in the form of a sequence of functions -$$ -p_f = {\rm Prob}\left(L_T < 1\Big| f\right) = \alpha_T . +$$ +c_t^i = c_t^i(s^t) $$ -When model $g$ generates the data, the probability that the likelihood ratio test selects the wrong model is +that satisfy -$$ -p_g = {\rm Prob}\left(L_T \geq 1 \Big|g \right) = \beta_T. $$ +c_t^1(s^t) + c_t^2(s^t) = 1 +$$ (eq:feasibility) -We can construct a probability that the likelihood ratio selects the wrong model by assigning a Bayesian prior probability of $\pi_{-1} = .5$ that nature selects model $f$ then averaging $p_f$ and $p_g$ to form the Bayesian posterior probability of a detection error equal to +for all $s^t$ for all $t \geq 0$. + +To design a socially optimal allocation, the social planner wants to know what agent $1$ believes about the endowment sequence and how they feel about bearing risks. +As for the endowment sequences, agent $i$ believes that nature draws i.i.d. sequences from joint densities + +$$ +\pi_t^i(s^t) = \pi(s_t)^i \pi^i(s_{t-1}) \cdots \pi^i(s_0) $$ -p(\textrm{wrong decision}) = {1 \over 2} (\alpha_T + \beta_T) . -$$ (eq:detectionerrorprob) -Now let's simulate timing protocol 1 and compute the error probabilities +As for attitudes toward bearing risks, agent $i$ has a one-period utility function -```{code-cell} ipython3 -# Set parameters -π_minus_1 = 0.5 -T_max = 30 -N_simulations = 10_000 +$$ +u(c_t^i) = u(c_t^i) = \ln (c_t^i) +$$ -sequences_p1, true_models_p1 = protocol_1( - π_minus_1, T_max, N_simulations) -l_ratios_p1, L_cumulative_p1 = compute_likelihood_ratios(sequences_p1) +with marginal utility of consumption in period $i$ -# Compute error probabilities for different sample sizes -T_range = np.arange(1, T_max + 1) +$$ +u'(c_t^i) = \frac{1}{c_t^i} +$$ -# Boolean masks for true models -mask_f = true_models_p1 -mask_g = ~true_models_p1 +Putting its beliefs about its random endowment sequence and its attitudes toward bearing risks together, agent $i$ has intertemporal utility function -# Select cumulative likelihoods for each model -L_f = L_cumulative_p1[mask_f, :] -L_g = L_cumulative_p1[mask_g, :] +$$ +V^i = \sum_{t=0}^{\infty} \sum_{s^t} \delta^t u(c_t^i(s^t)) \pi_t^i(s^t) , +$$ (eq:objectiveagenti) -α_T = np.mean(L_f < 1, axis=0) -β_T = np.mean(L_g >= 1, axis=0) +where $\delta \in (0,1)$ is an intertemporal discount factor, and $u(\cdot)$ is a strictly increasing, concave one-period utility function. -error_prob = 0.5 * (α_T + β_T) -# Plot results -fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5)) +### The social planner's allocation problem -ax1.plot(T_range, α_T, 'b-', - label=r'$\alpha_T$', linewidth=2) -ax1.plot(T_range, β_T, 'r-', - label=r'$\beta_T$', linewidth=2) -ax1.set_xlabel('$T$') -ax1.set_ylabel('error probability') -ax1.legend() +The benevolent dictator has all the information it requires to choose a consumption allocation that maximizes the social welfare criterion -ax2.plot(T_range, error_prob, 'g-', - label=r'$\frac{1}{2}(\alpha_T+\beta_T)$', linewidth=2) -ax2.set_xlabel('$T$') -ax2.set_ylabel('error probability') -ax2.legend() +$$ +W = \lambda V^1 + (1-\lambda) V^2 +$$ (eq:welfareW) -plt.tight_layout() -plt.show() +where $\lambda \in [0,1]$ is a Pareto weight tells how much the planner likes agent $1$ and $1 - \lambda$ is a Pareto weight that tells how much the social planner likes agent $2$. -print(f"At T={T_max}:") -print(f"α_{T_max} = {α_T[-1]:.4f}") -print(f"β_{T_max} = {β_T[-1]:.4f}") -print(f"Model selection error probability = {error_prob[-1]:.4f}") -``` +Setting $\lambda = .5$ expresses ''egalitarian'' social preferences. -Notice how the model selection error probability approaches zero as $T$ grows. +Notice how social welfare criterion {eq}`eq:welfareW` takes into account both agents' preferences as represented by formula {eq}`eq:objectiveagenti`. -## Classification +This means that the social planner knows and respects -We now consider a problem that assumes timing protocol 2. +* each agent's one period utility function $u(\cdot) = \ln(\cdot)$ +* each agent $i$'s probability model $\{\pi_t^i(s^t)\}_{t=0}^\infty$ -A decision maker wants to classify components of an observed sequence $\{w_t\}_{t=1}^T$ as having been drawn from either $f$ or $g$. +Consequently, we anticipate that these objects will appear in the social planner's rule for allocating the aggregate endowment each period. -The decision maker uses the following classification rule: -$$ -\begin{aligned} -w_t & \ {\rm is \ from \ f \ if \ } l_t > 1 \\ -w_t & \ {\rm is \ from \ g \ if \ } l_t \leq 1 . -\end{aligned} -$$ +First-order necessary conditions for maximizing welfare criterion {eq}`eq:welfareW` subject to the feasibility constraint {eq}`eq:feasibility` are -Under this rule, the expected misclassification rate is +$$\frac{\pi_t^2(s^t)}{\pi_t^1(s^t)} \frac{(1/c_t^2(s^t))}{(1/c_t^1(s^t))} = \frac{\lambda}{1 -\lambda}$$ -$$ -p(\textrm{misclassification}) = {1 \over 2} (\tilde \alpha_t + \tilde \beta_t) -$$ (eq:classerrorprob) +which can be rearranged to become -where $\tilde \alpha_t = {\rm Prob}(l_t < 1 \mid f)$ and $\tilde \beta_t = {\rm Prob}(l_t \geq 1 \mid g)$. -Since for each $t$, the decision boundary is the same, the decision boundary can be computed as -```{code-cell} ipython3 -root = brentq(lambda w: f(w) / g(w) - 1, 0.001, 0.999) -``` -we can plot the distributions of $f$ and $g$ and the decision boundary +$$ +\frac{c_t^1(s^t)}{c_t^2(s^t)} = \frac{\lambda}{1- \lambda} l_t(s^t) +$$ (eq:allocationrule0) -```{code-cell} ipython3 -:tags: [hide-input] -fig, ax = plt.subplots(figsize=(7, 6)) +where -w_range = np.linspace(1e-5, 1-1e-5, 1000) -f_values = [f(w) for w in w_range] -g_values = [g(w) for w in w_range] -ratio_values = [f(w)/g(w) for w in w_range] +$$ l_t(s^t) = \frac{\pi_t^1(s^t)}{\pi_t^2(s^t)} $$ -ax.plot(w_range, f_values, 'b-', - label=r'$f(w) \sim Beta(1,1)$', linewidth=2) -ax.plot(w_range, g_values, 'r-', - label=r'$g(w) \sim Beta(3,1.2)$', linewidth=2) - -type1_prob = 1 - beta_dist.cdf(root, F_a, F_b) -type2_prob = beta_dist.cdf(root, G_a, G_b) +is the likelihood ratio of agent 1's joint density to agent 2's joint density. -w_type1 = w_range[w_range >= root] -f_type1 = [f(w) for w in w_type1] -ax.fill_between(w_type1, 0, f_type1, alpha=0.3, color='blue', - label=fr'$\tilde \alpha_t = {type1_prob:.2f}$') +Using -w_type2 = w_range[w_range <= root] -g_type2 = [g(w) for w in w_type2] -ax.fill_between(w_type2, 0, g_type2, alpha=0.3, color='red', - label=fr'$\tilde \beta_t = {type2_prob:.2f}$') +$$c_t^1(s^t) + c_t^2(s^t) = 1$$ -ax.axvline(root, color='green', linestyle='--', alpha=0.7, - label=f'decision boundary: $w=${root:.3f}') +we can rewrite allocation rule {eq}`eq:allocationrule0` as -ax.set_xlabel('w') -ax.set_ylabel('probability density') -ax.legend() -plt.tight_layout() -plt.show() -``` -To the left of the green vertical line $g < f$, so $l_t < 1$; therefore a $w_t$ that falls to the left of the green line is classified as a type $g$ individual. +$$\frac{c_t^1(s^t)}{1 - c_t^1(s^t)} = \frac{\lambda}{1-\lambda} l_t(s^t)$$ - * The shaded orange area equals $\beta$ -- the probability of classifying someone as a type $g$ individual when it is really a type $f$ individual. +or -To the right of the green vertical line $g > f$, so $l_t >1 $; therefore a $w_t$ that falls to the right of the green line is classified as a type $f$ individual. +$$c_t^1(s^t) = \frac{\lambda}{1-\lambda} l_t(s^t)(1 - c_t^1(s^t))$$ - * The shaded blue area equals $\alpha$ -- the probability of classifying someone as a type $f$ when it is really a type $g$ individual. +which implies that the social planner's allocation rule is -This gives us clues about how to compute the theoretical classification error probability +$$ +c_t^1(s^t) = \frac{\lambda l_t(s^t)}{1-\lambda + \lambda l_t(s^t)} +$$ (eq:allocationrule1) -```{code-cell} ipython3 -# Compute theoretical tilde α_t and tilde β_t -def α_integrand(w): - """Integrand for tilde α_t = P(l_t < 1 | f)""" - return f(w) if f(w) / g(w) < 1 else 0 +If we define a temporary or **continuation Pareto weight** process as -def β_integrand(w): - """Integrand for tilde β_t = P(l_t >= 1 | g)""" - return g(w) if f(w) / g(w) >= 1 else 0 +$$ +\lambda_t(s^t) = \frac{\lambda l_t(s^t)}{1-\lambda + \lambda l_t(s^t)}, +$$ -# Compute the integrals -α_theory, _ = quad(α_integrand, 0, 1, limit=100) -β_theory, _ = quad(β_integrand, 0, 1, limit=100) +then we can represent the social planner's allocation rule as -theory_error = 0.5 * (α_theory + β_theory) +$$ +c_t^1(s^t) = \lambda_t(s^t) . +$$ -print(f"theoretical tilde α_t = {α_theory:.4f}") -print(f"theoretical tilde β_t = {β_theory:.4f}") -print(f"theoretical classification error probability = {theory_error:.4f}") -``` -Now we simulate timing protocol 2 and compute the classification error probability. -In the next cell, we also compare the theoretical classification accuracy to the empirical classification accuracy -```{code-cell} ipython3 -accuracy = np.empty(T_max) +### If you're so smart, $\ldots$ -sequences_p2, true_sources_p2 = protocol_2( - π_minus_1, T_max, N_simulations) -l_ratios_p2, _ = compute_likelihood_ratios(sequences_p2) -for t in range(T_max): - predictions = (l_ratios_p2[:, t] >= 1) - actual = true_sources_p2[:, t] - accuracy[t] = np.mean(predictions == actual) +Let's compute some values of limiting allocations {eq}`eq:allocationrule1` for some interesting possible limiting +values of the likelihood ratio process $l_t(s^t)$: -plt.figure(figsize=(10, 6)) -plt.plot(range(1, T_max + 1), accuracy, - 'b-', linewidth=2, label='empirical accuracy') -plt.axhline(1 - theory_error, color='r', linestyle='--', - label=f'theoretical accuracy = {1 - theory_error:.4f}') -plt.xlabel('$t$') -plt.ylabel('accuracy') -plt.legend() -plt.ylim(0.5, 1.0) -plt.show() -``` + $$l_\infty (s^\infty)= 1; \quad c_\infty^1 = \lambda$$ + + * In the above case, both agents are equally smart (or equally not smart) and the consumption allocation stays put at a $\lambda, 1 - \lambda $ split between the two agents. -Let's watch decisions made by the two timing protocols as more and more observations accrue. +$$l_\infty (s^\infty) = 0; \quad c_\infty^1 = 0$$ -```{code-cell} ipython3 -fig, ax = plt.subplots(figsize=(7, 6)) +* In the above case, agent 2 is smarter than agent 1, and agent 1's share of the aggregate endowment converges to zero. -ax.plot(T_range, error_prob, linewidth=2, - label='Protocol 1') -ax.plot(T_range, 1-accuracy, linestyle='--', linewidth=2, - label=f'Protocol 2') -ax.set_ylabel('error probability') -ax.legend() -plt.show() -``` -From the figure above, we can see: -- For both timing protocols, the error probability starts at the same level, subject to a little randomness. +$$l_\infty (s^\infty)= \infty; \quad c_\infty^1 = 1$$ -- For timing protocol 1, the error probability decreases as the sample size increases because we are making just **one** decision -- i.e., selecting whether $f$ or $g$ governs **all** individuals. More data provides better evidence. +* In the above case, agent 1 is smarter than agent 2, and agent 1's share of the aggregate endowment converges to 1. -- For timing protocol 2, the error probability remains constant because we are making **many** decisions -- one classification decision for each observation. -**Remark:** Think about how laws of large numbers are applied to compute error probabilities for the model selection problem and the classification problem. +Soon we'll do some simulations that will shed further light on possible outcomes. -## Measuring discrepancies between distributions +But before we do that, let's take a detour and study some "shadow prices" for the social planning problem that can readily be +converted to "equilibrium prices" for a competitive equilibrium. -A plausible guess is that the ability of a likelihood ratio to distinguish distributions $f$ and $g$ depends on how "different" they are. - -But how should we measure discrepancies between distributions? +Doing this will allow us to connect our analysis with an argument of {cite}`alchian1950uncertainty` and {cite}`friedman1953essays` that competitive market processes can make prices of risky assets better reflect realistic probability assessments. -We've already encountered one discrepancy measure -- the Kullback-Leibler (KL) divergence. -We now briefly explore two alternative discrepancy measures. -### Chernoff entropy +### Competitive Equilibrium Prices -Chernoff entropy was motivated by an early application of the [theory of large deviations](https://en.wikipedia.org/wiki/Large_deviations_theory). +The two fundamental welfare theorems for general equilibrium models lead us to anticipate that there is a connection between the allocation that solves the social planning problem we have been studying and the allocation in a **competitive equilibrium** with complete markets in history-contingent commodities. ```{note} -Large deviation theory provides refinements of the central limit theorem. +For the two welfare theorems and their history, see . ``` -The Chernoff entropy between probability densities $f$ and $g$ is defined as: - -$$ -C(f,g) = - \log \min_{\phi \in (0,1)} \int f^\phi(x) g^{1-\phi}(x) dx -$$ +Such a connection prevails for our model. -An upper bound on model selection error probabilty is +We'll sketch it now. -$$ -e^{-C(f,g)T} . -$$ +In a competitive equilibrium, there is no social planner that dictatorially collects everybody's endowments and then reallocates them. -Thus, Chernoff entropy is an upper bound on the exponential rate at which the selection error probability falls as sample size $T$ grows. +Instead, there is a comprehensive centralized market that meets at one point in time. -Let's compute Chernoff entropy numerically with some Python code +There are **prices** at which price-taking agents can buy or sell whatever goods that they want. -```{code-cell} ipython3 -def chernoff_integrand(ϕ, f, g): - """ - Compute the integrand for Chernoff entropy - """ - def integrand(w): - return f(w)**ϕ * g(w)**(1-ϕ) +Trade is multilateral in the sense that all that there is a "Walrasian auctioneer" who lives outside the model and whose job is to verify that +each agent's budget constraint is satisfied. - result, _ = quad(integrand, 1e-5, 1-1e-5) - return result +That budget constraint involves the total value of the agent's endowment stream and the total value of its consumption stream. -def compute_chernoff_entropy(f, g): - """ - Compute Chernoff entropy C(f,g) - """ - def objective(ϕ): - return chernoff_integrand(ϕ, f, g) - - # Find the minimum over ϕ in (0,1) - result = minimize_scalar(objective, - # For numerical stability - bounds=(1e-5, 1-1e-5), - method='bounded') - min_value = result.fun - ϕ_optimal = result.x - - chernoff_entropy = -np.log(min_value) - return chernoff_entropy, ϕ_optimal +Suppose that at time $-1$, before time $0$ starts, agent $i$ can purchase one unit $c_t(s^t)$ of consumption at time $t$ after history +$s^t$ at price $p_t(s^t)$. -C_fg, ϕ_optimal = compute_chernoff_entropy(f, g) -print(f"Chernoff entropy C(f,g) = {C_fg:.4f}") -print(f"Optimal ϕ = {ϕ_optimal:.4f}") -``` +Notice that there is (very long) **vector** of prices. -Now let's examine how $e^{-C(f,g)T}$ behaves as a function of $T$ and compare it to the model selection error probability +We want to study how agents' diverse beliefs influence equilibrium prices. -```{code-cell} ipython3 -T_range = np.arange(1, T_max+1) -chernoff_bound = np.exp(-C_fg * T_range) +Agent $i$ faces a **single** intertemporal budget constraint -# Plot comparison -fig, ax = plt.subplots(figsize=(10, 6)) +$$ +\sum_{t=0}\sum_{s^t} p_t(s^t) c_t^i (y_t(s^t)) \leq \sum_{t=0}\sum_{s^t} p_t(s^t) y_t^i (y_t(s^t)) +$$ (eq:budgetI) -ax.semilogy(T_range, chernoff_bound, 'r-', linewidth=2, - label=f'$e^{{-C(f,g)T}}$') -ax.semilogy(T_range, error_prob, 'b-', linewidth=2, - label='Model selection error probability') +Agent $i$ puts a Lagrange multiplier $\mu^i$ on {eq}`eq:budgetI` and once-and-for-all chooses a consumption plan $\{c^i_t(s^t)\}_{t=0}^\infty$ +to maximize criterion {eq}`eq:objectiveagenti` subject to budget constraint {eq}`eq:budgetI`. -ax.set_xlabel('T') -ax.set_ylabel('error probability (log scale)') -ax.legend() -plt.tight_layout() -plt.show() +```{note} +For convenience, let's remind ourselves of criterion {eq}`eq:objectiveagenti`: +$ +V^i = \sum_{t=0}^{\infty} \sum_{s^t} \delta^t u_t(c_t^i(s^t)) \pi_t^i(s^t)$ ``` -Evidently, $e^{-C(f,g)T}$ is an upper bound on the error rate. +First-order conditions for maximizing with respect to $c_t^i(s^t)$ are -### Jensen-Shannon divergence +$$ +\delta^t u'(c^i(s^t)) \pi_t^i(s^t) = \mu_i p_t(s^t) , +$$ -The [Jensen-Shannon divergence](https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence) is another divergence measure. +which we can rearrange to obtain -For probability densities $f$ and $g$, the **Jensen-Shannon divergence** is defined as: +$$ +p_t(s^t) = \frac{ \delta^t \pi_t^i(s^t)}{\mu^i c^i(s^t)} +$$ (eq:priceequation1) + +for $i=1,2$. + +If we divide equation {eq}`eq:priceequation1` for agent $1$ by the appropriate version of equation {eq}`eq:priceequation1` for agent 2, use +$c^2_t(s^t) = 1 - c^1_t(s^t)$, and do some algebra, we'll obtain $$ -D(f,g) = \frac{1}{2} KL(f, m) + \frac{1}{2} KL(g, m) -$$ (eq:js_divergence) +c_t^1(s^t) = \frac{\mu_1 l_t(s^t)}{\mu_2 + \mu_1 l_t(s^t)} . +$$ (eq:allocationce) -where $m = \frac{1}{2}(f+g)$ is a mixture of $f$ and $g$. - -```{note} -We studied KL divergence in the [section above](rel_entropy) with respect to a reference distribution $h$. +We now engage in an extended "guess-and-verify" exercise that involves matching objects in our competitive equilibrium with objects in +our social planning problem. -Recall that KL divergence $KL(f, g)$ measures expected excess surprisal from using misspecified model $g$ instead $f$ when $f$ is the true model. +* we'll match consumption allocations in the planning problem with equilibrium consumption allocations in the competitive equilibrium +* we'll match "shadow" prices in the planning problem with competitive equilibrium prices. -Because in general $KL(f, g) \neq KL(g, f)$, KL divergence is not symmetric, but Jensen-Shannon divergence is symmetric. +Notice that if we set $\mu_1 = \lambda$ and $\mu_2 = 1 -\lambda$, then formula {eq}`eq:allocationce` agrees with formula +{eq}`eq:allocationrule1`. -(In fact, the square root of the Jensen-Shannon divergence is a metric referred to as the Jensen-Shannon distance.) + * doing this amounts to choosing a **numeraire** or normalization for the price system $\{p_t(s^t)\}_{t=0}^\infty$ -As {eq}`eq:js_divergence` shows, the Jensen-Shannon divergence computes average of the KL divergence of $f$ and $g$ with respect to a particular reference distribution $m$ defined below the equation. +```{note} +For information about how a numeraire must be chosen to pin down the absolute price level in a model like ours that determines only +relative prices, see . ``` -Now let's create a comparison table showing KL divergence, Jensen-Shannon divergence, and Chernoff entropy for a set of pairs of Beta distributions. +If we substitute formula {eq}`eq:allocationce` for $c_t^1(s^t)$ into formula {eq}`eq:priceequation1` and rearrange, we obtain -```{code-cell} ipython3 -def js_divergence(f, g): - """ - Compute Jensen-Shannon divergence - """ - def m(w): - return 0.5 * (f(w) + g(w)) - - def kl_div(p, q): - def integrand(w): - ratio = p(w) / q(w) - return p(w) * np.log(ratio) - result, _ = quad(integrand, 1e-5, 1-1e-5) - return result - - js_div = 0.5 * kl_div(f, m) + 0.5 * kl_div(g, m) - return js_div +$$ +p_t(s^t) = \frac{\delta^t \pi_t^2(s^t)}{1 - \lambda + \lambda l_t(s^t)} +$$ (eq:pformulafinal) -def kl_divergence(f, g): - """ - Compute KL divergence KL(f, g) - """ - def integrand(w): - return f(w) * np.log(f(w) / g(w)) - - result, _ = quad(integrand, 1e-5, 1-1e-5) - return result +According to formula {eq}`eq:pformulafinal`, we have the following possible limiting cases: -distribution_pairs = [ - # (f_params, g_params) - ((1, 1), (0.1, 0.2)), - ((1, 1), (0.3, 0.3)), - ((1, 1), (0.3, 0.4)), - ((1, 1), (0.5, 0.5)), - ((1, 1), (0.7, 0.6)), - ((1, 1), (0.9, 0.8)), - ((1, 1), (1.1, 1.05)), - ((1, 1), (1.2, 1.1)), - ((1, 1), (1.5, 1.2)), - ((1, 1), (2, 1.5)), - ((1, 1), (2.5, 1.8)), - ((1, 1), (3, 1.2)), - ((1, 1), (4, 1)), - ((1, 1), (5, 1)) -] +* when $l_\infty = 0$, $c_\infty^2 = 0 $ and tails of competitive equilibrium prices reflect agent $2$'s probability model $\pi_t^2(s^t)$ +* when $l_\infty = 1$, $c_\infty^1 = 0 $ and tails competitive equilibrium prices reflect agent $1$'s probability model $\pi_t^2(s^t)$ +* for small $t$'s, competitive equilbrium prices reflect both agents' probability models. -# Create comparison table -results = [] -for i, ((f_a, f_b), (g_a, g_b)) in enumerate(distribution_pairs): - # Define the density functions - f = jit(lambda x, a=f_a, b=f_b: p(x, a, b)) - g = jit(lambda x, a=g_a, b=g_b: p(x, a, b)) - - # Compute measures - kl_fg = kl_divergence(f, g) - kl_gf = kl_divergence(g, f) - js_div = js_divergence(f, g) - chernoff_ent, _ = compute_chernoff_entropy(f, g) - - results.append({ - 'Pair': f"f=Beta({f_a},{f_b}), g=Beta({g_a},{g_b})", - 'KL(f, g)': f"{kl_fg:.4f}", - 'KL(g, f)': f"{kl_gf:.4f}", - 'JS divergence': f"{js_div:.4f}", - 'Chernoff entropy': f"{chernoff_ent:.4f}" - }) +### Simulations -df = pd.DataFrame(results) -print(df.to_string(index=False)) -``` +Now let's implement some simulations when agent $1$ believes marginal density -The above table indicates how Jensen-Shannon divergence, and Chernoff entropy, and KL divergence covary as we alter $f$ and $g$. +$$\pi^1(s_t) = f(s_t) $$ -Let's also visualize how these diverge measures covary +and agent $2$ believes marginal density -```{code-cell} ipython3 -kl_fg_values = [float(result['KL(f, g)']) for result in results] -js_values = [float(result['JS divergence']) for result in results] -chernoff_values = [float(result['Chernoff entropy']) for result in results] +$$ \pi^2(s_t) = g(s_t) $$ -fig, axes = plt.subplots(1, 2, figsize=(12, 5)) +where $f$ and $g$ are Beta distributions like ones that we used in earlier sections of this lecture. -# JS divergence and KL divergence -axes[0].scatter(kl_fg_values, js_values, alpha=0.7, s=60) -axes[0].set_xlabel('KL divergence KL(f, g)') -axes[0].set_ylabel('JS divergence') -axes[0].set_title('JS divergence and KL divergence') +Meanwhile, we'll assume that nature believes a marginal density -# Chernoff Entropy and JS divergence -axes[1].scatter(js_values, chernoff_values, alpha=0.7, s=60) -axes[1].set_xlabel('JS divergence') -axes[1].set_ylabel('Chernoff entropy') -axes[1].set_title('Chernoff entropy and JS divergence') +$$ +\pi(s_t) = h(s_t) +$$ -plt.tight_layout() -plt.show() +where $h(s_t)$ is perhaps a mixture of $f$ and $g$. + +Let's write a Python function that computes agent 1's consumption share + +```{code-cell} ipython3 +def simulate_blume_easley(sequences, f_belief=f, g_belief=g, λ=0.5): + """Simulate Blume-Easley model consumption shares.""" + l_ratios, l_cumulative = compute_likelihood_ratios(sequences, f_belief, g_belief) + c1_share = λ * l_cumulative / (1 - λ + λ * l_cumulative) + return l_cumulative, c1_share ``` -To make the comparison more concrete, let's plot the distributions and the divergence measures for a few pairs of distributions. +Now let's use this function to generate sequences in which -Note that the numbers on the title changes with the area of the overlaps of two distributions +* nature draws from $f$ each period, or +* nature draws from $g$ each period, or +* or nature flips a fair coin each period to decide whether to draw from $f$ or $g$ ```{code-cell} ipython3 -:tags: [hide-input] +λ = 0.5 +T = 100 +N = 10000 -def plot_dist_diff(): - """ - Plot overlap of two distributions and divergence measures - """ - - # Chose a subset of Beta distribution parameters - param_grid = [ - ((1, 1), (1, 1)), - ((1, 1), (1.5, 1.2)), - ((1, 1), (2, 1.5)), - ((1, 1), (3, 1.2)), - ((1, 1), (5, 1)), - ((1, 1), (0.3, 0.3)) - ] - - fig, axes = plt.subplots(3, 2, figsize=(15, 12)) - - divergence_data = [] - - for i, ((f_a, f_b), (g_a, g_b)) in enumerate(param_grid): - row = i // 2 - col = i % 2 - - # Create density functions - f = jit(lambda x, a=f_a, b=f_b: p(x, a, b)) - g = jit(lambda x, a=g_a, b=g_b: p(x, a, b)) - - # Compute divergence measures - kl_fg = kl_divergence(f, g) - js_div = js_divergence(f, g) - chernoff_ent, _ = compute_chernoff_entropy(f, g) - - divergence_data.append({ - 'f_params': (f_a, f_b), - 'g_params': (g_a, g_b), - 'kl_fg': kl_fg, - 'js_div': js_div, - 'chernoff': chernoff_ent - }) - - # Plot distributions - x_range = np.linspace(0, 1, 200) - f_vals = [f(x) for x in x_range] - g_vals = [g(x) for x in x_range] - - axes[row, col].plot(x_range, f_vals, 'b-', linewidth=2, - label=f'f ~ Beta({f_a},{f_b})') - axes[row, col].plot(x_range, g_vals, 'r-', linewidth=2, - label=f'g ~ Beta({g_a},{g_b})') - - # Fill overlap region - overlap = np.minimum(f_vals, g_vals) - axes[row, col].fill_between(x_range, 0, overlap, alpha=0.3, - color='purple', label='overlap') - - # Add divergence information - axes[row, col].set_title( - f'KL(f, g)={kl_fg:.3f}, JS={js_div:.3f}, C={chernoff_ent:.3f}', - fontsize=12) - axes[row, col].legend(fontsize=14) - - plt.tight_layout() - plt.show() - - return divergence_data +# Nature follows f, g, or mixture +s_seq_f = np.random.beta(F_a, F_b, (N, T)) +s_seq_g = np.random.beta(G_a, G_b, (N, T)) -divergence_data = plot_dist_diff() +h = jit(lambda x: 0.5 * f(x) + 0.5 * g(x)) +model_choices = np.random.rand(N, T) < 0.5 +s_seq_h = np.empty((N, T)) +s_seq_h[model_choices] = np.random.beta(F_a, F_b, size=model_choices.sum()) +s_seq_h[~model_choices] = np.random.beta(G_a, G_b, size=(~model_choices).sum()) + +l_cum_f, c1_f = simulate_blume_easley(s_seq_f) +l_cum_g, c1_g = simulate_blume_easley(s_seq_g) +l_cum_h, c1_h = simulate_blume_easley(s_seq_h) ``` -### Error probability and divergence measures +Before looking at the figure below, have some fun by guessing whether agent 1 or agent 2 will have a larger and larger consumption share as time passes in our three cases. -Now let's return to our guess that the error probability at large sample sizes is related to the Chernoff entropy between two distributions. +To make better guesses, let's visualize instances of the likelihood ratio processes in the three cases. -We verify this by computing the correlation between the log of the error probability at $T=50$ under Timing Protocol 1 and the divergence measures. +```{code-cell} ipython3 +fig, axes = plt.subplots(2, 3, figsize=(15, 10)) -In the simulation below, nature draws $N / 2$ sequences from $g$ and $N/2$ sequences from $f$. +titles = ["Nature = f", "Nature = g", "Nature = mixture"] +data_pairs = [(l_cum_f, c1_f), (l_cum_g, c1_g), (l_cum_h, c1_h)] -```{note} -Nature does this rather than flipping a fair coin to decide whether to draw from $g$ or $f$ once and for all before each simulation of length $T$. -``` +for i, ((l_cum, c1), title) in enumerate(zip(data_pairs, titles)): + # Likelihood ratios + ax = axes[0, i] + for j in range(min(50, l_cum.shape[0])): + ax.plot(l_cum[j, :], alpha=0.3, color='blue') + ax.set_yscale('log') + ax.set_xlabel('time') + ax.set_ylabel('Likelihood ratio $l_t$') + ax.set_title(title) + ax.axhline(y=1, color='red', linestyle='--', alpha=0.5) -```{code-cell} ipython3 -def compute_likelihood_ratio_stats(sequences, f, g): - """Compute likelihood ratios and cumulative products.""" - l_ratios = f(sequences) / g(sequences) - L_cumulative = np.cumprod(l_ratios, axis=1) - return l_ratios, L_cumulative + # Consumption shares + ax = axes[1, i] + for j in range(min(50, c1.shape[0])): + ax.plot(c1[j, :], alpha=0.3, color='green') + ax.set_xlabel('time') + ax.set_ylabel("Agent 1's consumption share") + ax.set_ylim([0, 1]) + ax.axhline(y=λ, color='red', linestyle='--', alpha=0.5) -def error_divergence_cor(): - """ - compute correlation between error probabilities and divergence measures. - """ - - # Parameters for simulation - T_large = 50 - N_sims = 5000 - N_half = N_sims // 2 - - # Initialize arrays - n_pairs = len(distribution_pairs) - kl_fg_vals = np.zeros(n_pairs) - kl_gf_vals = np.zeros(n_pairs) - js_vals = np.zeros(n_pairs) - chernoff_vals = np.zeros(n_pairs) - error_probs = np.zeros(n_pairs) - pair_names = [] - - for i, ((f_a, f_b), (g_a, g_b)) in enumerate(distribution_pairs): - # Create density functions - f = jit(lambda x, a=f_a, b=f_b: p(x, a, b)) - g = jit(lambda x, a=g_a, b=g_b: p(x, a, b)) +plt.tight_layout() +plt.show() +``` - # Compute divergence measures - kl_fg_vals[i] = kl_divergence(f, g) - kl_gf_vals[i] = kl_divergence(g, f) - js_vals[i] = js_divergence(f, g) - chernoff_vals[i], _ = compute_chernoff_entropy(f, g) +In the left panel, nature chooses $f$. Agent 1's consumption reaches $1$ very quickly. - # Generate samples - sequences_f = np.random.beta(f_a, f_b, (N_half, T_large)) - sequences_g = np.random.beta(g_a, g_b, (N_half, T_large)) +In the middle panel, nature chooses $g$. Agent 1's consumption ratio tends to move towards $0$ but not as fast as in the first case. +In the right panel, nature flips coins each period. We see a very similar pattern to the processes in the left panel. +The figures in the top panel remind us of the discussion in [this section](KL_link). - # Compute likelihood ratios and cumulative products - _, L_cumulative_f = compute_likelihood_ratio_stats(sequences_f, f, g) - _, L_cumulative_g = compute_likelihood_ratio_stats(sequences_g, f, g) - - # Get final values - L_cumulative_f = L_cumulative_f[:, -1] - L_cumulative_g = L_cumulative_g[:, -1] - - # Calculate error probabilities - error_probs[i] = 0.5 * (np.mean(L_cumulative_f < 1) + - np.mean(L_cumulative_g >= 1)) - pair_names.append(f"Beta({f_a},{f_b}) and Beta({g_a},{g_b})") - - return { - 'kl_fg': kl_fg_vals, - 'kl_gf': kl_gf_vals, - 'js': js_vals, - 'chernoff': chernoff_vals, - 'error_prob': error_probs, - 'names': pair_names, - 'T': T_large - } +We invite readers to revisit [that section](rel_entropy) and try to infer the relationships among $KL(f, g)$, $KL(g, f)$, $KL(h, f)$, and $KL(h,g)$. -cor_data = error_divergence_cor() -``` -Now let's visualize the correlations +Let's compute values of KL divergence ```{code-cell} ipython3 -:tags: [hide-input] - -def plot_error_divergence(data): - """ - Plot correlations between error probability and divergence measures. - """ - # Filter out near-zero error probabilities for log scale - nonzero_mask = data['error_prob'] > 1e-6 - log_error = np.log(data['error_prob'][nonzero_mask]) - js_vals = data['js'][nonzero_mask] - chernoff_vals = data['chernoff'][nonzero_mask] +shares = [np.mean(c1_f[:, -1]), np.mean(c1_g[:, -1]), np.mean(c1_h[:, -1])] +Kf_g, Kg_f = compute_KL(f, g), compute_KL(g, f) +Kf_h, Kg_h = compute_KL_h(h, f, g) - # Create figure and axes - fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5)) - - # function for plotting correlation - def plot_correlation(ax, x_vals, x_label, color): - ax.scatter(x_vals, log_error, alpha=0.7, s=60, color=color) - ax.set_xlabel(x_label) - ax.set_ylabel(f'log(Error probability) at T={data["T"]}') - - # Calculate correlation and trend line - corr = np.corrcoef(x_vals, log_error)[0, 1] - z = np.polyfit(x_vals, log_error, 2) - x_trend = np.linspace(x_vals.min(), x_vals.max(), 100) - ax.plot(x_trend, np.poly1d(z)(x_trend), - "r--", alpha=0.8, linewidth=2) - ax.set_title(f'Log error probability and {x_label}\n' - f'Correlation = {corr:.3f}') - - # Plot both correlations - plot_correlation(ax1, js_vals, 'JS divergence', 'C0') - plot_correlation(ax2, chernoff_vals, 'Chernoff entropy', 'C1') +print(f"Final shares: f={shares[0]:.3f}, g={shares[1]:.3f}, mix={shares[2]:.3f}") +print(f"KL divergences: \nKL(f,g)={Kf_g:.3f}, KL(g,f)={Kg_f:.3f}") +print(f"KL(h,f)={Kf_h:.3f}, KL(h,g)={Kg_h:.3f}") +``` - plt.tight_layout() - plt.show() +We find that $KL(f,g) > KL(g,f)$ and $KL(h,g) > KL(h,f)$. -plot_error_divergence(cor_data) -``` +The first inequality tells us that the average "surprise" from having belief $g$ when nature chooses $f$ is greater than the "surprise" from having belief $f$ when nature chooses $g$. -Evidently, Chernoff entropy and Jensen-Shannon entropy each covary tightly with the model selection error probability. +This explains the difference between the first two panels we noted above. -We'll encounter related ideas in {doc}`wald_friedman`. +The second inequality tells us that agent 1's belief distribution $f$ is closer to nature's pick than agent 2's belief $g$. +++ -(hetero_agent)= -## Consumption and Heterogeneous Beliefs +To make this idea more concrete, let's compare two cases: -A likelihood ratio process lies behind Lawrence Blume and David Easley's answer to their question -''If you're so smart, why aren't you rich?'' {cite}`blume2006if`. +- agent 1's belief distribution $f$ is close to agent 2's belief distribution $g$; +- agent 1's belief distribution $f$ is far from agent 2's belief distribution $g$. -Here we'll provide an example that illustrates basic components of their analysis. -Let the random variable $s_t \in (0,1)$ at time $t =0, 1, 2, \ldots$ be distributed according to the same Beta distribution with parameters -$\theta = [\theta_1, \theta_2]$. - -We'll denote this probability density as +We use the two distributions visualized below -$$ -\pi(s_t|\theta) -$$ +```{code-cell} ipython3 +def plot_distribution_overlap(ax, x_range, f_vals, g_vals, + f_label='f', g_label='g', + f_color='blue', g_color='red'): + """Plot two distributions with their overlap region.""" + ax.plot(x_range, f_vals, color=f_color, linewidth=2, label=f_label) + ax.plot(x_range, g_vals, color=g_color, linewidth=2, label=g_label) + + overlap = np.minimum(f_vals, g_vals) + ax.fill_between(x_range, 0, overlap, alpha=0.3, color='purple', label='Overlap') + ax.set_xlabel('x') + ax.set_ylabel('Density') + ax.legend() + +# Define close and far belief distributions +f_close = jit(lambda x: p(x, 1, 1)) +g_close = jit(lambda x: p(x, 1.1, 1.05)) -Below, we'll often just write $\pi(s_t)$ instead of $\pi(s_t|\theta)$ to save space. +f_far = jit(lambda x: p(x, 1, 1)) +g_far = jit(lambda x: p(x, 3, 1.2)) -Let $s_t \equiv y_t^1$ be the endowment of a nonstorable consumption good that a person we'll call "agent 1" receives at time $t$. +# Visualize the belief distributions +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5)) -Let a history $s^t = [s_t, s_{t-1}, \ldots, s_0]$ be a sequence of i.i.d. random variables with joint distribution +x_range = np.linspace(0.001, 0.999, 200) -$$ -\pi_t(s^t) = \pi(s_t) \pi(s_{t-1}) \cdots \pi(s_0) -$$ +# Close beliefs +f_close_vals = [f_close(x) for x in x_range] +g_close_vals = [g_close(x) for x in x_range] +plot_distribution_overlap(ax1, x_range, f_close_vals, g_close_vals, + f_label='f (Beta(1, 1))', g_label='g (Beta(1.1, 1.05))') +ax1.set_title(f'Close Beliefs') -So in our example, the history $s^t$ is a comprehensive record of agent $1$'s endowments of the consumption good from time $0$ up to time $t$. +# Far beliefs +f_far_vals = [f_far(x) for x in x_range] +g_far_vals = [g_far(x) for x in x_range] +plot_distribution_overlap(ax2, x_range, f_far_vals, g_far_vals, + f_label='f (Beta(1, 1))', g_label='g (Beta(3, 1.2))') +ax2.set_title(f'Far Beliefs') -If agent $1$ were to lives on an island by himself, agent $1$'s consumption $c^1(s_t)$ at time $t$ is +plt.tight_layout() +plt.show() +``` -$$c^1(s_t) = y_t^1 = s_t. $$ +Let's draw the same consumption ratio plots as above for agent 1. -But in our model, agent 1 is not alone. +We replace the simulation paths with median and percentiles to make the figure cleaner. -### Nature and beliefs +Staring at the figure below, can we infer the relation between $KL(f,g)$ and $KL(g,f)$? -Nature draws i.i.d. sequences $\{s_t\}_{t=0}^\infty$ from $\pi_t(s^t)$. +From the right panel, can we infer the relation between $KL(h,g)$ and $KL(h,f)$? -* so $\pi$ without a superscript is nature's model -* but in addition to nature, there are other entities inside our model -- artificial people that we call "agents" -* each agent has a sequence of probability distributions over $s^t$ for $t=0, \ldots$ -* agent $i$ thinks that nature draws i.i.d. sequences $\{s_t\}_{t=0}^\infty$ from $\pi_t^i(s^t)$ - * agent $i$ is mistaken unless $\pi_t^i(s^t) = \pi_t(s^t)$ +```{code-cell} ipython3 +fig, axes = plt.subplots(2, 3, figsize=(15, 10)) +nature_params = {'close': [(1, 1), (1.1, 1.05), (2, 1.5)], + 'far': [(1, 1), (3, 1.2), (2, 1.5)]} +nature_labels = ["Nature = f", "Nature = g", "Nature = h"] +colors = {'close': 'blue', 'far': 'red'} -```{note} -A **rational expectations** model would set $\pi_t^i(s^t) = \pi_t(s^t)$ for all agents $i$. -``` +threshold = 1e-5 # "close to zero" cutoff +for row, (f_belief, g_belief, label) in enumerate([ + (f_close, g_close, 'close'), + (f_far, g_far, 'far')]): + + for col, nature_label in enumerate(nature_labels): + params = nature_params[label][col] + s_seq = np.random.beta(params[0], params[1], (1000, 200)) + _, c1 = simulate_blume_easley(s_seq, f_belief, g_belief, λ) + + median_c1 = np.median(c1, axis=0) + p10, p90 = np.percentile(c1, [10, 90], axis=0) + + ax = axes[row, col] + color = colors[label] + ax.plot(median_c1, color=color, linewidth=2, label='Median') + ax.fill_between(range(len(median_c1)), p10, p90, alpha=0.3, color=color, label='10–90%') + ax.set_xlabel('time') + ax.set_ylabel("Agent 1's share") + ax.set_ylim([0, 1]) + ax.set_title(nature_label) + ax.axhline(y=λ, color='gray', linestyle='--', alpha=0.5) + below = np.where(median_c1 < threshold)[0] + above = np.where(median_c1 > 1-threshold)[0] + if below.size > 0: first_zero = (below[0], True) + elif above.size > 0: first_zero = (above[0], False) + else: first_zero = None + if first_zero is not None: + ax.axvline(x=first_zero[0], color='black', linestyle='--', + alpha=0.7, + label=fr'Median $\leq$ {threshold}' if first_zero[1] + else fr'Median $\geq$ 1-{threshold}') + ax.legend() +plt.tight_layout() +plt.show() +``` -There are two agents named $i=1$ and $i=2$. +Holding to our guesses, let's calculate the four values -At time $t$, agent $1$ receives an endowment +```{code-cell} ipython3 +# Close case +Kf_g, Kg_f = compute_KL(f_close, g_close), compute_KL(g_close, f_close) +Kf_h, Kg_h = compute_KL_h(h, f_close, g_close) -$$ -y_t^1 = s_t -$$ +print(f"KL divergences (close): \nKL(f,g)={Kf_g:.3f}, KL(g,f)={Kg_f:.3f}") +print(f"KL(h,f)={Kf_h:.3f}, KL(h,g)={Kg_h:.3f}") -of a nonstorable consumption good, while agent $2$ receives an endowment of +# Far case +Kf_g, Kg_f = compute_KL(f_far, g_far), compute_KL(g_far, f_far) +Kf_h, Kg_h = compute_KL_h(h, f_far, g_far) +print(f"KL divergences (far): \nKL(f,g)={Kf_g:.3f}, KL(g,f)={Kg_f:.3f}") +print(f"KL(h,f)={Kf_h:.3f}, KL(h,g)={Kg_h:.3f}") +``` -$$ -y_t^2 = 1 - s_t -$$ +We find that in the first case, $KL(f,g) \approx KL(g,f)$ and both are relatively small, so although either agent 1 or agent 2 will eventually consume everything, convergence displaying in first two panels on the top is pretty slowly. -The aggregate endowment of the consumption good is +In the first two panels at the bottom, we see convergence occurring faster (as indicated by the black dashed line) because the divergence gaps $KL(f, g)$ and $KL(g, f)$ are larger. -$$ -y_t^1 + y_t^2 = 1 -$$ +Since $KL(f,g) > KL(g,f)$, we see faster convergence in the first panel at the bottom when nature chooses $f$ than in the second panel where nature chooses $g$. -at each date $t \geq 0$. +This ties in nicely with {eq}`eq:kl_likelihood_link`. -At date $t$ agent $i$ consumes $c_t^i(s^t)$ of the good. +## Hypothesis Testing and Classification -A (non wasteful) feasible allocation of the aggregate endowment of $1$ each period satisfies +This section discusses another application of likelihood ratio processes. -$$ -c_t^1 + c_t^2 = 1 . -$$ +We describe how a statistician can combine frequentist probabilities of type I and type II errors in order to -### A social risk-sharing arrangement +* compute an anticipated frequency of selecting a wrong model based on a sample length $T$ +* compute an anticipated error rate in a classification problem -In order to share risks, a benevolent social planner will dictate a a history-dependent consumption allocation in the form of a sequence of functions +We consider a situation in which nature generates data by mixing known densities $f$ and $g$ with known mixing +parameter $\pi_{-1} \in (0,1)$ so that the random variable $w$ is drawn from the density $$ -c_t^i = c_t^i(s^t) +h (w) = \pi_{-1} f(w) + (1-\pi_{-1}) g(w) $$ -that satisfy - -$$ -c_t^1(s^t) + c_t^2(s^t) = 1 -$$ (eq:feasibility) +We assume that the statistician knows the densities $f$ and $g$ and also the mixing parameter $\pi_{-1}$. -for all $s^t$ for all $t \geq 0$. +Below, we'll set $\pi_{-1} = .5$, although much of the analysis would follow through with other settings of $\pi_{-1} \in (0,1)$. -To design a socially optimal allocation, the social planner wants to know what agents $1$ believe about the endowment sequence and how they feel about bearing risks. +We assume that $f$ and $g$ both put positive probabilities on the same intervals of possible realizations of the random variable $W$. -As for the endowment sequences, agent $i$ believes that nature draws i.i.d. sequences from joint densities + -$$ -\pi_t^i(s^t) = \pi(s_t)^i \pi^i(s_{t-1}) \cdots \pi^i(s_0) -$$ +In the simulations below, we specify that $f$ is a $\text{Beta}(1, 1)$ distribution and that $g$ is $\text{Beta}(3, 1.2)$ distribution. -As for attitudes toward bearing risks, agent $i$ has a one-period utility function +We consider two alternative timing protocols. -$$ -u(c_t^i) = u(c_t^i) = \ln (c_t^i) -$$ + * Timing protocol 1 is for the model selection problem + * Timing protocol 2 is for the individual classification problem -with marginal utility of consumption in period $i$ +**Timing Protocol 1:** Nature flips a coin only **once** at time $t=-1$ and with probability $\pi_{-1}$ generates a sequence $\{w_t\}_{t=1}^T$ +of IID draws from $f$ and with probability $1-\pi_{-1}$ generates a sequence $\{w_t\}_{t=1}^T$ +of IID draws from $g$. -$$ -u'(c_t^i) = \frac{1}{c_t^i} -$$ +Let's write some Python code that implements timing protocol 1. -Putting its beliefs about its random endowment sequence and its attitudes toward bearing risks together, agent $i$ has intertemporal utility function +```{code-cell} ipython3 +def protocol_1(π_minus_1, T, N=1000): + """ + Simulate Protocol 1: + Nature decides once at t=-1 which model to use. + """ + + # On-off coin flip for the true model + true_models_F = np.random.rand(N) < π_minus_1 + + sequences = np.empty((N, T)) + + n_f = np.sum(true_models_F) + n_g = N - n_f + if n_f > 0: + sequences[true_models_F, :] = np.random.beta(F_a, F_b, (n_f, T)) + if n_g > 0: + sequences[~true_models_F, :] = np.random.beta(G_a, G_b, (n_g, T)) + + return sequences, true_models_F +``` -$$ -V^i = \sum_{t=0}^{\infty} \sum_{s^t} \delta^t u(c_t^i(s^t)) \pi_t^i(s^t) , -$$ (eq:objectiveagenti) +**Timing Protocol 2.** Nature flips a coin **often**. At each time $t \geq 0$, nature flips a coin and with probability $\pi_{-1}$ draws $w_t$ from $f$ and with probability $1-\pi_{-1}$ draws $w_t$ from $g$. -where $\delta \in (0,1)$ is an intertemporal discount factor, and $u(\cdot)$ is a strictly increasing, concave one-period utility function. +Here is Python code that we'll use to implement timing protocol 2. +```{code-cell} ipython3 +def protocol_2(π_minus_1, T, N=1000): + """ + Simulate Protocol 2: + Nature decides at each time step which model to use. + """ + + # Coin flips for each time t upto T + true_models_F = np.random.rand(N, T) < π_minus_1 + + sequences = np.empty((N, T)) + + n_f = np.sum(true_models_F) + n_g = N * T - n_f + if n_f > 0: + sequences[true_models_F] = np.random.beta(F_a, F_b, n_f) + if n_g > 0: + sequences[~true_models_F] = np.random.beta(G_a, G_b, n_g) + + return sequences, true_models_F +``` -### The social planner's allocation problem +**Remark:** Under timing protocol 2, the $\{w_t\}_{t=1}^T$ is a sequence of IID draws from $h(w)$. Under timing protocol 1, the the $\{w_t\}_{t=1}^T$ is +not IID. It is **conditionally IID** -- meaning that with probability $\pi_{-1}$ it is a sequence of IID draws from $f(w)$ and with probability $1-\pi_{-1}$ it is a sequence of IID draws from $g(w)$. For more about this, see {doc}`this lecture about exchangeability `. -The benevolent dictator has all the information it requires to choose a consumption allocation that maximizes the social welfare criterion +We again deploy a **likelihood ratio process** with time $t$ component being the likelihood ratio $$ -W = \lambda V^1 + (1-\lambda) V^2 -$$ (eq:welfareW) +\ell (w_t)=\frac{f\left(w_t\right)}{g\left(w_t\right)},\quad t\geq1. +$$ -where $\lambda \in [0,1]$ is a Pareto weight tells how much the planner likes agent $1$ and $1 - \lambda$ is a Pareto weight that tells how much the socical planner likes agent $2$. +The **likelihood ratio process** for sequence $\left\{ w_{t}\right\} _{t=1}^{\infty}$ is -Setting $\lambda = .5$ expresses ''egalitarian'' social preferences. +$$ +L\left(w^{t}\right)=\prod_{i=1}^{t} \ell (w_i), +$$ -Notice how social welfare criterion {eq}`eq:welfareW` takes into account both agent's preferences as represented by formula {eq}`eq:objectiveagenti`. +For shorthand we'll write $L_t = L(w^t)$. -This means that the social planner knows and respects +### Model Selection Mistake Probability -* the one period utility function $u(\cdot) = \ln(\cdot)$ -* each agent $i$'s probability model $\{\pi_t^i(s^t)\}_{t=0}^\infty$ +We first study a problem that assumes timing protocol 1. -Consequently, we anticipate that these objects will appear in the social planner's rule for allocating the aggregate endowment each period. +Consider a decision maker who wants to know whether model $f$ or model $g$ governs a data set of length $T$ observations. +The decision makers has observed a sequence $\{w_t\}_{t=1}^T$. -First-order necessary conditions for maximizing welfare criterion {eq}`eq:welfareW` subject to the feasibility constraint {eq}`eq:feasibility` are +On the basis of that observed sequence, a likelihood ratio test selects model $f$ when + $L_T \geq 1 $ and model $g$ when $L_T < 1$. + +When model $f$ generates the data, the probability that the likelihood ratio test selects the wrong model is -$$\frac{\pi_t^2(s^t)}{\pi_t^1(s^t)} \frac{(1/c_t^2(s^t))}{(1/c_t^1(s^t))} = \frac{\lambda}{1 -\lambda}$$ +$$ +p_f = {\rm Prob}\left(L_T < 1\Big| f\right) = \alpha_T . +$$ -which can be rearranged to become +When model $g$ generates the data, the probability that the likelihood ratio test selects the wrong model is +$$ +p_g = {\rm Prob}\left(L_T \geq 1 \Big|g \right) = \beta_T. +$$ +We can construct a probability that the likelihood ratio selects the wrong model by assigning a Bayesian prior probability of $\pi_{-1} = .5$ that nature selects model $f$ then averaging $p_f$ and $p_g$ to form the Bayesian posterior probability of a detection error equal to +$$ +p(\textrm{wrong decision}) = {1 \over 2} (\alpha_T + \beta_T) . +$$ (eq:detectionerrorprob) -$$ -\frac{c_t^1(s^t)}{c_t^2(s^t)} = \frac{\lambda}{1- \lambda} l_t(s^t) -$$ (eq:allocationrule0) +Now let's simulate timing protocol 1 and compute the error probabilities +```{code-cell} ipython3 +# Set parameters +π_minus_1 = 0.5 +T_max = 30 +N_simulations = 10_000 -where +sequences_p1, true_models_p1 = protocol_1( + π_minus_1, T_max, N_simulations) +l_ratios_p1, L_cumulative_p1 = compute_likelihood_ratios(sequences_p1, f, g) -$$ l_t(s^t) = \frac{\pi_t^1(s^t)}{\pi_t^2(s^t)} $$ +# Compute error probabilities for different sample sizes +T_range = np.arange(1, T_max + 1) -is the likelihood ratio of agent 1's joint density to agent 2's joint density. +# Boolean masks for true models +mask_f = true_models_p1 +mask_g = ~true_models_p1 -Using +# Select cumulative likelihoods for each model +L_f = L_cumulative_p1[mask_f, :] +L_g = L_cumulative_p1[mask_g, :] -$$c_t^1(s^t) + c_t^2(s^t) = 1$$ +α_T = np.mean(L_f < 1, axis=0) +β_T = np.mean(L_g >= 1, axis=0) -we can rewrite allocation rule {eq}`eq:allocationrule0` as +error_prob = 0.5 * (α_T + β_T) +# Plot results +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5)) +ax1.plot(T_range, α_T, 'b-', + label=r'$\alpha_T$', linewidth=2) +ax1.plot(T_range, β_T, 'r-', + label=r'$\beta_T$', linewidth=2) +ax1.set_xlabel('$T$') +ax1.set_ylabel('error probability') +ax1.legend() -$$\frac{c_t^1(s^t)}{1 - c_t^1(s^t)} = \frac{\lambda}{1-\lambda} l_t(s^t)$$ +ax2.plot(T_range, error_prob, 'g-', + label=r'$\frac{1}{2}(\alpha_T+\beta_T)$', linewidth=2) +ax2.set_xlabel('$T$') +ax2.set_ylabel('error probability') +ax2.legend() -or +plt.tight_layout() +plt.show() -$$c_t^1(s^t) = \frac{\lambda}{1-\lambda} l_t(s^t)(1 - c_t^1(s^t))$$ +print(f"At T={T_max}:") +print(f"α_{T_max} = {α_T[-1]:.4f}") +print(f"β_{T_max} = {β_T[-1]:.4f}") +print(f"Model selection error probability = {error_prob[-1]:.4f}") +``` -which implies that the social planner's allocation rule is +Notice how the model selection error probability approaches zero as $T$ grows. -$$ -c_t^1(s^t) = \frac{\lambda l_t(s^t)}{1-\lambda + \lambda l_t(s^t)} -$$ (eq:allocationrule1) +### Classification -If we define a temporary or **continuation Pareto weight** process as +We now consider a problem that assumes timing protocol 2. -$$ -\lambda_t(s^t) = \frac{\lambda l_t(s^t)}{1-\lambda + \lambda l_t(s^t)}, -$$ +A decision maker wants to classify components of an observed sequence $\{w_t\}_{t=1}^T$ as having been drawn from either $f$ or $g$. -then we can represent the social planner's allocation rule as +The decision maker uses the following classification rule: $$ -c_t^1(s^t) = \lambda_t(s^t) . +\begin{aligned} +w_t & \ {\rm is \ from \ f \ if \ } l_t > 1 \\ +w_t & \ {\rm is \ from \ g \ if \ } l_t \leq 1 . +\end{aligned} $$ +Under this rule, the expected misclassification rate is +$$ +p(\textrm{misclassification}) = {1 \over 2} (\tilde \alpha_t + \tilde \beta_t) +$$ (eq:classerrorprob) +where $\tilde \alpha_t = {\rm Prob}(l_t < 1 \mid f)$ and $\tilde \beta_t = {\rm Prob}(l_t \geq 1 \mid g)$. -### If you're so smart, $\ldots$ - - -Let's compute some values of limiting allocations {eq}`eq:allocationrule1` for some interesting possible limiting -values of the likelihood ratio process $l_t(s^t)$: +Since for each $t$, the decision boundary is the same, the decision boundary can be computed as - $$l_\infty (s^\infty)= 1; \quad c_\infty^1 = \lambda$$ - - * In the above case, both agents are equally smart (or equally not smart) and the consumption allocation stays put at a $\lambda, 1 - \lambda $ split between the two agents. +```{code-cell} ipython3 +root = brentq(lambda w: f(w) / g(w) - 1, 0.001, 0.999) +``` -$$l_\infty (s^\infty) = 0; \quad c_\infty^1 = 0$$ +we can plot the distributions of $f$ and $g$ and the decision boundary -* In the above case, agent 2 is smarter than agent 1, and agent 1's share of the aggregate endowment converges to zero. +```{code-cell} ipython3 +:tags: [hide-input] +fig, ax = plt.subplots(figsize=(7, 6)) +w_range = np.linspace(1e-5, 1-1e-5, 1000) +f_values = [f(w) for w in w_range] +g_values = [g(w) for w in w_range] +ratio_values = [f(w)/g(w) for w in w_range] -$$l_\infty (s^\infty)= \infty; \quad c_\infty^1 = 1$$ +ax.plot(w_range, f_values, 'b-', + label=r'$f(w) \sim Beta(1,1)$', linewidth=2) +ax.plot(w_range, g_values, 'r-', + label=r'$g(w) \sim Beta(3,1.2)$', linewidth=2) -* In the above case, agent 1 is smarter than agent 2, and agent 1's share of the aggregate endowment converges to 1. +type1_prob = 1 - beta_dist.cdf(root, F_a, F_b) +type2_prob = beta_dist.cdf(root, G_a, G_b) +w_type1 = w_range[w_range >= root] +f_type1 = [f(w) for w in w_type1] +ax.fill_between(w_type1, 0, f_type1, alpha=0.3, color='blue', + label=fr'$\tilde \alpha_t = {type1_prob:.2f}$') -Soon we'll do some simulations that will shed further light on possible outcomes. +w_type2 = w_range[w_range <= root] +g_type2 = [g(w) for w in w_type2] +ax.fill_between(w_type2, 0, g_type2, alpha=0.3, color='red', + label=fr'$\tilde \beta_t = {type2_prob:.2f}$') -But before we do that, let's take a detour and study some "shadow prices" for the social planning problem that can readily be -converted to "equilibrium prices" for a competitive equilibrium. +ax.axvline(root, color='green', linestyle='--', alpha=0.7, + label=f'decision boundary: $w=${root:.3f}') -Doing this will allow us to connect our analysis with an argument of {cite}`alchian1950uncertainty` and {cite}`friedman1953essays` that competitive market processes can make prices of risky assets better reflect realistic probability assessments. +ax.set_xlabel('w') +ax.set_ylabel('probability density') +ax.legend() +plt.tight_layout() +plt.show() +``` +To the left of the green vertical line $g < f$, so $l_t < 1$; therefore a $w_t$ that falls to the left of the green line is classified as a type $g$ individual. -### Competitive Equilibrium Prices + * The shaded orange area equals $\beta$ -- the probability of classifying someone as a type $g$ individual when it is really a type $f$ individual. -The two fundamental welfare theorems for general equilibrium models lead us to anticipate that there is a connection between the allocation that solves the social planning problem we have been studying and the allocation in a **competitive equilibrium** with complete markets in history-contingent commodities. +To the right of the green vertical line $g > f$, so $l_t >1 $; therefore a $w_t$ that falls to the right of the green line is classified as a type $f$ individual. -```{note} -For the two welfare theorems and their history, see . -``` + * The shaded blue area equals $\alpha$ -- the probability of classifying someone as a type $f$ when it is really a type $g$ individual. -Such a connection prevails for our model. +This gives us clues about how to compute the theoretical classification error probability -We'll sketch it now. +```{code-cell} ipython3 +# Compute theoretical tilde α_t and tilde β_t +def α_integrand(w): + """Integrand for tilde α_t = P(l_t < 1 | f)""" + return f(w) if f(w) / g(w) < 1 else 0 -In a competitive equilibrium, there is no social planner that dictatorially collects everybody's endowments and then reallocates them. +def β_integrand(w): + """Integrand for tilde β_t = P(l_t >= 1 | g)""" + return g(w) if f(w) / g(w) >= 1 else 0 -Instead, there is a comprehensive centralized market that meets at one point in time. +# Compute the integrals +α_theory, _ = quad(α_integrand, 0, 1, limit=100) +β_theory, _ = quad(β_integrand, 0, 1, limit=100) -There are **prices** at which price-taking agents can buy or sell whatever goods that they want. +theory_error = 0.5 * (α_theory + β_theory) -Trade is multilateral in the sense that all that there is a "Walrasian auctioneer" who lives outside the model and whose job is to verify that -each agent's budget constraint is satisfied. +print(f"theoretical tilde α_t = {α_theory:.4f}") +print(f"theoretical tilde β_t = {β_theory:.4f}") +print(f"theoretical classification error probability = {theory_error:.4f}") +``` -That budget constraint involves the total value of the agent's endowment stream and the total value of its consumption stream. +Now we simulate timing protocol 2 and compute the classification error probability. -Suppose that at time $-1$, before time $0$ starts, agent $i$ can purchase one unit $c_t(s^t)$ of consumption at time $t$ after history -$s^t$ at price $p_t(s^t)$. +In the next cell, we also compare the theoretical classification accuracy to the empirical classification accuracy -Notice that there is (very long) **vector** of prices. +```{code-cell} ipython3 +accuracy = np.empty(T_max) -We want to study how agents' diverse beliefs influence equilibrium prices. +sequences_p2, true_sources_p2 = protocol_2( + π_minus_1, T_max, N_simulations) +l_ratios_p2, _ = compute_likelihood_ratios(sequences_p2, f, g) -Agent $i$ faces a **single** intertemporal budget constraint +for t in range(T_max): + predictions = (l_ratios_p2[:, t] >= 1) + actual = true_sources_p2[:, t] + accuracy[t] = np.mean(predictions == actual) -$$ -\sum_{t=0}\sum_{s^t} p_t(s^t) c_t^i (y_t(s^t)) \leq \sum_{t=0}\sum_{s^t} p_t(s^t) y_t^i (y_t(s^t)) -$$ (eq:budgetI) +plt.figure(figsize=(10, 6)) +plt.plot(range(1, T_max + 1), accuracy, + 'b-', linewidth=2, label='empirical accuracy') +plt.axhline(1 - theory_error, color='r', linestyle='--', + label=f'theoretical accuracy = {1 - theory_error:.4f}') +plt.xlabel('$t$') +plt.ylabel('accuracy') +plt.legend() +plt.ylim(0.5, 1.0) +plt.show() +``` -Agent $i$ puts a Lagrange multiplier $\mu^i$ on {eq}`eq:budgetI` and once-and-for-all chooses a consumption plan $\{c^i_t(s^t)\}_{t=0}^\infty$ -to maximize criterion {eq}`eq:objectiveagenti` subject to budget constraint {eq}`eq:budgetI`. +Let's watch decisions made by the two timing protocols as more and more observations accrue. -```{note} -For convenience, let's remind ourselves of criterion {eq}`eq:objectiveagenti`: -$ -V^i = \sum_{t=0}^{\infty} \sum_{s^t} \delta^t u_t(c_t^i(s^t)) \pi_t^i(s^t)$ +```{code-cell} ipython3 +fig, ax = plt.subplots(figsize=(7, 6)) + +ax.plot(T_range, error_prob, linewidth=2, + label='Protocol 1') +ax.plot(T_range, 1-accuracy, linestyle='--', linewidth=2, + label=f'Protocol 2') +ax.set_ylabel('error probability') +ax.legend() +plt.show() ``` -First-order conditions for maximizing with respect to $c_t^i(s^t)$ are +From the figure above, we can see: -$$ -\delta^t u'(c^i(s^t)) \pi_t^i(s^t) = \mu_i p_t(s^t) , -$$ +- For both timing protocols, the error probability starts at the same level, subject to a little randomness. -which we can rearrange to obtain +- For timing protocol 1, the error probability decreases as the sample size increases because we are making just **one** decision -- i.e., selecting whether $f$ or $g$ governs **all** individuals. More data provides better evidence. -$$ -p_t(s^t) = \frac{ \delta^t \pi_t^i(s^t)}{\mu^i c^i(s^t)} -$$ (eq:priceequation1) +- For timing protocol 2, the error probability remains constant because we are making **many** decisions -- one classification decision for each observation. -for $i=1,2$. +**Remark:** Think about how laws of large numbers are applied to compute error probabilities for the model selection problem and the classification problem. -If we divide equation {eq}`eq:priceequation1` for agent $1$ by the appropriate version of equation {eq}`eq:priceequation1` for agent 2, use -$c^2_t(s^t) = 1 - c^1_t(s^t)$, and do some algebra, we'll obtain +## Measuring discrepancies between distributions -$$ -c_t^1(s^t) = \frac{\mu_1 l_t(s^t)}{\mu_2 + \mu_1 l_t(s^t)} . -$$ (eq:allocationce) +A plausible guess is that the ability of a likelihood ratio to distinguish distributions $f$ and $g$ depends on how "different" they are. + +But how should we measure discrepancies between distributions? -We now engage in an extended "guess-and-verify" exercise that involves matching objects in our competitive equilibrium with objects in -our social planning problem. +We've already encountered one discrepancy measure -- the Kullback-Leibler (KL) divergence. -* we'll match consumption allocations in the planning problem with equilibrium consumption allocations in the competitive equilibrium -* we'll match "shadow" prices in the planning problem with competitive equilibrium prices. +We now briefly explore two alternative discrepancy measures. -Notice that if we set $\mu_1 = \lambda$ and $\mu_2 = 1 -\lambda$, then formula {eq}`eq:allocationce` agrees with formula -{eq}`eq:allocationrule1`. +### Chernoff entropy - * doing this amounts to choosing a **numeraire** or normalization for the price system $\{p_t(s^t)\}_{t=0}^\infty$ +Chernoff entropy was motivated by an early application of the [theory of large deviations](https://en.wikipedia.org/wiki/Large_deviations_theory). ```{note} -For information about how a numeraire must be chosen to pin down the absolute price level in a model like ours that determines only -relative prices, see . +Large deviation theory provides refinements of the central limit theorem. ``` -If we substitute formula {eq}`eq:allocationce` for $c_t^1(s^t)$ into formula {eq}`eq:priceequation1` and rearrange, we obtain +The Chernoff entropy between probability densities $f$ and $g$ is defined as: $$ -p_t(s^t) = \frac{\delta^t \pi_t^2(s^t)}{1 - \lambda + \lambda l_t(s^t)} -$$ (eq:pformulafinal) +C(f,g) = - \log \min_{\phi \in (0,1)} \int f^\phi(x) g^{1-\phi}(x) dx +$$ -According to formula {eq}`eq:pformulafinal`, we have the following possible limiting cases: +An upper bound on model selection error probabilty is -* when $l_\infty = 0$, $c_\infty^2 = 0 $ and tails of competitive equilibrium prices reflect agent $2$'s probability model $\pi_t^2(s^t)$ -* when $l_\infty = 1$, $c_\infty^1 = 0 $ and tails competitive equilibrium prices reflect agent $1$'s probability model $\pi_t^2(s^t)$ -* for small $t$'s, competitive equilbrium prices reflect both agents' probability models. +$$ +e^{-C(f,g)T} . +$$ -### Simulations +Thus, Chernoff entropy is an upper bound on the exponential rate at which the selection error probability falls as sample size $T$ grows. -Now let's implement some simulations when agent $1$ believes marginal density +Let's compute Chernoff entropy numerically with some Python code -$$\pi^1(s_t) = f(s_t) $$ +```{code-cell} ipython3 +def chernoff_integrand(ϕ, f, g): + """ + Compute the integrand for Chernoff entropy + """ + def integrand(w): + return f(w)**ϕ * g(w)**(1-ϕ) -and agent $2$ believes marginal density + result, _ = quad(integrand, 1e-5, 1-1e-5) + return result -$$ \pi^2(s_t) = g(s_t) $$ +def compute_chernoff_entropy(f, g): + """ + Compute Chernoff entropy C(f,g) + """ + def objective(ϕ): + return chernoff_integrand(ϕ, f, g) + + # Find the minimum over ϕ in (0,1) + result = minimize_scalar(objective, + # For numerical stability + bounds=(1e-5, 1-1e-5), + method='bounded') + min_value = result.fun + ϕ_optimal = result.x + + chernoff_entropy = -np.log(min_value) + return chernoff_entropy, ϕ_optimal -where $f$ and $g$ are Beta distributions like ones that we used in earlier sections of this lecture. +C_fg, ϕ_optimal = compute_chernoff_entropy(f, g) +print(f"Chernoff entropy C(f,g) = {C_fg:.4f}") +print(f"Optimal ϕ = {ϕ_optimal:.4f}") +``` -Meanwhile, we'll assume that nature believes a marginal density +Now let's examine how $e^{-C(f,g)T}$ behaves as a function of $T$ and compare it to the model selection error probability -$$ -\pi(s_t) = h(s_t) -$$ +```{code-cell} ipython3 +T_range = np.arange(1, T_max+1) +chernoff_bound = np.exp(-C_fg * T_range) -where $h(s_t)$ is perhaps a mixture of $f$ and $g$. +# Plot comparison +fig, ax = plt.subplots(figsize=(10, 6)) -Let's write a Python function that computes agent 1's consumption share +ax.semilogy(T_range, chernoff_bound, 'r-', linewidth=2, + label=f'$e^{{-C(f,g)T}}$') +ax.semilogy(T_range, error_prob, 'b-', linewidth=2, + label='Model selection error probability') -```{code-cell} ipython3 -def simulate_blume_easley(sequences, f_belief=f, g_belief=g, λ=0.5): - """Simulate Blume-Easley model consumption shares.""" - l_ratios, l_cumulative = compute_likelihood_ratio_stats(sequences, f_belief, g_belief) - c1_share = λ * l_cumulative / (1 - λ + λ * l_cumulative) - return l_cumulative, c1_share +ax.set_xlabel('T') +ax.set_ylabel('error probability (log scale)') +ax.legend() +plt.tight_layout() +plt.show() ``` -Now let's use this function to generate sequences in which - -* nature draws from $f$ each period, or -* nature draws from $g$ each period, or -* or nature flips a fair coin each period to decide whether to draw from $f$ or $g$ +Evidently, $e^{-C(f,g)T}$ is an upper bound on the error rate. -```{code-cell} ipython3 -λ = 0.5 -T = 100 -N = 10000 +### Jensen-Shannon divergence -# Nature follows f, g, or mixture -s_seq_f = np.random.beta(F_a, F_b, (N, T)) -s_seq_g = np.random.beta(G_a, G_b, (N, T)) +The [Jensen-Shannon divergence](https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence) is another divergence measure. -h = jit(lambda x: 0.5 * f(x) + 0.5 * g(x)) -model_choices = np.random.rand(N, T) < 0.5 -s_seq_h = np.empty((N, T)) -s_seq_h[model_choices] = np.random.beta(F_a, F_b, size=model_choices.sum()) -s_seq_h[~model_choices] = np.random.beta(G_a, G_b, size=(~model_choices).sum()) +For probability densities $f$ and $g$, the **Jensen-Shannon divergence** is defined as: -l_cum_f, c1_f = simulate_blume_easley(s_seq_f) -l_cum_g, c1_g = simulate_blume_easley(s_seq_g) -l_cum_h, c1_h = simulate_blume_easley(s_seq_h) -``` +$$ +D(f,g) = \frac{1}{2} KL(f, m) + \frac{1}{2} KL(g, m) +$$ (eq:compute_JS) -Before looking at the figure below, have some fun by guessing whether agent 1 or agent 2 will have a larger and larger consumption share as time passes in our three cases. +where $m = \frac{1}{2}(f+g)$ is a mixture of $f$ and $g$. -To make better guesses, let's visualize instances of the likelihood ratio processes in the three cases. +Below we compute Jensen-Shannon divergence numerically with some Python code ```{code-cell} ipython3 -fig, axes = plt.subplots(2, 3, figsize=(15, 10)) - -titles = ["Nature = f", "Nature = g", "Nature = mixture"] -data_pairs = [(l_cum_f, c1_f), (l_cum_g, c1_g), (l_cum_h, c1_h)] - -for i, ((l_cum, c1), title) in enumerate(zip(data_pairs, titles)): - # Likelihood ratios - ax = axes[0, i] - for j in range(min(50, l_cum.shape[0])): - ax.plot(l_cum[j, :], alpha=0.3, color='blue') - ax.set_yscale('log') - ax.set_xlabel('time') - ax.set_ylabel('Likelihood ratio $l_t$') - ax.set_title(title) - ax.axhline(y=1, color='red', linestyle='--', alpha=0.5) +def compute_JS(f, g): + """ + Compute Jensen-Shannon divergence + """ + def m(w): + return 0.5 * (f(w) + g(w)) + + js_div = 0.5 * compute_KL(f, m) + 0.5 * compute_KL(g, m) + return js_div +``` - # Consumption shares - ax = axes[1, i] - for j in range(min(50, c1.shape[0])): - ax.plot(c1[j, :], alpha=0.3, color='green') - ax.set_xlabel('time') - ax.set_ylabel("Agent 1's consumption share") - ax.set_ylim([0, 1]) - ax.axhline(y=λ, color='red', linestyle='--', alpha=0.5) + +```{note} +We studied KL divergence in the [section above](rel_entropy) with respect to a reference distribution $h$. -plt.tight_layout() -plt.show() -``` +Recall that KL divergence $KL(f, g)$ measures expected excess surprisal from using misspecified model $g$ instead $f$ when $f$ is the true model. -In the left panel, nature chooses $f$. Agent 1's consumption reaches $1$ very quickly. +Because in general $KL(f, g) \neq KL(g, f)$, KL divergence is not symmetric, but Jensen-Shannon divergence is symmetric. -In the middle panel, nature chooses $g$. Agent 1's consumption ratio tends to move towards $0$ but not as fast as in the first case. +(In fact, the square root of the Jensen-Shannon divergence is a metric referred to as the Jensen-Shannon distance.) -In the right panel, nature flips coins each period. We see a very similar pattern to the processes in the left panel. +As {eq}`eq:compute_JS` shows, the Jensen-Shannon divergence computes average of the KL divergence of $f$ and $g$ with respect to a particular reference distribution $m$ defined below the equation. +``` -The figures in the top panel remind us of the discussion in [this section](rel_entropy). +Now let's create a comparison table showing KL divergence, Jensen-Shannon divergence, and Chernoff entropy for a set of pairs of Beta distributions. -We invite readers to revisit [that section](rel_entropy) and try to infer the relationships among $KL(f, g)$, $KL(g, f)$, $KL(h, f)$, and $KL(h,g)$. +```{code-cell} ipython3 +:tags: [hide-input] +distribution_pairs = [ + # (f_params, g_params) + ((1, 1), (0.1, 0.2)), + ((1, 1), (0.3, 0.3)), + ((1, 1), (0.3, 0.4)), + ((1, 1), (0.5, 0.5)), + ((1, 1), (0.7, 0.6)), + ((1, 1), (0.9, 0.8)), + ((1, 1), (1.1, 1.05)), + ((1, 1), (1.2, 1.1)), + ((1, 1), (1.5, 1.2)), + ((1, 1), (2, 1.5)), + ((1, 1), (2.5, 1.8)), + ((1, 1), (3, 1.2)), + ((1, 1), (4, 1)), + ((1, 1), (5, 1)) +] -Let's compute values of KL divergence +# Create comparison table +results = [] +for i, ((f_a, f_b), (g_a, g_b)) in enumerate(distribution_pairs): + # Define the density functions + f = jit(lambda x, a=f_a, b=f_b: p(x, a, b)) + g = jit(lambda x, a=g_a, b=g_b: p(x, a, b)) + + # Compute measures + kl_fg = compute_KL(f, g) + kl_gf = compute_KL(g, f) + js_div = compute_JS(f, g) + chernoff_ent, _ = compute_chernoff_entropy(f, g) + + results.append({ + 'Pair (f, g)': f"\\text{{Beta}}({f_a},{f_b}), \\text{{Beta}}({g_a},{g_b})", + 'KL(f, g)': f"{kl_fg:.4f}", + 'KL(g, f)': f"{kl_gf:.4f}", + 'JS': f"{js_div:.4f}", + 'C': f"{chernoff_ent:.4f}" + }) -```{code-cell} ipython3 -shares = [np.mean(c1_f[:, -1]), np.mean(c1_g[:, -1]), np.mean(c1_h[:, -1])] -Kf_g, Kg_f = kl_divergence(f, g), kl_divergence(g, f) -Kf_h, Kg_h = compute_KL(h, f, g) +df = pd.DataFrame(results) -print(f"Final shares: f={shares[0]:.3f}, g={shares[1]:.3f}, mix={shares[2]:.3f}") -print(f"KL divergences: \nKL(f,g)={Kf_g:.3f}, KL(g,f)={Kg_f:.3f}") -print(f"KL(h,f)={Kf_h:.3f}, KL(h,g)={Kg_h:.3f}") +# Sort by JS divergence +df['JS_numeric'] = df['JS'].astype(float) +df = df.sort_values('JS_numeric').drop('JS_numeric', axis=1) + +# Generate LaTeX table manually +columns = ' & '.join([f'\\text{{{col}}}' for col in df.columns]) +rows = ' \\\\\n'.join( + [' & '.join([f'{val}' for val in row]) + for row in df.values]) + +latex_code = rf""" +\begin{{array}}{{lcccc}} +{columns} \\ +\hline +{rows} +\end{{array}} +""" + +display(Math(latex_code)) ``` -We find that $KL(f,g) > KL(g,f)$ and $KL(h,g) > KL(h,f)$. +The above table indicates how Jensen-Shannon divergence, and Chernoff entropy, and KL divergence covary as we alter $f$ and $g$. -The first inequality tells us that the average "surprise" or "inefficiency" of using belief $g$ when nature chooses $f$ is greater than the "surprise" of using belief $f$ when nature chooses $g$. +Let's also visualize how these diverge measures covary -The second inequality tells us that agent 1's belief distribution $f$ is closer to nature's pick than agent 2's belief $g$. +```{code-cell} ipython3 +kl_fg_values = [float(result['KL(f, g)']) for result in results] +js_values = [float(result['JS']) for result in results] +chernoff_values = [float(result['C']) for result in results] -+++ +fig, axes = plt.subplots(1, 2, figsize=(12, 5)) -To make this idea more concrete, let's compare two cases: +# JS divergence and KL divergence +axes[0].scatter(kl_fg_values, js_values, alpha=0.7, s=60) +axes[0].set_xlabel('KL divergence KL(f, g)') +axes[0].set_ylabel('JS divergence') +axes[0].set_title('JS divergence and KL divergence') -- agent 1's belief distribution $f$ is close to agent 2's belief distribution $g$; -- agent 1's belief distribution $f$ is far from agent 2's belief distribution $g$. +# Chernoff Entropy and JS divergence +axes[1].scatter(js_values, chernoff_values, alpha=0.7, s=60) +axes[1].set_xlabel('JS divergence') +axes[1].set_ylabel('Chernoff entropy') +axes[1].set_title('Chernoff entropy and JS divergence') +plt.tight_layout() +plt.show() +``` -We use the two distributions visualized below +To make the comparison more concrete, let's plot the distributions and the divergence measures for a few pairs of distributions. + +Note that the numbers on the title changes with the area of the overlaps of two distributions ```{code-cell} ipython3 -def plot_distribution_overlap(ax, x_range, f_vals, g_vals, - f_label='f', g_label='g', - f_color='blue', g_color='red'): - """Plot two distributions with their overlap region.""" - ax.plot(x_range, f_vals, color=f_color, linewidth=2, label=f_label) - ax.plot(x_range, g_vals, color=g_color, linewidth=2, label=g_label) +:tags: [hide-input] + +def plot_dist_diff(): + """ + Plot overlap of two distributions and divergence measures + """ - overlap = np.minimum(f_vals, g_vals) - ax.fill_between(x_range, 0, overlap, alpha=0.3, color='purple', label='Overlap') - ax.set_xlabel('x') - ax.set_ylabel('Density') - ax.legend() + # Chose a subset of Beta distribution parameters + param_grid = [ + ((1, 1), (1, 1)), + ((1, 1), (1.5, 1.2)), + ((1, 1), (2, 1.5)), + ((1, 1), (3, 1.2)), + ((1, 1), (5, 1)), + ((1, 1), (0.3, 0.3)) + ] -# Define close and far belief distributions -f_close = jit(lambda x: p(x, 1, 1)) -g_close = jit(lambda x: p(x, 1.1, 1.05)) - -f_far = jit(lambda x: p(x, 1, 1)) -g_far = jit(lambda x: p(x, 3, 1.2)) + fig, axes = plt.subplots(3, 2, figsize=(15, 12)) + + divergence_data = [] + + for i, ((f_a, f_b), (g_a, g_b)) in enumerate(param_grid): + row = i // 2 + col = i % 2 + + # Create density functions + f = jit(lambda x, a=f_a, b=f_b: p(x, a, b)) + g = jit(lambda x, a=g_a, b=g_b: p(x, a, b)) + + # Compute divergence measures + kl_fg = compute_KL(f, g) + js_div = compute_JS(f, g) + chernoff_ent, _ = compute_chernoff_entropy(f, g) + + divergence_data.append({ + 'f_params': (f_a, f_b), + 'g_params': (g_a, g_b), + 'kl_fg': kl_fg, + 'js_div': js_div, + 'chernoff': chernoff_ent + }) + + # Plot distributions + x_range = np.linspace(0, 1, 200) + f_vals = [f(x) for x in x_range] + g_vals = [g(x) for x in x_range] + + axes[row, col].plot(x_range, f_vals, 'b-', linewidth=2, + label=f'f ~ Beta({f_a},{f_b})') + axes[row, col].plot(x_range, g_vals, 'r-', linewidth=2, + label=f'g ~ Beta({g_a},{g_b})') + + # Fill overlap region + overlap = np.minimum(f_vals, g_vals) + axes[row, col].fill_between(x_range, 0, overlap, alpha=0.3, + color='purple', label='overlap') + + # Add divergence information + axes[row, col].set_title( + f'KL(f, g)={kl_fg:.3f}, JS={js_div:.3f}, C={chernoff_ent:.3f}', + fontsize=12) + axes[row, col].legend(fontsize=14) + + plt.tight_layout() + plt.show() + + return divergence_data -js_close = js_divergence(f_close, g_close) -js_far = js_divergence(f_far, g_far) +divergence_data = plot_dist_diff() +``` -# Visualize the belief distributions -fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5)) +### Error probability and divergence measures -x_range = np.linspace(0.001, 0.999, 200) +Now let's return to our guess that the error probability at large sample sizes is related to the Chernoff entropy between two distributions. -# Close beliefs -f_close_vals = [f_close(x) for x in x_range] -g_close_vals = [g_close(x) for x in x_range] -plot_distribution_overlap(ax1, x_range, f_close_vals, g_close_vals, - f_label='f (Beta(1, 1))', g_label='g (Beta(1.1, 1.05))') -ax1.set_title(f'Close Beliefs\nJS divergence={js_close:.4f}') +We verify this by computing the correlation between the log of the error probability at $T=50$ under Timing Protocol 1 and the divergence measures. -# Far beliefs -f_far_vals = [f_far(x) for x in x_range] -g_far_vals = [g_far(x) for x in x_range] -plot_distribution_overlap(ax2, x_range, f_far_vals, g_far_vals, - f_label='f (Beta(1, 1))', g_label='g (Beta(3, 1.2))') -ax2.set_title(f'Far Beliefs\nJS divergence={js_far:.4f}') +In the simulation below, nature draws $N / 2$ sequences from $g$ and $N/2$ sequences from $f$. -plt.tight_layout() -plt.show() +```{note} +Nature does this rather than flipping a fair coin to decide whether to draw from $g$ or $f$ once and for all before each simulation of length $T$. ``` -Let's draw the same consumption ratio plots as above for agent 1. - -We replace the simulation paths with median and percentiles to make the figure cleaner. - -Staring at the figure below, can we infer the relation between $KL(f,g)$ and $KL(g,f)$? +```{code-cell} ipython3 +# Parameters for simulation +T_large = 50 +N_sims = 5000 +N_half = N_sims // 2 + +# Initialize arrays +n_pairs = len(distribution_pairs) +kl_fg_vals = np.zeros(n_pairs) +kl_gf_vals = np.zeros(n_pairs) +js_vals = np.zeros(n_pairs) +chernoff_vals = np.zeros(n_pairs) +error_probs = np.zeros(n_pairs) +pair_names = [] -From the right panel, can we infer the relation between $KL(h,g)$ and $KL(h,f)$? +for i, ((f_a, f_b), (g_a, g_b)) in enumerate(distribution_pairs): + # Create density functions + f = jit(lambda x, a=f_a, b=f_b: p(x, a, b)) + g = jit(lambda x, a=g_a, b=g_b: p(x, a, b)) -```{code-cell} ipython3 -fig, axes = plt.subplots(2, 3, figsize=(15, 10)) -nature_params = {'close': [(1, 1), (1.1, 1.05), (2, 1.5)], - 'far': [(1, 1), (3, 1.2), (2, 1.5)]} -nature_labels = ["Nature = f", "Nature = g", "Nature = h"] -colors = {'close': 'blue', 'far': 'red'} + # Compute divergence measures + kl_fg_vals[i] = compute_KL(f, g) + kl_gf_vals[i] = compute_KL(g, f) + js_vals[i] = compute_JS(f, g) + chernoff_vals[i], _ = compute_chernoff_entropy(f, g) -threshold = 1e-5 # “close to zero” cutoff + # Generate samples + sequences_f = np.random.beta(f_a, f_b, (N_half, T_large)) + sequences_g = np.random.beta(g_a, g_b, (N_half, T_large)) -for row, (f_belief, g_belief, label) in enumerate([ - (f_close, g_close, 'close'), - (f_far, g_far, 'far')]): + # Compute likelihood ratios and cumulative products + _, L_cumulative_f = compute_likelihood_ratios(sequences_f, f, g) + _, L_cumulative_g = compute_likelihood_ratios(sequences_g, f, g) - for col, nature_label in enumerate(nature_labels): - params = nature_params[label][col] - s_seq = np.random.beta(params[0], params[1], (1000, 200)) - _, c1 = simulate_blume_easley(s_seq, f_belief, g_belief, λ) - - median_c1 = np.median(c1, axis=0) - p10, p90 = np.percentile(c1, [10, 90], axis=0) - - ax = axes[row, col] - color = colors[label] - ax.plot(median_c1, color=color, linewidth=2, label='Median') - ax.fill_between(range(len(median_c1)), p10, p90, alpha=0.3, color=color, label='10–90%') - ax.set_xlabel('time') - ax.set_ylabel("Agent 1's share") - ax.set_ylim([0, 1]) - ax.set_title(nature_label) - ax.axhline(y=λ, color='gray', linestyle='--', alpha=0.5) - below = np.where(median_c1 < threshold)[0] - above = np.where(median_c1 > 1-threshold)[0] - if below.size > 0: first_zero = (below[0], True) - elif above.size > 0: first_zero = (above[0], False) - else: first_zero = None - if first_zero is not None: - ax.axvline(x=first_zero[0], color='black', linestyle='--', - alpha=0.7, - label=fr'Median $\leq$ {threshold}' if first_zero[1] - else fr'Median $\geq$ 1-{threshold}') - ax.legend() - -plt.tight_layout() -plt.show() + # Get final values + L_cumulative_f = L_cumulative_f[:, -1] + L_cumulative_g = L_cumulative_g[:, -1] + + # Calculate error probabilities + error_probs[i] = 0.5 * (np.mean(L_cumulative_f < 1) + + np.mean(L_cumulative_g >= 1)) + pair_names.append(f"Beta({f_a},{f_b}) and Beta({g_a},{g_b})") + +cor_data = { + 'kl_fg': kl_fg_vals, + 'kl_gf': kl_gf_vals, + 'js': js_vals, + 'chernoff': chernoff_vals, + 'error_prob': error_probs, + 'names': pair_names, + 'T': T_large} ``` -Holding to our guesses, let's calculate the four values +Now let's visualize the correlations ```{code-cell} ipython3 -# Close case -Kf_g, Kg_f = kl_divergence(f_close, g_close), kl_divergence(g_close, f_close) -Kf_h, Kg_h = compute_KL(h, f_close, g_close) - -print(f"KL divergences (close): \nKL(f,g)={Kf_g:.3f}, KL(g,f)={Kg_f:.3f}") -print(f"KL(h,f)={Kf_h:.3f}, KL(h,g)={Kg_h:.3f}") - -# Far case -Kf_g, Kg_f = kl_divergence(f_far, g_far), kl_divergence(g_far, f_far) -Kf_h, Kg_h = compute_KL(h, f_far, g_far) - -print(f"KL divergences (far): \nKL(f,g)={Kf_g:.3f}, KL(g,f)={Kg_f:.3f}") -print(f"KL(h,f)={Kf_h:.3f}, KL(h,g)={Kg_h:.3f}") -``` +:tags: [hide-input] -We find that in the first case, $KL(f,g) \approx KL(g,f)$ and both are relatively small, so although either agent 1 or agent 2 will eventually consume everything, convergence displaying in first two panels on the top is pretty slowly. +def plot_error_divergence(data): + """ + Plot correlations between error probability and divergence measures. + """ + # Filter out near-zero error probabilities for log scale + nonzero_mask = data['error_prob'] > 1e-6 + log_error = np.log(data['error_prob'][nonzero_mask]) + js_vals = data['js'][nonzero_mask] + chernoff_vals = data['chernoff'][nonzero_mask] -In the first two panels at the bottom, we see convergence occurring faster (as indicated by the black dashed line) because the divergence gaps $KL(f, g)$ and $KL(g, f)$ are larger. + # Create figure and axes + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5)) + + # function for plotting correlation + def plot_correlation(ax, x_vals, x_label, color): + ax.scatter(x_vals, log_error, alpha=0.7, s=60, color=color) + ax.set_xlabel(x_label) + ax.set_ylabel(f'log(Error probability) at T={data["T"]}') + + # Calculate correlation and trend line + corr = np.corrcoef(x_vals, log_error)[0, 1] + z = np.polyfit(x_vals, log_error, 2) + x_trend = np.linspace(x_vals.min(), x_vals.max(), 100) + ax.plot(x_trend, np.poly1d(z)(x_trend), + "r--", alpha=0.8, linewidth=2) + ax.set_title(f'Log error probability and {x_label}\n' + f'Correlation = {corr:.3f}') + + # Plot both correlations + plot_correlation(ax1, js_vals, 'JS divergence', 'C0') + plot_correlation(ax2, chernoff_vals, 'Chernoff entropy', 'C1') -Since $KL(f,g) > KL(g,f)$, we see faster convergence in the first panel at the bottom when nature chooses $f$ than in the second panel where nature chooses $g$. + plt.tight_layout() + plt.show() -This ties in nicely with {eq}`eq:kl_likelihood_link`. +plot_error_divergence(cor_data) +``` +Evidently, Chernoff entropy and Jensen-Shannon entropy each covary tightly with the model selection error probability. -+++ +We'll encounter related ideas in {doc}`wald_friedman` very soon. ## Related Lectures diff --git a/lectures/lln_clt.md b/lectures/lln_clt.md index b306a7fb5..0dc910dfd 100644 --- a/lectures/lln_clt.md +++ b/lectures/lln_clt.md @@ -262,7 +262,7 @@ for ax in axes: # Plot ax.plot(list(range(n)), data, 'o', color='grey', alpha=0.5) - axlabel = '$\\bar{X}_n$ for $X_i \sim$' + name + axlabel = r'$\bar{X}_n$ for $X_i \sim$' + name ax.plot(list(range(n)), sample_mean, 'g-', lw=3, alpha=0.6, label=axlabel) m = distribution.mean() ax.plot(list(range(n)), [m] * n, 'k--', lw=1.5, label=r'$\mu$') @@ -699,7 +699,7 @@ xmax = -xmin ax.set_xlim(xmin, xmax) ax.hist(error_obs, bins=60, alpha=0.5, density=True) xgrid = np.linspace(xmin, xmax, 200) -lb = "$N(0, g'(\mu)^2 \sigma^2)$" +lb = r"$N(0, g'(\mu)^2 \sigma^2)$" ax.plot(xgrid, norm.pdf(xgrid, scale=asymptotic_sd), 'k-', lw=2, label=lb) ax.legend() plt.show() diff --git a/lectures/mix_model.md b/lectures/mix_model.md index 3e324c18f..0cb249cbc 100644 --- a/lectures/mix_model.md +++ b/lectures/mix_model.md @@ -1,10 +1,10 @@ --- jupytext: text_representation: - extension: .myst + extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.13.8 + jupytext_version: 1.17.2 kernelspec: display_name: Python 3 (ipykernel) language: python @@ -17,8 +17,8 @@ kernelspec: ```{include} _admonition/gpu.md ``` -```{code-cell} ipython -:tags: [hide-output] +```{code-cell} ipython3 +:tags: [no-execute] !pip install numpyro jax ``` @@ -459,7 +459,7 @@ def plot_π_seq(α, π1=0.2, π2=0.8, T=200): ax1.plot(range(T+1), π_seq_mixed[i, :], label=rf"$\pi_0$={π_seq_mixed[i, 0]}") ax1.plot(np.nan, np.nan, '--', color='b', label='Log likelihood ratio process') - ax1.set_ylabel("$\pi_t$") + ax1.set_ylabel(r"$\pi_t$") ax1.set_xlabel("t") ax1.legend() ax1.set_title("when $\\alpha G + (1-\\alpha)$ F governs data") diff --git a/lectures/navy_captain.md b/lectures/navy_captain.md index fb507abeb..56431f556 100644 --- a/lectures/navy_captain.md +++ b/lectures/navy_captain.md @@ -447,13 +447,13 @@ fig, axs = plt.subplots(1, 2, figsize=(14, 5)) axs[0].plot(π_star_arr, t_optimal_arr) axs[0].set_xlabel(r'$\pi^*$') -axs[0].set_title('optimal sample size given $\pi^*$') +axs[0].set_title(r'optimal sample size given $\pi^*$') -axs[1].plot(π_star_arr, PFA_optimal_arr, label='$PFA^*(\pi^*)$') -axs[1].plot(π_star_arr, PD_optimal_arr, label='$PD^*(\pi^*)$') +axs[1].plot(π_star_arr, PFA_optimal_arr, label=r'$PFA^*(\pi^*)$') +axs[1].plot(π_star_arr, PD_optimal_arr, label=r'$PD^*(\pi^*)$') axs[1].set_xlabel(r'$\pi^*$') axs[1].legend() -axs[1].set_title('optimal PFA and PD given $\pi^*$') +axs[1].set_title(r'optimal PFA and PD given $\pi^*$') plt.show() ``` @@ -689,7 +689,7 @@ plt.text(β+0.01, wf.L0/2, 'β') plt.vlines(α, 0, wf.L0, linestyle='--') plt.text(α+0.01, wf.L0/2, 'α') plt.xlabel(r'$\pi$') -plt.title('Objective value function $V(\pi)$') +plt.title(r'Objective value function $V(\pi)$') plt.legend() plt.show() ``` @@ -732,7 +732,7 @@ for i, π_star in enumerate(π_star_arr): axs[row_i, col_i].hlines(V_baye_bar, 0, 1, linestyle='--') axs[row_i, col_i].vlines(π_optimal, V_baye_bar, V_baye.max(), linestyle='--') axs[row_i, col_i].text(π_optimal+0.05, (V_baye_bar + V_baye.max()) / 2, - '${\pi_0^*}=$'+f'{π_optimal:0.2f}') + r'${\pi_0^*}=$'+f'{π_optimal:0.2f}') axs[row_i, col_i].set_xlabel(r'$\pi$') axs[row_i, col_i].set_ylabel(r'$\overline{V}_{baye}(\pi)$') axs[row_i, col_i].set_title(r'$\pi^*=$' + f'{π_star}') @@ -770,7 +770,7 @@ axs[1].plot([π_star_arr.min(), π_star_arr.max()], [π_star_arr.min(), π_star_arr.max()], c='k', linestyle='--', label='45 degree line') axs[1].set_xlabel(r'$\pi^*$') -axs[1].set_title('optimal prior given $\pi^*$') +axs[1].set_title(r'optimal prior given $\pi^*$') axs[1].legend() plt.show() @@ -942,14 +942,14 @@ fig, axs = plt.subplots(1, 2, figsize=(14, 4)) axs[0].plot(np.arange(1, π0_arr.shape[1]+1), np.mean(π0_arr, 0), label='f0 generates') axs[0].plot(np.arange(1, π1_arr.shape[1]+1), 1 - np.mean(π1_arr, 0), label='f1 generates') axs[0].set_xlabel('t') -axs[0].set_ylabel('$E(\pi_t)$ or ($1 - E(\pi_t)$)') +axs[0].set_ylabel(r'$E(\pi_t)$ or ($1 - E(\pi_t)$)') axs[0].set_title('Expectation of beliefs after drawing t observations') axs[0].legend() axs[1].plot(np.arange(1, π0_arr.shape[1]+1), np.var(π0_arr, 0), label='f0 generates') axs[1].plot(np.arange(1, π1_arr.shape[1]+1), np.var(π1_arr, 0), label='f1 generates') axs[1].set_xlabel('t') -axs[1].set_ylabel('var($\pi_t$)') +axs[1].set_ylabel(r'var($\pi_t$)') axs[1].set_title('Variance of beliefs after drawing t observations') axs[1].legend()