diff --git a/.gitattributes b/.gitattributes old mode 100644 new mode 100755 diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/CNAME b/CNAME old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/_config.yml b/_config.yml old mode 100644 new mode 100755 index 5b5fc73..c540148 --- a/_config.yml +++ b/_config.yml @@ -1,7 +1,7 @@ # Site settings title: Probabilistic Models of Cognition v. 2 -baseurl: "" -url: "http://probmods.org" +baseurl: "/probmods2" +url: "http://marielajennings.github.io/probmods2/" # Build settings markdown: kramdown diff --git a/_layouts/chapter.html b/_layouts/chapter.html old mode 100644 new mode 100755 diff --git a/_layouts/default.html b/_layouts/default.html old mode 100644 new mode 100755 diff --git a/_layouts/exercise.html b/_layouts/exercise.html old mode 100644 new mode 100755 diff --git a/_prod.yml b/_prod.yml old mode 100644 new mode 100755 index 8b689b6..9fa261d --- a/_prod.yml +++ b/_prod.yml @@ -1,2 +1,2 @@ markdown: kramdown -baseurl: https://probmods.org +baseurl: https://marielajennings.github.io/probmods2/ diff --git a/assets/bibliography.bib b/assets/bibliography.bib old mode 100644 new mode 100755 index 46d24fe..4d24402 --- a/assets/bibliography.bib +++ b/assets/bibliography.bib @@ -16,7 +16,7 @@ @article{piantadosi2012bootstrapping pages={199--217}, year={2012}, publisher={Elsevier}, - Url={https://colala.bcs.rochester.edu/papers/piantadosi2012bootstrapping.pdf} + Url={http://old.nbu.bg/cogs/events/5_4_D_piantadosi2012bootstrapping.pdf} } @article{cheng1997covariation, diff --git a/assets/css/bootstrap-theme.min.css b/assets/css/bootstrap-theme.min.css old mode 100644 new mode 100755 diff --git a/assets/css/bootstrap-theme.min.css.map b/assets/css/bootstrap-theme.min.css.map old mode 100644 new mode 100755 diff --git a/assets/css/bootstrap.min.css b/assets/css/bootstrap.min.css old mode 100644 new mode 100755 diff --git 
a/assets/css/bootstrap.min.css.map b/assets/css/bootstrap.min.css.map old mode 100644 new mode 100755 diff --git a/assets/css/default.css b/assets/css/default.css old mode 100644 new mode 100755 diff --git a/assets/css/draw.css b/assets/css/draw.css old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_AMS-Regular.eot b/assets/css/fonts/KaTeX_AMS-Regular.eot old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_AMS-Regular.ttf b/assets/css/fonts/KaTeX_AMS-Regular.ttf old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_AMS-Regular.woff b/assets/css/fonts/KaTeX_AMS-Regular.woff old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_AMS-Regular.woff2 b/assets/css/fonts/KaTeX_AMS-Regular.woff2 old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Caligraphic-Bold.eot b/assets/css/fonts/KaTeX_Caligraphic-Bold.eot old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Caligraphic-Bold.ttf b/assets/css/fonts/KaTeX_Caligraphic-Bold.ttf old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Caligraphic-Bold.woff b/assets/css/fonts/KaTeX_Caligraphic-Bold.woff old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Caligraphic-Bold.woff2 b/assets/css/fonts/KaTeX_Caligraphic-Bold.woff2 old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Caligraphic-Regular.eot b/assets/css/fonts/KaTeX_Caligraphic-Regular.eot old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Caligraphic-Regular.ttf b/assets/css/fonts/KaTeX_Caligraphic-Regular.ttf old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Caligraphic-Regular.woff b/assets/css/fonts/KaTeX_Caligraphic-Regular.woff old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Caligraphic-Regular.woff2 b/assets/css/fonts/KaTeX_Caligraphic-Regular.woff2 old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Fraktur-Bold.eot b/assets/css/fonts/KaTeX_Fraktur-Bold.eot 
old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Fraktur-Bold.ttf b/assets/css/fonts/KaTeX_Fraktur-Bold.ttf old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Fraktur-Bold.woff b/assets/css/fonts/KaTeX_Fraktur-Bold.woff old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Fraktur-Bold.woff2 b/assets/css/fonts/KaTeX_Fraktur-Bold.woff2 old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Fraktur-Regular.eot b/assets/css/fonts/KaTeX_Fraktur-Regular.eot old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Fraktur-Regular.ttf b/assets/css/fonts/KaTeX_Fraktur-Regular.ttf old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Fraktur-Regular.woff b/assets/css/fonts/KaTeX_Fraktur-Regular.woff old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Fraktur-Regular.woff2 b/assets/css/fonts/KaTeX_Fraktur-Regular.woff2 old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Main-Bold.eot b/assets/css/fonts/KaTeX_Main-Bold.eot old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Main-Bold.ttf b/assets/css/fonts/KaTeX_Main-Bold.ttf old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Main-Bold.woff b/assets/css/fonts/KaTeX_Main-Bold.woff old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Main-Bold.woff2 b/assets/css/fonts/KaTeX_Main-Bold.woff2 old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Main-Italic.eot b/assets/css/fonts/KaTeX_Main-Italic.eot old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Main-Italic.ttf b/assets/css/fonts/KaTeX_Main-Italic.ttf old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Main-Italic.woff b/assets/css/fonts/KaTeX_Main-Italic.woff old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Main-Italic.woff2 b/assets/css/fonts/KaTeX_Main-Italic.woff2 old mode 100644 new mode 100755 diff --git 
a/assets/css/fonts/KaTeX_Main-Regular.eot b/assets/css/fonts/KaTeX_Main-Regular.eot old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Main-Regular.ttf b/assets/css/fonts/KaTeX_Main-Regular.ttf old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Main-Regular.woff b/assets/css/fonts/KaTeX_Main-Regular.woff old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Main-Regular.woff2 b/assets/css/fonts/KaTeX_Main-Regular.woff2 old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Math-BoldItalic.eot b/assets/css/fonts/KaTeX_Math-BoldItalic.eot old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Math-BoldItalic.ttf b/assets/css/fonts/KaTeX_Math-BoldItalic.ttf old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Math-BoldItalic.woff b/assets/css/fonts/KaTeX_Math-BoldItalic.woff old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Math-BoldItalic.woff2 b/assets/css/fonts/KaTeX_Math-BoldItalic.woff2 old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Math-Italic.eot b/assets/css/fonts/KaTeX_Math-Italic.eot old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Math-Italic.ttf b/assets/css/fonts/KaTeX_Math-Italic.ttf old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Math-Italic.woff b/assets/css/fonts/KaTeX_Math-Italic.woff old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Math-Italic.woff2 b/assets/css/fonts/KaTeX_Math-Italic.woff2 old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Math-Regular.eot b/assets/css/fonts/KaTeX_Math-Regular.eot old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Math-Regular.ttf b/assets/css/fonts/KaTeX_Math-Regular.ttf old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Math-Regular.woff b/assets/css/fonts/KaTeX_Math-Regular.woff old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Math-Regular.woff2 
b/assets/css/fonts/KaTeX_Math-Regular.woff2 old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_SansSerif-Bold.eot b/assets/css/fonts/KaTeX_SansSerif-Bold.eot old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_SansSerif-Bold.ttf b/assets/css/fonts/KaTeX_SansSerif-Bold.ttf old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_SansSerif-Bold.woff b/assets/css/fonts/KaTeX_SansSerif-Bold.woff old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_SansSerif-Bold.woff2 b/assets/css/fonts/KaTeX_SansSerif-Bold.woff2 old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_SansSerif-Italic.eot b/assets/css/fonts/KaTeX_SansSerif-Italic.eot old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_SansSerif-Italic.ttf b/assets/css/fonts/KaTeX_SansSerif-Italic.ttf old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_SansSerif-Italic.woff b/assets/css/fonts/KaTeX_SansSerif-Italic.woff old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_SansSerif-Italic.woff2 b/assets/css/fonts/KaTeX_SansSerif-Italic.woff2 old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_SansSerif-Regular.eot b/assets/css/fonts/KaTeX_SansSerif-Regular.eot old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_SansSerif-Regular.ttf b/assets/css/fonts/KaTeX_SansSerif-Regular.ttf old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_SansSerif-Regular.woff b/assets/css/fonts/KaTeX_SansSerif-Regular.woff old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_SansSerif-Regular.woff2 b/assets/css/fonts/KaTeX_SansSerif-Regular.woff2 old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Script-Regular.eot b/assets/css/fonts/KaTeX_Script-Regular.eot old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Script-Regular.ttf b/assets/css/fonts/KaTeX_Script-Regular.ttf old mode 100644 new mode 100755 diff --git 
a/assets/css/fonts/KaTeX_Script-Regular.woff b/assets/css/fonts/KaTeX_Script-Regular.woff old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Script-Regular.woff2 b/assets/css/fonts/KaTeX_Script-Regular.woff2 old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Size1-Regular.eot b/assets/css/fonts/KaTeX_Size1-Regular.eot old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Size1-Regular.ttf b/assets/css/fonts/KaTeX_Size1-Regular.ttf old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Size1-Regular.woff b/assets/css/fonts/KaTeX_Size1-Regular.woff old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Size1-Regular.woff2 b/assets/css/fonts/KaTeX_Size1-Regular.woff2 old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Size2-Regular.eot b/assets/css/fonts/KaTeX_Size2-Regular.eot old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Size2-Regular.ttf b/assets/css/fonts/KaTeX_Size2-Regular.ttf old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Size2-Regular.woff b/assets/css/fonts/KaTeX_Size2-Regular.woff old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Size2-Regular.woff2 b/assets/css/fonts/KaTeX_Size2-Regular.woff2 old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Size3-Regular.eot b/assets/css/fonts/KaTeX_Size3-Regular.eot old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Size3-Regular.ttf b/assets/css/fonts/KaTeX_Size3-Regular.ttf old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Size3-Regular.woff b/assets/css/fonts/KaTeX_Size3-Regular.woff old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Size3-Regular.woff2 b/assets/css/fonts/KaTeX_Size3-Regular.woff2 old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Size4-Regular.eot b/assets/css/fonts/KaTeX_Size4-Regular.eot old mode 100644 new mode 100755 diff --git 
a/assets/css/fonts/KaTeX_Size4-Regular.ttf b/assets/css/fonts/KaTeX_Size4-Regular.ttf old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Size4-Regular.woff b/assets/css/fonts/KaTeX_Size4-Regular.woff old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Size4-Regular.woff2 b/assets/css/fonts/KaTeX_Size4-Regular.woff2 old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Typewriter-Regular.eot b/assets/css/fonts/KaTeX_Typewriter-Regular.eot old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Typewriter-Regular.ttf b/assets/css/fonts/KaTeX_Typewriter-Regular.ttf old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Typewriter-Regular.woff b/assets/css/fonts/KaTeX_Typewriter-Regular.woff old mode 100644 new mode 100755 diff --git a/assets/css/fonts/KaTeX_Typewriter-Regular.woff2 b/assets/css/fonts/KaTeX_Typewriter-Regular.woff2 old mode 100644 new mode 100755 diff --git a/assets/css/index.css b/assets/css/index.css old mode 100644 new mode 100755 diff --git a/assets/css/katex.min.css b/assets/css/katex.min.css old mode 100644 new mode 100755 diff --git a/assets/css/littlefoot.css b/assets/css/littlefoot.css old mode 100644 new mode 100755 diff --git a/assets/css/webppl-editor.css b/assets/css/webppl-editor.css old mode 100644 new mode 100755 diff --git a/assets/css/webppl-viz.css b/assets/css/webppl-viz.css old mode 100644 new mode 100755 diff --git a/assets/data/enumerateToW1.csv b/assets/data/enumerateToW1.csv old mode 100644 new mode 100755 diff --git a/assets/data/mcmc100_positiveStrength_ToW1.csv b/assets/data/mcmc100_positiveStrength_ToW1.csv old mode 100644 new mode 100755 diff --git a/assets/data/towData.Rdata b/assets/data/towData.Rdata old mode 100644 new mode 100755 diff --git a/assets/data/towData.csv b/assets/data/towData.csv old mode 100644 new mode 100755 diff --git a/assets/img/04_01_a.png b/assets/img/04_01_a.png old mode 100644 new mode 100755 index 9127018..84c80b4 Binary 
files a/assets/img/04_01_a.png and b/assets/img/04_01_a.png differ diff --git a/assets/img/04_01_b.png b/assets/img/04_01_b.png old mode 100644 new mode 100755 index d9a3029..68e4363 Binary files a/assets/img/04_01_b.png and b/assets/img/04_01_b.png differ diff --git a/assets/img/04_01_c.png b/assets/img/04_01_c.png old mode 100644 new mode 100755 index 584ba95..7ce442d Binary files a/assets/img/04_01_c.png and b/assets/img/04_01_c.png differ diff --git a/assets/img/04_01_d.png b/assets/img/04_01_d.png old mode 100644 new mode 100755 index c00a214..4bc5506 Binary files a/assets/img/04_01_d.png and b/assets/img/04_01_d.png differ diff --git a/assets/img/04_01_e.png b/assets/img/04_01_e.png old mode 100644 new mode 100755 diff --git a/assets/img/Beta_distribution_pdf.png b/assets/img/Beta_distribution_pdf.png old mode 100644 new mode 100755 diff --git a/assets/img/CRP.swf b/assets/img/CRP.swf old mode 100644 new mode 100755 diff --git a/assets/img/Cancer-world-tree.png b/assets/img/Cancer-world-tree.png old mode 100644 new mode 100755 diff --git a/assets/img/Checkershadow_illusion_small.png b/assets/img/Checkershadow_illusion_small.png old mode 100644 new mode 100755 diff --git a/assets/img/Checkershadow_proof_small.png b/assets/img/Checkershadow_proof_small.png old mode 100644 new mode 100755 diff --git a/assets/img/Concentration.png b/assets/img/Concentration.png old mode 100644 new mode 100755 diff --git a/assets/img/Cond-dep1.jpg b/assets/img/Cond-dep1.jpg old mode 100644 new mode 100755 diff --git a/assets/img/Curve_fitting.png b/assets/img/Curve_fitting.png old mode 100644 new mode 100755 diff --git a/assets/img/Gamma-dist.png b/assets/img/Gamma-dist.png old mode 100644 new mode 100755 diff --git a/assets/img/Kersten_et_al_explaining_away.png b/assets/img/Kersten_et_al_explaining_away.png old mode 100644 new mode 100755 diff --git a/assets/img/Marg-dep1.jpg b/assets/img/Marg-dep1.jpg old mode 100644 new mode 100755 diff --git a/assets/img/Med-diag-bnet1.jpg 
b/assets/img/Med-diag-bnet1.jpg old mode 100644 new mode 100755 diff --git a/assets/img/Medin54-bugs.png b/assets/img/Medin54-bugs.png old mode 100644 new mode 100755 diff --git a/assets/img/Normal_distribution_pdf.png b/assets/img/Normal_distribution_pdf.png old mode 100644 new mode 100755 diff --git a/assets/img/Pme.png b/assets/img/Pme.png old mode 100644 new mode 100755 diff --git a/assets/img/Sicp-lambda-diagram.png b/assets/img/Sicp-lambda-diagram.png old mode 100644 new mode 100755 diff --git a/assets/img/blocks-world.png b/assets/img/blocks-world.png old mode 100644 new mode 100755 diff --git a/assets/img/blocks.png b/assets/img/blocks.png old mode 100644 new mode 100755 diff --git a/assets/img/boa-learningcurves-1bag.png b/assets/img/boa-learningcurves-1bag.png old mode 100644 new mode 100755 diff --git a/assets/img/boa-learningcurves-manybags.png b/assets/img/boa-learningcurves-manybags.png old mode 100644 new mode 100755 diff --git a/assets/img/box.png b/assets/img/box.png old mode 100644 new mode 100755 diff --git a/assets/img/ch1_donut_new.png b/assets/img/ch1_donut_new.png old mode 100644 new mode 100755 diff --git a/assets/img/cog_32x32.png b/assets/img/cog_32x32.png old mode 100644 new mode 100755 diff --git a/assets/img/favicon.ico b/assets/img/favicon.ico old mode 100644 new mode 100755 diff --git a/assets/img/flip0.7.png b/assets/img/flip0.7.png old mode 100644 new mode 100755 diff --git a/assets/img/flip0.7.svg b/assets/img/flip0.7.svg old mode 100644 new mode 100755 diff --git a/assets/img/grey_wash_wall.png b/assets/img/grey_wash_wall.png old mode 100644 new mode 100755 diff --git a/assets/img/nisbett_model_humans.png b/assets/img/nisbett_model_humans.png old mode 100644 new mode 100755 diff --git a/assets/img/particles_1.svg b/assets/img/particles_1.svg new file mode 100644 index 0000000..a4ae90a --- /dev/null +++ b/assets/img/particles_1.svg @@ -0,0 +1 @@ 
+234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950(state)0.00.00500.0100.0150.0200.0250.0300.0350.0400.0450.0500.0550.060frequency \ No newline at end of file diff --git a/assets/img/particles_2.svg b/assets/img/particles_2.svg new file mode 100644 index 0000000..36ffeda --- /dev/null +++ b/assets/img/particles_2.svg @@ -0,0 +1 @@ +12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849(state)0.00.0500.100.150.200.250.300.35frequency \ No newline at end of file diff --git a/assets/img/pedagogy-pic.png b/assets/img/pedagogy-pic.png old mode 100644 new mode 100755 diff --git a/assets/img/plate_notation.png b/assets/img/plate_notation.png new file mode 100755 index 0000000..09d5fe1 Binary files /dev/null and b/assets/img/plate_notation.png differ diff --git a/assets/img/pomdp_graph.png b/assets/img/pomdp_graph.png old mode 100644 new mode 100755 diff --git a/assets/img/rsa_scene.png b/assets/img/rsa_scene.png old mode 100644 new mode 100755 diff --git a/assets/img/rsa_schema.png b/assets/img/rsa_schema.png old mode 100644 new mode 100755 diff --git a/assets/img/russ_cow_roc.png b/assets/img/russ_cow_roc.png old mode 100644 new mode 100755 diff --git a/assets/img/russ_model_graphical.png b/assets/img/russ_model_graphical.png old mode 100644 new mode 100755 diff --git a/assets/img/russ_results_categories.png b/assets/img/russ_results_categories.png old mode 100644 new mode 100755 diff --git a/assets/img/scalar.png b/assets/img/scalar.png old mode 100644 new mode 100755 diff --git a/assets/img/shape_bias_results_model.png b/assets/img/shape_bias_results_model.png old mode 100644 new mode 100755 diff --git a/assets/img/unifying-table.png b/assets/img/unifying-table.png old mode 100644 new mode 100755 diff --git a/assets/img/unifying.png b/assets/img/unifying.png old mode 100644 new mode 100755 diff --git a/assets/js/bootstrap.min.js b/assets/js/bootstrap.min.js old mode 100644 new mode 100755 diff 
--git a/assets/js/box2d.js b/assets/js/box2d.js old mode 100644 new mode 100755 diff --git a/assets/js/chapter.js b/assets/js/chapter.js old mode 100644 new mode 100755 diff --git a/assets/js/custom.js b/assets/js/custom.js old mode 100644 new mode 100755 diff --git a/assets/js/draw.js b/assets/js/draw.js old mode 100644 new mode 100755 diff --git a/assets/js/ga.js b/assets/js/ga.js old mode 100644 new mode 100755 diff --git a/assets/js/index.js b/assets/js/index.js old mode 100644 new mode 100755 diff --git a/assets/js/jquery.min.js b/assets/js/jquery.min.js old mode 100644 new mode 100755 diff --git a/assets/js/katex.min.js b/assets/js/katex.min.js old mode 100644 new mode 100755 diff --git a/assets/js/littlefoot.min.js b/assets/js/littlefoot.min.js old mode 100644 new mode 100755 diff --git a/assets/js/paper-full.js b/assets/js/paper-full.js old mode 100644 new mode 100755 diff --git a/assets/js/parse-bibtex.js b/assets/js/parse-bibtex.js old mode 100644 new mode 100755 diff --git a/assets/js/physics.js b/assets/js/physics.js old mode 100644 new mode 100755 diff --git a/assets/js/plinko.js b/assets/js/plinko.js old mode 100644 new mode 100755 diff --git a/assets/js/towConfigurations.js b/assets/js/towConfigurations.js old mode 100644 new mode 100755 diff --git a/assets/js/towData.js b/assets/js/towData.js old mode 100644 new mode 100755 diff --git a/assets/js/webppl-editor.min.css b/assets/js/webppl-editor.min.css old mode 100644 new mode 100755 diff --git a/assets/js/webppl-editor.min.js b/assets/js/webppl-editor.min.js old mode 100644 new mode 100755 diff --git a/assets/js/webppl-viz.min.css b/assets/js/webppl-viz.min.css old mode 100644 new mode 100755 diff --git a/assets/js/webppl-viz.min.js b/assets/js/webppl-viz.min.js old mode 100644 new mode 100755 diff --git a/assets/js/webppl.min.js b/assets/js/webppl.min.js old mode 100644 new mode 100755 diff --git a/assets/pdfs/MarkovModels.pdf b/assets/pdfs/MarkovModels.pdf old mode 100644 new mode 100755 diff 
--git a/assets/scripts/14-bda-of-tow.Rmd b/assets/scripts/14-bda-of-tow.Rmd old mode 100644 new mode 100755 diff --git a/chapters/14-bayesian-data-analysis.md b/chapters/14-bayesian-data-analysis.md new file mode 100755 index 0000000..305a2d5 --- /dev/null +++ b/chapters/14-bayesian-data-analysis.md @@ -0,0 +1,1158 @@ +--- +layout: chapter +title: Bayesian data analysis +description: Making scientific inferences about data and models +custom_js: +- assets/js/towData.js +- assets/js/towConfigurations.js +--- + +### Authors: Michael Henry Tessler; Noah Goodman + + + + +Inference by conditioning a generative model is also a basic building block of Bayesian statistics. +In cognitive science this tool can be used in two ways. +If the generative model is a hypothesis about a person's model of the world, then we have a Bayesian *cognitive model* -- the main topic of this book. +If the generative model is instead the scientist's model of how the data are generated, then we have *Bayesian data analysis*. +Bayesian data analysis can be an extremely useful tool to us as scientists, when we are trying to understand what our data mean about psychological hypotheses. +This can become confusing: a particular modeling assumption can be something we hypothesize that people assume about the world, or can be something that we as scientists want to assume (but don't assume that people assume). +A pithy way of saying this is that we can make assumptions about "Bayes in the head" (Bayesian cognitive models) or about "Bayes in the notebook" (Bayesian data analysis). + +# Prologue: Spinning coins + + +Coins, in their production, have a number of physical dimensions along which they vary. +These idiosyncracies have no impact on the behavior of a coin when flipped. +Flipping, it turns out, cross-cuts certain dimensions, and the probability that any modern coin in production will land on heads when flipped is roughly 0.5. + +It turns out spinning a coin is quite different. 
+The heterogeneity of individual coins can be seen in their behavior when spun: The probability that any given coin will land on heads after being *spun* is not 0.5. +The probability that a given coin will spin to heads depends in complicated ways upon the idiosyncracies of that coin. +(N.B. This knowledge is attributed to Persi Diaconis, Dept. of Statistics, Stanford, who M.H.T. was fortunate enough to hear describe his experiments on coins.) + +## People's models of coins + + +Imagine I give you a coin, and want you to estimate the probability it will spin to heads. +Given what you know, the most reasonable prior belief is to expect *any* probability of the coin spinning to heads. +This can be captured in a uniform prior on $$p$$, the probability that a coin when spun will land on heads: `var p = uniform(0,1)`. + +You conduct an experiment. +You spin the coin 20 times. +Fifteen out of these 20 times, the coin spins to heads. + +Let's say you can win $10 if you can predict the next coin flip (heads or tail). +What would you predict (Heads or Tails)? + +Let's consider a slightly more challenging question. +You pay me $10 if you are wrong, but if you are correct, I pay you. +How much do you propose you take from me, should you guess correctly? +Imagine you demand $10 for being correct. +(If you guess correctly, you win $10. +If you guess incorrectly, you lose $10.) +If we believed the coin would spin to Heads or Tails with equal probability, this is a fair betting scheme. +But if we believe the coin is biased to spin to one side or the other with greater than 50% probability, you should offer me more than $10, given that you get to choose which side counts as winning for you. 
+ +~~~~ +var observerModel = function(){ + var p = uniform(0, 1) + var coinSpinner = Binomial({n:20, p:p}) + observe(coinSpinner, 15) + return p +} + +var opts = {method: "rejection", samples: 5000} +var posteriorBeliefs = Infer(opts, observerModel) +print("Expected value = " + expectation(posteriorBeliefs)) +viz.density(posteriorBeliefs, {bounds: [0,1]}) +~~~~ + +The model above is a hypothesis about how a person updates her prior beliefs about the probability of a coin being spun to heads, upon conducting 20 trials of a spinning experiment. +We can use this model to make predictions about other kinds of questions we could then ask the observer. +For instance, let's take up the bet of whether or not the *next spin* will go to heads. +Also, consider if you were to make 10 more spins: How many of them would go to heads? + +~~~~ +var observerModel = function(){ + var p = uniform(0, 1) + var coinSpinner = Binomial({n:20, p:p}) + observe(coinSpinner, 15) + return { + nextOutcome: bernoulli(p), + nextTenOutcomes: binomial(p, 10) + } +} + +var opts = {method: "rejection", samples: 5000} +var posteriorBeliefs = Infer(opts, observerModel) +viz.marginals(posteriorBeliefs) +~~~~ + +A model can be used to make predictions about different tasks. +Models are useful in this way: *They are tools for thinking through the implications of hypotheses*. +In this model, we formalized the hypothesis that people reason about the weight of a coin by assuming the outcomes they observe (i.e., the 15 out of 20 heads) are independent results of spinning the coin, which has some unknown proclivity to be spun to heads. +This reasoning supports responses to other types of questions such as "How likely is the next spin to land on heads?". +We use the same model to see what our hypothesis predicts given this new question. +Each question could be thought of as a different experiment you could run to test the same hypothesis/model. 
+ +Above, we have one hypothesis about a person thinking about the spinning coins experiments. +There are other hypotheses we could make about this experiment. +Rather than have uniform prior beliefs, maybe people port their knowledge of flipping behavior, and believe with some probability that spinning coins will behave similarly: There is a reasonable chance that spinning coins is as random as flipping coins. +But if that belief is not well supported, the observer can accommodate that by inferring that the behavior of spinning coins is quite different than that of flipping coins (as I explained above). If the observer believes the behavior of spinning coins is *different* than that of flipping coins, the model takes on the form of the model above: uniform beliefs about `p`. +We'll call the new model that assumes people's expectations about spinning behavior are influenced by their knowledge of flipping behavior (namely, the observer believes the behavior of spinning coins could be the *same* as that of flipping coins) the `skepticalModel` because it is, in a way, skeptical of the story I told you above about spinning coins. + +~~~~ +var skepticalModel = function(){ + var sameAsFlipping = flip(0.5) + var p = sameAsFlipping ? 0.5 : uniform(0, 1) + var coinSpinner = Binomial({n:20, p:p}) + observe(coinSpinner, 15) + return { + sameAsFlipping: sameAsFlipping, + p: p, + nextOutcome: flip(p), + nextTenOutcomes: binomial(p, 10) + } +} + +var opts = {method: "rejection", samples: 5000} +var posteriorBeliefs = Infer(opts, skepticalModel) +viz.marginals(posteriorBeliefs) +~~~~ + +The predictions are subtly different. +`skepticalModel` pulls the predictions more towards 50/50. +Why is that? + +One way to understand this model is to examine the prior. +Try commenting out the `observe` statement and looking at the predictions. +Examine the plots for `p` and `nextTenOutcomes`. +The prior favors `p` around 0.5 and `nextTenOutcomes` to be about 5. 
+This is because we assume there is a `0.5` probability that the behavior is the same as flipping (`sameAsFlipping = flip(0.5)`). +If indeed it is the same as flipping, `p = 0.5` and the resulting `nextTenOutcomes` would be biased towards expecting around 5 heads. +If it's not the same as flipping, then the predictions are the same as the `observerModel`. +The prior is a mixture of these two possibilities. + +## Scientists' models of people + +The above models instantiate different hypotheses about the coin spinning experiemnts, and make different predictions about what people will do in this situation. +We could recruit volunteers, run experiments in a laboratory, and record our participants' responses. +But, how are we to decide which model is better? + +One way to begin to address this would be to ask ourselves: How much do we believe in each model? +At this point, both models seem totally reasonable to me: I would say I believe in each of them equally. +Okay, then how are we supposed to update these beliefs? + +Since each model makes predictions about what we should observe in an experiment, we already know how to update our beliefs in these models based on the experimental data we observed. +We simply `observe` the data, assuming that it was generated by the better model. + + +~~~~ norun +var scientistModel = function(){ + + var theBetterModel = flip(0.5) ? observerModel : skepticalModel + + observe(theBetterModel, experimentalData) + + return theBetterModel +} +~~~~ + +Note that we are assuming "the better model" generated the data because that is how we are defining what "better" is (i.e., it is a better explanation of how the data were generated). + +We have now instantiated our scientific process as a probabilistic program, doing Bayesian inference. +This is the foundation of Bayesian data analysis. 
+ +We must specify prior beliefs about which is the better model; in this case we say that we don't have any bias in our prior beliefs: each model is equally likely to be better *a priori*. +We then seek to update our beliefs about which is the better model, by observing `experimentalData`, assuming that it came from `theBetterModel`. + +Imagine we ran the "predict the next 10" experiment with 20 participants, and observed the following responses: + + + + +~~~~ norun +var experimentalData = [9,8,7,5,4,5,6,7,9,4,8,7,8,3,9,6,5,7,8,5] +~~~~ + +Look again at the model predictions for `nextTenOutcomes` for the two models above. +Which model do you think is better? + +We are now ready to put all the pieces together: + +~~~~ +///fold: +var opts = {method: "rejection", samples: 5000} +print("generating observer model predictions") +var observerModel = Infer(opts, function(){ + var p = uniform(0, 1) + var coinSpinner = Binomial({n:20, p:p}) + observe(coinSpinner, 15) + return binomial(p, 10) +}) +viz(observerModel) + +print("generating skeptical model predictions") +var skepticalModel = Infer(opts, function(){ + var sameAsFlipping = flip(0.5) + var p = sameAsFlipping ? 0.5 : uniform(0, 1) + var coinSpinner = Binomial({n:20, p:p}) + observe(coinSpinner, 15) + return binomial(p, 10) +}) +/// +viz(skepticalModel) + +var experimentalData = [9,8,7,5,4,5,6,7,9,4,8,7,8,3,9,6,5,7,8,5] + +// package the models up in an Object (for ease of reference) +var modelObject = {observerModel: observerModel, skepticalModel: skepticalModel}; + +var scientistModel = function(){ + var theBetterModel_name = flip(0.5) ? "observerModel" : "skepticalModel" + var theBetterModel = modelObject[theBetterModel_name] + map(function(d){ observe(theBetterModel, d) }, experimentalData) + return {betterModel: theBetterModel_name} +} + +var modelPosterior = Infer({method: "enumerate"}, scientistModel) + +viz(modelPosterior) +~~~~ + +What is the result of the model comparison model (which model is preferred)? 
+ +Examine the predictions of each individual model and look at the `experimentalData`. +The differences in model predictions are subtle. +What parts of the experimental data are leading to the `scientistModel` preferring one model over another? + + +### Closing remarks to Prologue + +We've just walked through a complete example of formalizing a cognitive model (i.e., a hypothesis) in a probabilistic program and building a probabilistic program to decide among competing hypotheses. +The rest of this chapter will be focused on examining each part in more detail. + + + +# Learning about a hypothesis + + + +Bayesian data analysis is a general purpose data analysis approach for making explicit hypotheses about the generative process behind the experimental data (i.e., how was the experimental data generated? e.g., the hypothesis that data from two experimental conditions came from two different distributions). +After making explicit hypotheses, Bayesian inference can be used to *invert* the model: go from experimental data to updated beliefs about the hypotheses. + +For further reading on Bayesian data analysis: see [Lee & Wagenmakers (2013)](https://bayesmodels.com/), +[Kruschke (2014)](https://sites.google.com/site/doingbayesiandataanalysis/), and [Gelman et al. (2014)](http://www.stat.columbia.edu/~gelman/book/). + +## Parameters and predictives + +Models have parameters. +Parameters can be of theoretical interest or not (so-called, nuisance parameters). +Learning about a hypothesis involves inferences based on the values of parameters. +For example, you want to find out how likely it is that Candidate A will win an election. +To do this, you try to estimate the proportion of eligible voters in the United States who will vote for Candidate A in the election. + +Parameters are in general unobservable (or, "latent"). +Trying to estimate how many (voting age, likely to vote) people prefer Candidate A vs. 
Candidate B would require asking over 100 million people (it's estimated that about 130 million people voted in the US Presidential Elections in 2008 and 2012). +It's impractical to measure the whole distribution. +Instead, what is done is measuring a sample (maybe we ask 1000 people), and use that to make inference about the "true population proportion" (an unobservable parameter). + +Bayes’ rule provides a bridge between the unobserved parameters of models and the observed data. +We can update our beliefs about parameters from data. +Additionally, the "Bayes bridge" can go from parameters to data: we can use our updated beliefs about a parameter to make predictions about future data sets. + +For a given Bayesian model (together with data), there are four conceptually distinct distributions of interest: + +For parameters, we have priors and posteriors: + ++ The *prior distribution over parameters* captures our initial state of knowledge (or, our beliefs) about the values that the latent parameters could have. ++ The *posterior distribution over parameters* captures what we know about the latent parameters having updated our beliefs with the evidence provided by data. + +We can run either the prior or the posterior model forward, and have it make predictions about data sets: + ++ The *prior predictive distribution* tells us what data to expect, given our model and our initial beliefs about the parameters. +The prior predictive is a distribution over data, and gives the relative probability of different *observable* outcomes before we have seen any data. ++ The *posterior predictive distribution* tells us what data to expect, given the same model we started with, but with beliefs that have been updated by the observed data. The posterior predictive is a distribution over data, and gives the relative probability of different observable outcomes, after some data has been seen. 
+ +Loosely speaking, *predictive* distributions are in "data space" and *parameter* distributions are in "latent parameter space". + +## A simple illustration + +Here, we explore the result of an experiment with 20 trials and binary outcomes (e.g., asking people if they'll vote for Candidate A or Candidate B, ...). +Note that this is the same model as the "observerModel" above. + +~~~~ +// observed data +var k = 1 // number of people who support candidate A +var n = 20 // number of people asked + +var model = function() { + + // true population proportion who support candidate A + var p = uniform(0, 1); + + // Observed k people support "A" + // Assuming each person's response is independent of each other + observe(Binomial({p : p, n: n}), k); + + // predict what the next n will say + var posteriorPredictive = binomial(p, n); + + // recreate model structure, without observe + var prior_p = uniform(0, 1); + var priorPredictive = binomial(prior_p, n); + + return { + prior: prior_p, priorPredictive : priorPredictive, + posterior : p, posteriorPredictive : posteriorPredictive + }; +} + +var opts = {method: "rejection", samples: 2000}; +var posterior = Infer(opts, model); + +viz.marginals(posterior) +~~~~ + +Try to interpret each plot, and how they relate to each other. +Why are some plots densities and others bar graphs? +Understanding these ideas is a key to understanding Bayesian analysis. +Check your understanding by trying other data sets, varying both `k` and `n`. + + + +## Posterior prediction and model checking + +The posterior predictive distribution describes what data you should expect to see, given the model you've assumed and the data you've collected so far. +If the model is a good description of the data you've collected, then the model shouldn't be surprised if you got the same data by running the experiment again. +That is, the most likely data for your model after observing your data should be the data you observed. 
+ +It's natural then to use the posterior predictive distribution to examine the descriptive adequacy of a model. +If these predictions do not match the data *already seen* (i.e., the data used to arrive at the posterior distribution over parameters), the model is descriptively inadequate. + +Imagine you're a developmental psychologist, piloting a two-alternative forced choice task on young children. +(Just for fun, let's pretend it's a helping study, where the child either chooses to help or not help a confederate in need.) +You have two research assistants that you send to two different preschools to collect data. +You got your first batch of data back today: For one of your research assistants, 10 out of 10 children tested helped the confederate in need. For the other research assistant, 0 out of 10 children tested helped. + +We'll use the `editor.put()` function to save our results so we can look at them in different code boxes. + +~~~~ +///fold: +var marginalize = function(dist, key){ + return Infer({method: "enumerate"}, function(){ + return sample(dist)[key] + }) +} +/// +// "Kids who help" in 2 experiments +var k1 = 0; +var k2 = 10; + +// Number of kids in 2 experiments +var n1 = 10; +var n2 = 10; + +var model = function() { + + // "true effect in the population" + var p = uniform(0, 1); + + // observed data from 2 experiments + observe(Binomial({p: p, n: n1}), k1); + observe(Binomial({p: p, n: n2}), k2); + + // posterior prediction + var posteriorPredictive1 = binomial(p, n1) + var posteriorPredictive2 = binomial(p, n2) + + return { + parameter : p, + predictive: { + predictive1: posteriorPredictive1, + predictive2: posteriorPredictive2 + } + }; +} + +var opts = { + method: "MCMC", callbacks: [editor.MCMCProgress()], + samples: 20000, burn: 10000 +}; + +var posterior = Infer(opts, model); + +var posteriorPredictive = marginalize(posterior, "predictive") +// save results for future code boxes +editor.put("posteriorPredictive", posteriorPredictive) + +var 
parameterPosterior = marginalize(posterior, "parameter") +viz.density(parameterPosterior, {bounds: [0, 1]}) +~~~~ + +Looks like a reasonable posterior distribution. + +How does the posterior predictive look? + +~~~~ +var posteriorPredictive = editor.get("posteriorPredictive") +viz(posteriorPredictive) +~~~~ + +This plot will be a heat map because our posterior predictive distribution is over two dimensions (i.e., future data points collected by experimenter 1 and experimenter 2). +The intensity of the color represents the probability. + +How well does it recreate the observed data? +Where in this 2-d grid would our observed data land? + +Another way of visualizing the model-data fit is to examine a scatterplot. +Here, we will plot the "Maximum A-Posteriori" value as a point-estimate of the posterior predictive distribution. +If the data is well predicted by the posterior predictive (i.e., the model is able to accommodate the data well), it would fall along the y = x line. + +~~~~ +var k1 = 0, k2 = 10; +var posteriorPredictive = editor.get("posteriorPredictive") +var posteriorPredictiveMAP = posteriorPredictive.MAP().val +viz.scatter( + [ + {model: posteriorPredictiveMAP.predictive1, data: k1}, + {model: posteriorPredictiveMAP.predictive2, data: k2} + ] +) +~~~~ + +How well does the posterior predictive match the data? +What can you conclude about the parameter `p`? + + + + + + +# Comparing hypotheses + +In the above examples, we've had a single data-analysis model and used the experimental data to learn about the parameters of the models and the descriptive adequacy of the models. +Often as scientists, we are in the fortunate position of having multiple, distinct models in hand, and want to decide if one or another is a better description of the data. +Indeed, we saw an example with the spinning coins when we decided whether `"observerModel"` or `"skepticalModel"` was a better explanation of some data. 
+ +In that example, model comparison was shown to be a special case of learning about the parameters of a model. +In that case, we defined an uber model (`scientistModel`), that had a binary decision parameter that we wanted to learn about (which one of the models was better). +We did this by having a binary decision variable gate between which of our two models we let generate the data. +We then go backwards (performing Bayesian inference) to decide which model was more likely to have generated the data we observed. + +We take the same approach here, articulating a simple data analysis model for model comparison. +We observe some number of binary outcomes and want to decide if the pattern we see is random or not (e.g., to see if the helping behavior "is systematic" [assuming we've figured out the weird stuff going on with the experimenters before]). +This model mimics a simple cognitive model for subjective randomness, which we will explore in another chapter of this book on Subjective Randomness. + +Formally, a "true random" pattern would be generated by a coin with probability of doing one or the other binary outcome as 0.5 (a fair coin); a "non random" pattern would be generated by a trick coin. +We run into our first complication: A fair coin is simple to define: `Bernoulli({p: 0.5})`, but how can we treat a trick coin? +For purposes of scientific hypothesis testing, a trick coin is formalized as a coin with some *unknown* weight: + +~~~~ norun +var p = uniform(0,1); +Bernoulli({p: p}) +~~~~ + +Using this to now compare models: + +~~~~ +var k = 7, n = 20; + +var compareModels = function() { + + // binary decision variable for which hypothesis is better + var x = flip(0.5) ? "simple" : "complex"; + var p = (x == "simple") ? 
0.5 : uniform(0, 1); + + observe(Binomial({p: p, n: n}), k); + + return {model: x} +} + +var opts = {method: "rejection", samples: 2000}; +print("We observed " + k + " successes out of " + n + " attempts") +var modelPosterior = Infer(opts, compareModels); +viz(modelPosterior) +~~~~ + +This model is an example from the classical hypothesis testing framework. +We consider a model that fixes one of its parameters to a pre-specified value of interest (here $$\mathcal{H_0} : p = 0.5$$). +This is sometimes referred to as a *null hypothesis*. +The other model says that the parameter is free to vary. +In the classical hypothesis testing framework, we would write: $${H_1} : p \neq 0.5$$. +With Bayesian hypothesis testing, we must be explicit about what $$p$$ is (not just what p is not), so we write $${H_1} : p \sim \text{Uniform}(0, 1) $$. + +One might have a conceptual worry: Isn't the second model just a more general case of the first model? +That is, if the second model has a uniform distribution over `p`, then `p: 0.5` is included in the second model. +This is what's called a *nested model*. + +Shouldn't the more general model always be better? +If we're at a track, and you bet on horse A, and I bet on horse A and B, aren't I strictly in a better position than you? +The answer is no, and the reason has to do with our metric for winning. +Intuitively, we don't care whether your horse won or not, but how much money you win. +How much money you win depends on how much money you bet, and the rule is, when we go to track, we have the same amount of money. + +In probabilistic models, our money is probabilities. Each model must allocate its probability so that it sums to 1. +So my act of betting on horse A and horse B actually requires me to split my money (say, betting 50 / 50 on each). +On the other hand, you put all your money on horse A (100 on A, 0 on B). +If A wins, you will gain more money because you put more money down. 
+ +This idea is called the principle of parsimony or Occam's razor, and will be discussed at length later in this book. +For now, it's sufficient to know that more complex models will be penalized for being more complex, intuitively because they will be diluting their predictions. +At the same time, more complex models are more flexible and can capture a wider variety of data (they are able to bet on more horses, which increases the chance that they will win some money). +Bayesian model comparison lets us weigh these costs and benefits. + + +## Bayes' factor + +What we are plotting above are **posterior model probabilities**. +These are a function of the marginal likelihoods of the data under each hypothesis and the prior model probabilities (here, defined to be equal: `flip(0.5)`). +Sometimes, scientists feel a bit strange about reporting values that are based on prior model probabilities (what if scientists have different priors as to the relative plausibility of the hypotheses?) and so often report the ratio of marginal likelihoods, a quantity known as a *Bayes Factor*. + +Let's compute the Bayes' Factor, by computing the likelihood of the data under each hypothesis. + +~~~~ +var k = 7, n = 20; + +var simpleLikelihood = Math.exp(Binomial({p: 0.5, n: n}).score(k)) + +var complexModel = Infer({method: "forward", samples: 10000}, function(){ + var p = uniform(0, 1); + return binomial(p, n) +}) +var complexLikelihood = Math.exp(complexModel.score(k)) + +var bayesFactor_01 = simpleLikelihood / complexLikelihood +bayesFactor_01 +~~~~ + +How does the Bayes Factor in this case relate to posterior model probabilities above? + +## Savage-Dickey method + +For this example, the Bayes factor can be obtained by integrating out the model parameter (using `Infer` with `{method: "forward"}`). +However, it is not always easy to get good estimates of the two marginal probabilities. 
+It turns out, the Bayes factor can also be obtained by considering *only* the more complex hypothesis ($$\mathcal{H}_1$$). +What you do is look at the distribution over the parameter of interest (here, $$p$$) at the point of interest (here, $$p = 0.5$$). +Dividing the probability density of the posterior by the density of the prior (of the parameter at the point of interest) also gives you the Bayes Factor! +This perhaps surprising result was described by Dickey and Lientz (1970), and they attribute it to Leonard "Jimmie" Savage. +The method is called the *Savage-Dickey density ratio* and is widely used in experimental science. + +We would use it like so: + +~~~~ +var k = 7, n = 20; + +var complexModelPrior = Infer({method: "forward", samples: 10000}, function(){ + var p = uniform(0, 1); + return p +}) + +var complexModelPosterior = Infer({method: "rejection", samples: 10000}, function(){ + var p = uniform(0, 1); + observe(Binomial({p: p, n: n}), k); + return p +}) + +var savageDickeyDenomenator = expectation(complexModelPrior, function(x){return Math.abs(x-0.5)<0.05}) +var savageDickeyNumerator = expectation(complexModelPosterior, function(x){return Math.abs(x-0.5)<0.05}) +var savageDickeyRatio = savageDickeyNumerator / savageDickeyDenomenator +print( savageDickeyRatio ) +~~~~ + +(Note that we have approximated the densities by looking at the expectation that $$p$$ is within $$0.05$$ of the target value $$p=0.5$$.) + + + + + +# Example: Linear regression and tug of war + +One of the virtues of Bayesian data analysis is its ability to interface with Bayesian models of cognition in a natural way. +Bayesian cognitive models are formalizations of hypotheses about cognition, which we then can test with an experiment. +We can contrast our rich Bayesian cognitive models with more standard models from data science, like linear regression, and evaluate them all using Bayesian data analysis. + +Regression is the workhorse of data science. 
+Regression models are useful in situations when you have (1) data and (2) some (potentially vague) hypotheses about how variables relate to each other (e.g., that demographics might predict political party affiliation [in some unspecified way]). +In psychology and many other behavioral sciences, experiments are often constructed with discrete/categorical manipulations (e.g., measuring processing time of words vs. pseudowords). +The question "is A greater than B?" (is the processing time of words faster than the processing time of pseudowords?) can be answered using a regression model. + +To explore a Bayesian linear regression model, we will use data from the Tug-of-War experiment by @Gerstenberg2012. +Let's be good data scientists, and start by just taking a look at the data set, found in the `towData` variable (available in this page only). + +~~~~ +var levels = function(a, lvl){ return _.uniq(_.map(a, lvl)) } + +// display single row of the data frame +print(towData[0]) +// display unique levels of "pattern" variable +print(levels(towData, "pattern")) +// display unique levels of "tournament" variable +print(levels(towData, "tournament")) +// display unique levels of "nWins" variable +print(levels(towData, "nWins")) +// display unique levels of "id" variable [participant id] +print(levels(towData, "id")) +~~~~ + +The first line printed is a line from our data set: one participant on one trial. +We see that it has many different fields, including the trial number, their raw rating ("rating"), a normalized score ("ratingZ"), and information about the experimental condition. +Here, this was the condition "confounded evidence" in a "singles" tournament: here, the target player won 3 times against the same player (for a full list of the experimental conditions see @Gerstenberg2012 Tables 2 and 3). + +The other lines show the unique values different variables can take on. + +Let's plot the `ratingZ` variable (a normalized rating). 
+ +~~~~ +viz.hist(_.map(towData, "ratingZ")) +~~~~ + +This distribution of ratings is from all trials, all participants, all experimental conditions. +We see that the ratings range from about -2 to 2. +The most likely ratings are one standard deviation above or below the mean, though some ratings are at the mean of 0. + +## Single regression + +Let's say we ran this experiment and hypothesized that the number of times the target character won (`"nWins"` in the data set) is a predictor of participants' ratings of strength. +We'll formalize this in a Bayesian regression framework, where ratings of strength $$r$$ are a linear combination of a fixed intercept $$\beta_0$$ and weighted component of number of wins $$\beta_1 * n_{wins}$$. + +$$y_{predicted} = \beta_0 + \beta_1 * n_{wins}$$ + +Because we're in the business of building generative models, we will have to be explicit about how $$y_{predicted}$$ relates to the actual rating data we observed. +We make the standard assumption that the actual ratings are normally distributed around $$y_{predicted}$$, with some noise $$\sigma$$. [This is analogous to having "randomly distributed errors".] + +$$d \sim \mathcal{N}(y_{predicted}, \sigma)$$ + +This is a model of our data. +As in cognitive models, we will put priors on the parameters: $$\beta_0, \beta_1, \sigma$$, and infer their likely values by conditioning on the observed data. 
+ +~~~~ +// alternative proposal distribution for metropolis-hastings algorithm +var uniformKernel = function(prevVal) { + return Uniform({a: prevVal - 0.2, b: prevVal + 0.2}); +}; + +var singleRegression = function(){ + // parameters of a simple linear regression + var b0 = sample(Uniform({a: -1, b: 1}), {driftKernel: uniformKernel}) + var b1 = sample(Uniform({a: -1, b: 1}), {driftKernel: uniformKernel}) + var sigma = sample(Uniform({a: 0, b: 2}), {driftKernel: uniformKernel}) + + map(function(d){ + + // linear regression formula + var predicted_y = b0 + d.nWins*b1 + + observe(Gaussian({mu: predicted_y, sigma: sigma}), d.ratingZ) + + }, towData) + + return {b0: b0, b1: b1, sigma: sigma} +} + +var nSamples = 2500 +var opts = { method: "MCMC", callbacks: [editor.MCMCProgress()], + samples: nSamples, burn: nSamples/2 } + +var posterior = Infer(opts, singleRegression) + +editor.put("singleRegression", posterior) +~~~~ + + +~~~~ +var posterior = editor.get("singleRegression") +viz.marginals(posterior) +~~~~ + +The posteriors are somewhat noisy because we haven't taken that many samples. +We see that the intercept $$\beta_0$$ is around 0, which we might expect given that our data is normalized. +The slope weight $$\beta_1$$ is around 0.35, with relatively low variance around that. +The fact that it's very unlikely for $$\beta_1$$ to be 0 suggests that there is an effect of the number of times the actor has won in Tug of War on participants' judgments of the relative strength of that actor, as we might hope. +$$\sigma$$ is almost around 0.5, which seems a little bit high given that the full range of the response ratings is 4 (-2 to +2). + +### Model criticism with posterior prediction + +We can now critique the model by asking how well it would generate our data. +To do this, we look at the posterior predictive distribution. +There are 20 different experimental conditions (wins vs. loss, singles vs. doubles, and 4 - 6 different kinds of tournaments). 
+We want to examine our predictions for each of these conditions separately, so we rewrite the model slightly by mapping over each condition variable separately. + +~~~~ +var merge = function(m, d){ + var keys = _.keys(d) + map(function(k){return {model: m[k], data: d[k], item:k} }, keys) +} + +var levels = function(a, lvl){ return _.uniq(_.map(a, lvl)) } + +var outcomes = levels(towData, "outcome"); +var tournaments = levels(towData, "tournament"); +var patterns = { + single: levels(_.where(towData, {tournament: "single"}), "pattern"), + double: levels(_.where(towData, {tournament: "double"}), "pattern") +}; + +// alternative proposal distribution for metropolis-hastings algorithm +var uniformKernel = function(prevVal) { + return Uniform({a: prevVal - 0.2, b: prevVal + 0.2}); +}; + +var singleRegression = function(){ + var b0 = sample(Uniform({a: -1, b: 1}), {driftKernel: uniformKernel}) + var b1 = sample(Uniform({a: -1, b: 1}), {driftKernel: uniformKernel}) + var sigma = sample(Uniform({a: 0, b: 2}), {driftKernel: uniformKernel}) + + var predictions = map(function(tournament){ + return map(function(outcome){ + return map(function(pattern){ + + var itemInfo = {pattern: pattern, tournament: tournament, outcome: outcome} + var itemData = _.where(towData, itemInfo) + + // linear regression formula + var predicted_y = b0 + itemData[0]["nWins"]*b1 + + map(function(d){ observe(Gaussian({mu: predicted_y, sigma: sigma}), d.ratingZ)}, itemData) + + return _.zipObject([[pattern + "_" + tournament + "_" + outcome, predicted_y]]) + + }, patterns[tournament]) // singles tournaments don't have all patterns + }, outcomes) + }, tournaments) + + // nasty data munging + return _.zipObject(_.flatten(map(function(i){ _.pairs(i) }, _.flatten(predictions)), true)) +} + +var nSamples = 500 +var opts = { method: "MCMC", callbacks: [editor.MCMCProgress()], + samples: nSamples, burn: nSamples/2 } + +var posterior = Infer(opts, singleRegression) + +var modelDataDF = 
merge(posterior.MAP().val, towMeans) + +viz.scatter(modelDataDF) +editor.put('singleRegression', posterior) +editor.put('modelDataDF', modelDataDF) + +~~~~ + + + +~~~ +///fold: +var correlation = function(xs, ys) { + var mx = sum(xs)/xs.length, + my = sum(ys)/ys.length; + var num = sum(map2(function(x,y) { (x-mx) * (y-my)}, xs, ys)); + var den = Math.sqrt(sum(map(function(x) { (x-mx) * (x-mx)},xs))) * + Math.sqrt(sum(map(function(y) { (y-my) * (y-my)},ys))); + return num/den +} +/// + +var modelDataDF = editor.get('modelDataDF') + +var summaryData = map(function(x){ + return _.extend(x, {sqErr: Math.pow(x.model-x.data, 2)}) +}, modelDataDF) + +print("Mean squared error = " + listMean(_.map(summaryData, "sqErr"))) + +var varianceExplained = Math.pow(correlation(_.map(summaryData, "data"), _.map(summaryData, "model")), 2) +print("Model explains " + Math.round(varianceExplained*100) + "% of the data") + +viz.table(summaryData) +~~~ + +The simple linear regression does surprisingly well on this data set (at least at predicting the mean responses). +This is important to know; it provides a standard against which we can evaluate richer models. + +At the same time, we observe in the posterior predictive scatterplot that the linear model is predicting certain symmetries that don't come out in the data. +Why might that be? + +## Multiple regression + +Now, some of the conditions have Alice winning against the same person, so maybe it's also important how many unique wins she has. 
+ +$$y_{predicted} = \beta_0 + \beta_1 * n_{wins} + \beta_2 * wins_{unique}$$ + + +~~~~ +///fold: +var levels = function(a, lvl){ return _.uniq(_.map(a, lvl)) } + +var outcomes = levels(towData, "outcome"); +var tournaments = levels(towData, "tournament"); +var patterns = { + single: levels(_.where(towData, {tournament: "single"}), "pattern"), + double: levels(_.where(towData, {tournament: "double"}), "pattern") +}; + +// alternative proposal distribution for metropolis-hastings algorithm +var uniformKernel = function(prevVal) { + return Uniform({a: prevVal - 0.2, b: prevVal + 0.2}); +}; +/// + +var multipleRegression = function(){ + var b0 = sample(Uniform({a: -1, b: 1}), {driftKernel: uniformKernel}) + var b1 = sample(Uniform({a: -1, b: 1}), {driftKernel: uniformKernel}) + var b2 = sample(Uniform({a: -1, b: 1}), {driftKernel: uniformKernel}) + var sigma = sample(Uniform({a: 0, b: 2}), {driftKernel: uniformKernel}) + + + var predictions = map(function(tournament){ + return map(function(outcome){ + return map(function(pattern){ + + var itemInfo = {pattern: pattern, tournament: tournament, outcome: outcome} + var itemData = _.where(towData, itemInfo) + + // linear equation + var predicted_y = b0 + itemData[0]["nWins"]*b1 + itemData[0]["nUniqueWins"]*b2 + + map(function(d){ observe(Gaussian({mu: predicted_y, sigma: sigma}), d.ratingZ) }, itemData) + + return _.zipObject([[pattern + "_" + tournament + "_" + outcome, predicted_y]]) + + }, patterns[tournament]) // singles tournaments don't have all patterns + }, outcomes) + }, tournaments) + + return { + parameters: {b0: b0, b1: b1, b2: b2, sigma: sigma}, + predictives: _.zipObject(_.flatten(map(function(i){ _.pairs(i) }, _.flatten(predictions)), true)) + } +} + +var nSamples = 250 +var opts = { method: "MCMC", kernel: {HMC: {steps: 5, stepSize: 0.01}}, + callbacks: [editor.MCMCProgress()], + samples: nSamples, burn: nSamples/2 } + +var posterior = Infer(opts, multipleRegression) +editor.put('multiRegression', 
posterior) +~~~~ + +Look at parameters. + +~~~~ +///fold: +var marginalize = function(dist, key){ + return Infer({method: "enumerate"}, function(){ + return sample(dist)[key]; + }) +} +/// +var posterior = editor.get('multiRegression'); +var parameterPosterior = marginalize(posterior, "parameters") +viz.marginals(parameterPosterior) +~~~~ + +We see that $$\beta_2$$ is also probably not 0, suggesting that the number of *unique* wins a player has is relevant for predicting participants' judgments of their strength. +How well does the model fit the data? + +~~~~ +///fold: +var marginalize = function(dist, key){ + return Infer({method: "enumerate"}, function(){ + return sample(dist)[key]; + }) +} +var merge = function(m, d){ + var keys = _.keys(d) + return map(function(k){return {model: m[k], data: d[k], item:k} }, keys) +} +var correlation = function(xs, ys) { + var mx = sum(xs)/xs.length, + my = sum(ys)/ys.length; + var num = sum(map2(function(x,y) { (x-mx) * (y-my)}, xs, ys)); + var den = Math.sqrt(sum(map(function(x) { (x-mx) * (x-mx)},xs))) * + Math.sqrt(sum(map(function(y) { (y-my) * (y-my)},ys))); + return num/den +} +/// +var posterior = editor.get('multiRegression'); +var posteriorPredictive = marginalize(posterior, "predictives") + +var modelDataDF = merge(posteriorPredictive.MAP().val, towMeans) + + +var summaryData = map(function(x){ + return _.extend(x, {sqErr: Math.pow(x.model-x.data, 2)}) +}, modelDataDF) + +print("Mean squared error = " + listMean(_.map(summaryData, "sqErr"))) +var varianceExplained = Math.pow(correlation(_.map(summaryData, "data"), _.map(summaryData, "model")), 2) +print("Model explains " + Math.round(varianceExplained*100) + "% of the data") + +viz.scatter(modelDataDF) + +viz.table(summaryData) +~~~~ + +The multiple linear regression model fit is improved a little bit, but still fails to predict meaningful differences between certain conditions. 
+ +With regressions like these, we're often asking binary questions (e.g., "is this parameter 0 or not?"). +These kinds of questions provide just a few bits of information. +Instantiating a hypothesis in a cognitive model can answer more than just categorical questions by testing a richer theory of the data. + +## BDA of Tug-of-war model + +Recall the Tug-of-war model from the chapter on [conditioning]({{site.baseurl}}/chapters/03-conditioning.html). + +~~~~ +var options = {method: 'rejection', samples: 1000} + +var lazinessPrior = 0.3; +var lazyPulling = 0.5; + +var model = function() { + + var strength = mem(function(person){ + return gaussian(0, 1) + }) + var lazy = function(person){ + return flip(lazinessPrior) + } + var pulling = function(person) { + return lazy(person) ? + strength(person) * lazyPulling : + strength(person) + } + var totalPulling = function(team){return sum(map(pulling, team)) } + var winner = function(team1, team2){ + totalPulling(team1) > totalPulling(team2) ? team1 : team2 + } + var beat = function(team1,team2){winner(team1,team2) == team1} + + condition(beat(["bob", "mary"], ["tom", "sue"])) + + return strength("bob") +} + +var posterior = Infer(options, model) +print("Bob's strength, given that he and Mary beat Tom and Sue") + +print("Expected value = " + expectation(posterior)) +viz(posterior) +~~~~ + + +### Learning about the Tug-of-War model + +To learn more about (and test) the tug-of-war model, we're going to connect it to the data from the experiment. +You'll notice that we have two parameters in this model: the proportion of a person's strength they pull with when they are being lazy (`lazyPulling`) and the prior probability of a person being lazy (`lazinessPrior`). +Above, we set these parameters to be `0.5` and `0.3`, respectively. +People are lazy about a third of the time, and when they are lazy, they pull with half their strength. 
+(Technical note: Because we are comparing relative strengths and we have normalized the human ratings, we don't have to infer the parameters of the gaussian in `strength`. +We just use the standard normal distribution.) + +Those parameter values aren't central to our hypothesis. +They are peripheral details to the larger hypothesis which is that people reason about team games like Tug of War by running a structured, generative model in their heads and doing posterior inference. +Rather than guessing at what values we should put for these parameters, we can use the data to inform our beliefs about what those parameters are likely to be (assuming the general model is a good one). + +~~~~ +///fold: +var levels = function(a, lvl){ return _.uniq(_.map(a, lvl)) } + +var outcomes = levels(towData, "outcome"); +var tournaments = levels(towData, "tournament"); +var patterns = { + single: levels(_.where(towData, {tournament: "single"}), "pattern"), + double: levels(_.where(towData, {tournament: "double"}), "pattern") +}; + +var round = function(x){ + return Math.round(x*10)/10 +} + +var bins = map(round, _.range(-2.2, 2.2, 0.1)) + +// alternative proposal distribution for metropolis-hastings algorithm +var lazinessPriorKernel = function(prevVal) { + return Uniform({a: prevVal - 0.1, b: prevVal + 0.1}); +}; +var lazyPullingKernel = function(prevVal) { + return Uniform({a: prevVal - 0.2, b: prevVal + 0.2}); +}; +/// + +// add a tiny bit of noise, and make sure every bin has at least epsilon probability +var smoothToBins = function(dist, sigma, bins){ + Infer({method: "enumerate"}, function(){ + var x = sample(dist); + var smoothedProbs = map(function(b){return Number.EPSILON+Math.exp(Gaussian({mu: x, sigma: sigma}).score(b)) }, bins) + return categorical(smoothedProbs, bins) + }) +} + +var tugOfWarOpts = {method: "rejection", samples: 500} + +var tugOfWarModel = function(lazyPulling, lazinessPrior, matchInfo){ + Infer(tugOfWarOpts, function(){ + + var strength = 
mem(function(person){ + return gaussian(0, 1) + }) + + var lazy = function(person){ + return flip(lazinessPrior) + } + var pulling = function(person) { + return lazy(person) ? + strength(person) * lazyPulling : + strength(person) + } + var totalPulling = function(team){return sum(map(pulling, team)) } + var winner = function(team1, team2){ + totalPulling(team1) > totalPulling(team2) ? team1 : team2 + } + var beat = function(team1,team2){winner(team1,team2) == team1} + + condition(beat(matchInfo.winner1, matchInfo.loser1)) + condition(beat(matchInfo.winner2, matchInfo.loser2)) + condition(beat(matchInfo.winner3, matchInfo.loser3)) + + return round(strength("A")) + + }) +} + +var dataAnalysisModel = function(){ + var lazinessPrior = sample(Uniform({a: 0, b: 0.5}), {driftKernel: lazinessPriorKernel}) + var lazyPulling = sample(Uniform({a: 0, b: 1}), {driftKernel: lazyPullingKernel}) + + var predictions = map(function(tournament){ + return map(function(outcome){ + return map(function(pattern){ + + + var itemInfo = {pattern: pattern, tournament: tournament, outcome: outcome} + // participants' ratings + var itemData = _.where(towData, itemInfo) + + // information about the winners and losers + var matchInformation = _.where(matchConfigurations, itemInfo)[0] + + var modelPosterior = tugOfWarModel(lazyPulling, lazinessPrior, matchInformation) + var smoothedPredictions = smoothToBins(modelPosterior, 0.05, bins) + + map(function(d){ observe(smoothedPredictions, d.roundedRating) }, itemData) + + return _.zipObject([[pattern + "_" + tournament + "_" + outcome, expectation(modelPosterior)]]) + + }, patterns[tournament]) // singles tournaments don't have all patterns + }, outcomes) + }, tournaments) + + return { + parameters: {lazinessPrior: lazinessPrior, lazyPulling: lazyPulling}, + predictives: _.zipObject(_.flatten(map(function(i){ _.pairs(i) }, _.flatten(predictions)), true)) + } +} + +var nSamples = 20 +var opts = { method: "MCMC", + callbacks: [editor.MCMCProgress()], + 
samples: nSamples, burn: 0 } + +var posterior = Infer(opts, dataAnalysisModel) +editor.put("bda_bcm", posterior) +~~~~ + +Look at parameters. + +~~~~ +///fold: +var marginalize = function(dist, key){ + return Infer({method: "enumerate"}, function(){ + return sample(dist)[key]; + }) +} +/// +var posterior = editor.get('bda_bcm'); +var parameterPosterior = marginalize(posterior, "parameters") +viz.marginals(parameterPosterior) +~~~~ + +Critique posterior predictive + +~~~~ +///fold: +var marginalize = function(dist, key){ + return Infer({method: "enumerate"}, function(){ + return sample(dist)[key]; + }) +} +var merge = function(m, d){ + var keys = _.keys(d) + return map(function(k){return {model: m[k], data: d[k], item:k} }, keys) +} +var correlation = function(xs, ys) { + var mx = sum(xs)/xs.length, + my = sum(ys)/ys.length; + var num = sum(map2(function(x,y) { (x-mx) * (y-my)}, xs, ys)); + var den = Math.sqrt(sum(map(function(x) { (x-mx) * (x-mx)},xs))) * + Math.sqrt(sum(map(function(y) { (y-my) * (y-my)},ys))); + return num/den +} +/// +var posterior = editor.get('bda_bcm'); +var posteriorPredictive = marginalize(posterior, "predictives") + +var modelDataDF = merge(posteriorPredictive.MAP().val, towMeans) + + +var summaryData = map(function(x){ + return _.extend(x, {sqErr: Math.pow(x.model-x.data, 2)}) +}, modelDataDF) + +print("Mean squared error = " + listMean(_.map(summaryData, "sqErr"))) +var varianceExplained = Math.pow(correlation(_.map(summaryData, "data"), _.map(summaryData, "model")), 2) +print("Model explains " + Math.round(varianceExplained*100) + "% of the data") + +viz.scatter(modelDataDF) +viz.table(summaryData) +~~~~ + +An extended analysis of the Tug of War model (using [RWebPPL](https://github.com/mhtess/rwebppl)) can be found [here](http://rpubs.com/mhtess/bda-tow). 
+ +Test your knowledge: [Exercises]({{site.baseurl}}/exercises/14-bayesian-data-analysis.html) diff --git a/chapters/145-non-parametric-models.md b/chapters/145-non-parametric-models.md old mode 100644 new mode 100755 diff --git a/chapters/150-agents-as-programs.md b/chapters/150-agents-as-programs.md old mode 100644 new mode 100755 index af4862c..85064f8 --- a/chapters/150-agents-as-programs.md +++ b/chapters/150-agents-as-programs.md @@ -1,11 +1,11 @@ --- -layout: +layout: chapter title: "Agents as probabilistic programs" description: "One-shot decision problems, softmax choice, and RSA." is_section: true --- Adapted from "[Modeling agents with probabilistic programs](http://agentmodels.org)" by Owain Evans, Andreas Stuhlmüller, John Salvatier, and Daniel Filan, and "[Probabilistic language understanding](https://gscontras.github.io/probLang/)" by Gregory Scontras and Michael Henry Tessler. -Note: To be edited. Need permisions. + ## Introduction diff --git a/chapters/160-sequential-decisions.md b/chapters/160-sequential-decisions.md old mode 100644 new mode 100755 index dafdc5e..e146eeb --- a/chapters/160-sequential-decisions.md +++ b/chapters/160-sequential-decisions.md @@ -1,11 +1,10 @@ --- -layout: +layout: chapter title: "Sequential decisions" description: "Markov Decision Processes and Partially-Observable Markof Decision Processes" is_section: true --- Adapted from "[Modeling agents with probabilistic programs](http://agentmodels.org)" by Owain Evans, Andreas Stuhlmüller, John Salvatier, and Daniel Filan. -Note: To be editted. Need permissions. 
## Introduction diff --git a/chapters/appendix-js-basics.md b/chapters/appendix-js-basics.md old mode 100644 new mode 100755 diff --git a/chapters/appendix-math-review.md b/chapters/appendix-math-review.md new file mode 100755 index 0000000..d0eb0ef --- /dev/null +++ b/chapters/appendix-math-review.md @@ -0,0 +1,357 @@ +--- +layout: chapter +title: Appendix - Mathematics Review +description: A very brief primer on mathematical concepts used in this book. +--- + +FUBAR FUBAR FUBAR + +# Introduction to JavaScript + +JavaScipt is a high-level, untyped programming language commonly used in web development. +WebPPL uses a functional subset of JavaScript, and some basic uses will be reviewed below. + +[JavaScript: The Good Parts](http://bdcampbell.net/javascript/book/javascript_the_good_parts.pdf) is an excellent introduction to the language. +Online tutorials can be found [here](http://www.w3schools.com/js/), [there](https://www.javascript.com), and [elsewhere](https://www.codeschool.com/learn/javascript). + +You can do basic arithmetical operations: + +~~~~ +3 + 3 +~~~~ + +The `+` symbol is also used to concatenate strings: + +~~~~ +"My favorite food is " + "pizza" +~~~~ + +Numeric variables will automatically modified into strings during concatenation: + +~~~~ +3 + " is my favorite number" +~~~~ + +Boolean variables will be automatically changed into numbers when added (`false` becomes 0 and `true` becomes 1) + +~~~~ +true + true +~~~~ + +Equality can be checked using `==` and `===`. +`===` is a stricter comparison which cares about the type of variable (e.g., string, numeric, boolean). 
+
+~~~~
+print(3 == 3)
+print("3" == 3)
+print("3" === 3)
+
+print("Booleans can equal numbers when you don't care about type.")
+print(true == 1)
+print(true === 1)
+~~~~
+
+A summary of comparison and logical operators can be found [here](http://www.w3schools.com/js/js_comparisons.asp)
+
+# Mathematical functions and constants
+
+JavaScript has a built-in "Math" object with properties and methods for mathematical constants and functions. ("Objects" will be described in more detail below.)
+
+For example, to write: $$ 3^2 $$
+
+~~~~
+Math.pow(3,2) // 3^2
+~~~~
+
+The area of a 12 inch pizza.
+
+~~~~
+var radius = 12 / 2
+Math.round(Math.PI*Math.pow(radius, 2)) + " square inches"
+~~~~
+
+A full list of the functions and constants can be found [here](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Math).
+
+# Building More Complex Programs
+
+Consider the following complex expression built out of Boolean operators `||` (*or*) and `&&` (*and*):
+
+~~~~
+true && (true || false)
+~~~~
+
+This expression has an *operator*---the function `&&`---and *arguments*---`true` on the left and `(true || false)` on the right.
+The latter argument itself is a *subexpression* consisting of a different operator---the function `||`---and different arguments---`true` and `false`.
+When reasoning about the evaluation of a program, it is best to think of evaluating the subexpressions first, then substituting their return value into the larger expression.
+In this example, we first evaluate the expression `true || false`, returning true.
+After we substitute this into the larger expression, we have `true && true`, again returning true.
+ +As a slightly more complex example, consider: + +~~~~ +// this line is a comment +if (1 == 2) { // the condition of "if" + 100 // the consequent ("then") +} else { + (true || false) // the alternative ("else") +} +~~~~ + +This expression is composed of an `if` conditional that evaluates the first expression (a test here of whether `1` equals `2`) then evaluates the second expression if the first is true or otherwise evaluates the third expression. +The operator `if` is strictly not a function, because it does not evaluate all of its arguments, but instead *short-circuits* evaluating only the second or third. It has a value like any other function. +(We have also used comments here: anything after a `//` is ignored when evaluating.) + +JavaScript has a very useful and common shorthand for `if` statements: it is called the "ternary" operator, using a question mark `?` and colon `:` to demarcate the three components. + +The syntax is: `condition ? consequent : alternative` + +~~~~ +(1 == 2) ? // the condition of "if" + 100 : // the consequent ("then") + (true || false) // the alternative ("else") +~~~~ + +Ternary statements can be strung together to create multiple different conditions + +~~~~ +(1 == 2) ? 100 : +(2 == 3) ? 200 : +(3 == 4) ? 300 : +(3 == 3) ? 400 : + 500 +~~~~ + +Note the particular indentation style used above (called ''pretty-printing''). +To clarify the structure of a function call, the arguments can split up across different lines and can aid readability: + +~~~~ +(3 * ( + (2 * 4) + (3 + 5) +)) + + ( + (10 - 7) + 6 +) +~~~~ + +The online editor will automatically pretty-print for you. +You can re-indent according to this style by selecting some lines and pressing the TAB key. + +We often want to name objects in our programs so that they can be reused. +This can be done with the `var` statement. 
+`var` looks like this:
+
+~~~~ norun
+var variableName = expression
+~~~~
+
+`variableName` is a *symbol* that is bound to the value that `expression` evaluates to.
+When variables themselves are evaluated they return the value that they have been bound to:
+
+~~~~
+var someVariable = 3 // assign the value 3 to the variable someVariable
+someVariable // when this is evaluated it looks up and returns the value 3
+~~~~
+
+Assignment of variables requires use of `var`
+
+~~~
+someVariable = 3
+someVariable
+~~~
+
+Multiple variables can be assigned in the same line using a `,`.
+To declare the end of a line in JavaScript, use a `;`.
+In WebPPL as in standard JavaScript, the use of `;` is optional, but can be useful for readability.
+
+~~~~
+var x = 3, y = 2;
+y
+~~~~
+
+# Arrays and objects
+
+There are several special kinds of values in JavaScript.
+One kind of special value is an *array*: a sequence of other values.
+
+~~~~
+["this", "is", "an", "array"]
+~~~~
+
+Arrays can be indexed using `[index]` (Note: indexing starts at 0).
+
+~~~~
+var myArray = ["this", "is", "my", "array"]
+myArray[1]
+~~~~
+
+The length can be computed using `.length`
+
+~~~~
+var myArray = ["this", "is", "my", "array"]
+myArray.length
+~~~~
+
+You can grab subsets of the array using `.slice(begin, end)`.
+
+~~~~
+var myArray = ["this", "is", "my", "array"]
+myArray.slice(1,3)
+~~~~
+
+If you don't put an `end`, it will default to the end.
+
+~~~~
+var myArray = ["this", "is", "my", "array"]
+myArray.slice(1)
+~~~~
+
+Arrays can be concatenated together, forming new arrays.
+
+~~~~
+var myFirstArray = ["this", "is"]
+var mySecondArray = ["my", "array"]
+
+myFirstArray.concat(mySecondArray)
+~~~~
+
+`.concat` can take multiple arguments, concatenating together multiple arrays or values simultaneously.
+
+Other helpful methods:
+
+~~~~
+var myArray = ["this", "is", "my", "array"]
+print( myArray.join(" _ ") )
+print( myArray.toString() )
+print( myArray.indexOf("my") )
+~~~~
+
+A list of all the properties of arrays can be found [here](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/of).
+Use caution with these.
+Not all JavaScript methods are supported on arrays in WebPPL.
+Some of these JavaScript methods will have their own WebPPL version.
+A list of the WebPPL functions for arrays can be found [here](http://docs.webppl.org/en/master/functions/arrays.html).
+
+In addition, the WebPPL language has available a JavaScript library useful for dealing with arrays and objects, called Underscore.
+Underscore functions can be accessed using the `_.` prefix.
+A full list of functions available from Underscore can be found [here](http://underscorejs.org/).
+Note: underscore functions that take other functions (called predicates in underscore) as arguments are not available in WebPPL (e.g., `_.map`, `_.filter`, ...).
+
+Here is one example, for reshaping arrays:
+
+~~~~
+_.zip(['frodo', 'gandalf', 'gimli'], ["hobbit", "wizard", "dwarf"], [true, false, false])
+~~~~
+
+In real life, you encounter objects.
+Objects have properties.
+Properties can be accessed using `.property` or `["property"]` syntax.
+
+~~~~
+var bilbo = { firstName: "Bilbo", lastName: "Baggins" }
+print( bilbo.lastName )
+print( bilbo["lastName"] )
+~~~~
+
+The latter is useful when the property is itself a variable.
+
+~~~~
+var bilbo = { firstName: "Bilbo", lastName: "Baggins"}
+var prop = "lastName"
+bilbo[prop]
+// try: bilbo.prop
+~~~~
+
+Objects in JavaScript are very useful for structured data (like a dictionary in Python, or a dataframe in R).
+Underscore has several helpful functions for interacting with objects + +~~~~ +var bilbo = { + firstName: "Bilbo", + lastName: "Baggins", + race: "hobbit", + age: 111, + ringbearer: true +} + +_.keys(bilbo) +~~~~ + +# Building Functions: `function` + +The power of programming languages as a model of computation comes from the ability to make new functions. +To do so, we use the `function` primitive. +For example, we can construct a function that doubles any number it is applied to: + +~~~~ +var double = function(x) { + return x + x +} + +double(3) +~~~~ + +In WebPPL, the use of the `return` keyword is optional. +By default, WebPPL will return the last line of the function. +We use the `return` keyword for explicitness and clarity. + +The general form of a function expression is: `function(arguments){ body }`. +The first sub-expression of the function, the arguments, is a list of symbols that tells us what the inputs to the function will be called; the second sub-expression, the body, tells us what to do with these inputs. +The value which results from a function is called a *compound procedure*. +When a compound procedure is applied to input values (e.g. when `double` was applied to `3`) we imagine identifying (also called *binding*) the argument variables with these inputs, then evaluating the body. + +In functional programming, we can build procedures that manipulate any kind of value---even other procedures. +Here we define a function `twice` which takes a procedure and returns a new procedure that applies the original twice: + +~~~~ +var double = function(x) { return x + x } + +var twice = function(f) { + return function(x) { + return f(f(x)) + } +} + +var twiceDouble = twice(double) + +twiceDouble(3) + +// same as: twice(double)(3) +~~~~ + +When functions take other functions as arguments, that is called a higher-order function + +# Higher-Order Functions + +Higher-order functions can be used to represent common patterns of computation. 
+Several such higher-order functions are provided in WebPPL. + +`map` is a higher-order function that takes a procedure and applies it to each element of a list. +For instance we could use map to test whether each element of a list of numbers is greater than zero: + +~~~~ +map(function(x){ + return x > 0 +}, [1, -3, 2, 0]) +~~~~ + +The `map` higher-order function can also be used to map a function of more than one argument over multiple lists, element by element. +For example, here is the MATLAB "dot-star" function (or ".*") written using `map2`, which maps over 2 lists at the same time: + +~~~~ +var dotStar = function(v1, v2){ + return map2( + function(x,y){ return x * y }, + v1, v2) +} + +dotStar([1,2,3], [4,5,6]) +~~~~ + +`repeat` is a built-in function that takes another function as an argument. It repeats it how many ever times you want: + +~~~~ +var g = function(){ return 8 } +repeat(100, g) +~~~~ + +Test your knowledge: [Exercises]({{site.baseurl}}/exercises/13-appendix-js-basics.html) diff --git a/chapters/conditional-dependence.md b/chapters/conditional-dependence.md old mode 100644 new mode 100755 index 5eca090..10a6243 --- a/chapters/conditional-dependence.md +++ b/chapters/conditional-dependence.md @@ -40,7 +40,42 @@ We see that `A` an `B` are statistically *independent* given knowledge of `C`. Screening off is a purely statistical phenomenon. For example, consider the the causal chain model, where A directly causes C, which in turn directly causes B. + +~~~~ +var BcondA = function(Aval) { + return Infer({method: 'enumerate'}, function() { + var A = flip() + var C = A ? flip(.1) : flip(.4) + var B = C ? flip(.5) : flip(.9) + condition(A == Aval) + return {B: B} + }) +} + +viz(BcondA(true)) +viz(BcondA(false)) +~~~~ + Here, when we observe C -- the event that mediates an *indirect* causal relation between A and B -- A and B are still causally dependent in our model of the world: it is just our beliefs about the states of A and B that become uncorrelated. 
+ +~~~~ +var BcondA = function(Aval) { + return Infer({method: 'enumerate'}, function() { + var A = flip() + var C = A ? flip(.1) : flip(.4) + var B = C ? flip(.5) : flip(.9) + condition(C) + condition(A == Aval) + return {B: B} + }) +} + +viz(BcondA(true)) +viz(BcondA(false)) + +~~~~ + + There is also an analogous causal phenomenon. If we can actually manipulate or *intervene* on the causal system, and set the value of C to some known value, then A and B become both statistically and causally independent (by intervening on C, we break the causal link between A and C). diff --git a/chapters/conditioning.md b/chapters/conditioning.md old mode 100644 new mode 100755 diff --git a/chapters/deepprbmods.md b/chapters/deepprbmods.md new file mode 100755 index 0000000..5437bbe --- /dev/null +++ b/chapters/deepprbmods.md @@ -0,0 +1,200 @@ +--- +layout: +title: Deep probabilistic models +description: Sketchy sketch + +--- + +Recall curve fitting with polynomials: + +~~~~ +// a0 + a1*x + a2*x^2 + ... 
+var makePoly = function(as) { + return function(x) { + return sum(mapIndexed(function(i,a) { return a * Math.pow(x, i) }, as)) + } +} + +var observedData = [{"x":-4,"y":69.76636938284166},{"x":-3,"y":36.63586217969598},{"x":-2,"y":19.95244368751754},{"x":-1,"y":4.819485497724985},{"x":0,"y":4.027631414787425},{"x":1,"y":3.755022418210824},{"x":2,"y":6.557548104903805},{"x":3,"y":23.922485493795072},{"x":4,"y":50.69924692420815}] + +var inferOptions = {method: 'optimize', samples: 100, steps: 2000, optMethod: {adam: {stepSize: .01}}} + +var post = Infer(inferOptions, + function() { + var coeffs = repeat(4, function() {return gaussian(0,2)}) + var order = discrete([0.25,0.25,0.25,0.25]) + var f = makePoly(coeffs.slice(0,order+1)) + + var obsFn = function(datum){ + observe(Gaussian({mu: f(datum.x), sigma: 30}), datum.y) + } + mapData({data: observedData}, obsFn) + + return {order: order, + coeffs: coeffs} + } +) + +print("observed data:") +viz.scatter(observedData) + +// viz.marginals(post) + +var xs = [-4,-3,-2,-1,0,1,2,3,4] +var postFnSample = function(){ + var p = sample(post) + return makePoly(p.coeffs.slice(0,p.order+1)) +} +viz.line(xs, map(postFnSample(), xs)) +viz.line(xs, map(postFnSample(), xs)) +viz.line(xs, map(postFnSample(), xs)) +~~~~ + +Now let's make the function a neural net: + +~~~~ +var dm = 10 //try changing this! 
+ +var makeFn = function(M1,M2,B1){ + return function(x){ + return T.toScalars( + // M2 * sigm(x * M1 + B1): + T.dot(M2,T.sigmoid(T.add(T.mul(M1,x),B1))) + )[0]} +} + +var observedData = [{"x":-4,"y":69.76636938284166},{"x":-3,"y":36.63586217969598},{"x":-2,"y":19.95244368751754},{"x":-1,"y":4.819485497724985},{"x":0,"y":4.027631414787425},{"x":1,"y":3.755022418210824},{"x":2,"y":6.557548104903805},{"x":3,"y":23.922485493795072},{"x":4,"y":50.69924692420815}] + +var inferOptions = {method: 'optimize', samples: 100, steps: 2000, optMethod: {adam: {stepSize: .01}}} + +var post = Infer(inferOptions, + function() { + var M1 = sample(DiagCovGaussian({mu: zeros([dm, 1]), sigma: ones([dm,1])})) + var B1 = sample(DiagCovGaussian({mu: zeros([dm, 1]), sigma: ones([dm,1])})) + var M2 = sample(DiagCovGaussian({mu: zeros([1, dm]), sigma: ones([1,dm])})) + + var f = makeFn(M1,M2,B1) + + var obsFn = function(datum){ + observe(Gaussian({mu: f(datum.x), sigma: 2}), datum.y) + } + mapData({data: observedData}, obsFn) + + return {M1: M1, M2: M2, B1: B1} + } +) + +print("observed data:") +viz.scatter(observedData) + +var xs = [-4,-3,-2,-1,0,1,2,3,4] +var postFnSample = function(){ + var p = sample(post) + return makeFn(p.M1,p.M2,p.B1) +} +viz.line(xs, map(postFnSample(), xs)) +viz.line(xs, map(postFnSample(), xs)) +viz.line(xs, map(postFnSample(), xs)) +~~~~ + +If we don't care much about the uncertainty in which function we learn, we can do MLE inference: + +~~~~ +var dm = 100 + +var makeFn = function(M1,M2,B1){ + return function(x){return T.toScalars(T.dot(M2,T.sigmoid(T.add(T.mul(M1,x),B1))))[0]} +} + +var observedData = [{"x":-4,"y":69.76636938284166},{"x":-3,"y":36.63586217969598},{"x":-2,"y":19.95244368751754},{"x":-1,"y":4.819485497724985},{"x":0,"y":4.027631414787425},{"x":1,"y":3.755022418210824},{"x":2,"y":6.557548104903805},{"x":3,"y":23.922485493795072},{"x":4,"y":50.69924692420815}] + +var inferOptions = {method: 'optimize', samples: 100, steps: 2000, optMethod: {adam: 
{stepSize: .01}}} + +var post = Infer(inferOptions, + function() { + var M1 = sample(DiagCovGaussian({mu: zeros([dm, 1]), sigma: ones([dm,1])}), { + guide: function() {return Delta({v: param({dims: [dm, 1]})})}}) + var B1 = sample(DiagCovGaussian({mu: zeros([dm, 1]), sigma: ones([dm,1])}), { + guide: function() {return Delta({v: param({dims: [dm, 1]})})}}) + var M2 = sample(DiagCovGaussian({mu: zeros([1, dm]), sigma: ones([1,dm])}), { + guide: function() {return Delta({v: param({dims: [1, dm]})})}}) + + var f = makeFn(M1,M2,B1) + + var obsFn = function(datum){ + observe(Gaussian({mu: f(datum.x), sigma: 2}), datum.y) + } + mapData({data: observedData}, obsFn) + + return {M1: M1, M2: M2, B1: B1} + } +) + +print("observed data:") +viz.scatter(observedData) + +var xs = [-4,-3,-2,-1,0,1,2,3,4] +var postFnSample = function(){ + var p = sample(post) + return makeFn(p.M1,p.M2,p.B1) +} +viz.line(xs, map(postFnSample(), xs)) +viz.line(xs, map(postFnSample(), xs)) +viz.line(xs, map(postFnSample(), xs)) +~~~~ + +Having shown that we can stick an unknown function in our supervised model, we can stick one anywhere in a generative model! + +Here we learn an unsupervised model of x,y pairs, which are generated from a latent z passed through a (learned) function. 
+ +~~~~ +var hd = 50 +var ld = 10 +var outSig = Vector([0.1, 0.1]) + +var makeFn = function(M1,M2,B1){ + return function(x){return T.dot(M2,T.sigmoid(T.add(T.dot(M1,x),B1)))} +} + +var observedData = [{"x":-4,"y":69.76636938284166},{"x":-3,"y":36.63586217969598},{"x":-2,"y":19.95244368751754},{"x":-1,"y":4.819485497724985},{"x":0,"y":4.027631414787425},{"x":1,"y":3.755022418210824},{"x":2,"y":6.557548104903805},{"x":3,"y":23.922485493795072},{"x":4,"y":50.69924692420815}] + +var inferOptions = {method: 'optimize', samples: 100, steps: 5000, optMethod: {adam: {stepSize: .01}}} + +var post = Infer(inferOptions, + function() { + var M1 = sample(DiagCovGaussian({mu: zeros([hd,ld]), sigma: ones([hd,ld])}), { + guide: function() {return Delta({v: param({dims: [hd, ld]})})}}) + var B1 = sample(DiagCovGaussian({mu: zeros([hd, 1]), sigma: ones([hd,1])}), { + guide: function() {return Delta({v: param({dims: [hd, 1]})})}}) + var M2 = sample(DiagCovGaussian({mu: zeros([2,hd]), sigma: ones([2,hd])}), { + guide: function() {return Delta({v: param({dims: [2,hd]})})}}) + + var f = makeFn(M1,M2,B1) + + var obsFn = function(datum){ + var z = sample(DiagCovGaussian({mu: zeros([ld, 1]), sigma: ones([ld,1])})) +// print(datum) +// print(T.toScalars(z)) +// print(T.toScalars(f(z))) +// print(" ") + observe(DiagCovGaussian({mu: f(z), sigma: outSig}), Vector([datum.x, datum.y])) + } + mapData({data: observedData}, obsFn) + + return {M1: M1, M2: M2, B1: B1} + } +) + +print("observed data:") +viz.scatter(observedData) + +var postSample = function(){ + var p = sample(post) + var f = makeFn(p.M1,p.M2,p.B1) + var z = sample(DiagCovGaussian({mu: zeros([ld, 1]), sigma: ones([ld,1])})) + var s = T.toScalars(f(z)) + return {x:s[0], y:s[1]} +} + +viz.scatter(repeat(10, postSample)) +~~~~ \ No newline at end of file diff --git a/chapters/dependence.md b/chapters/dependence.md index 7fce5c3..33ff60d 100644 --- a/chapters/dependence.md +++ b/chapters/dependence.md @@ -231,7 +231,7 @@ The figure below 
defines a Bayesian network for the medical diagnosis example. The graph contains a node for each `var` statement in our WebPPL program, with links to that node from each variable that appears in the assignment expression. There is a probability table ("CPT") for each node, with a column for each value of the variable, and a row for each combination of values for its parents in the graph. -![A Bayes net for the medical diagnosis example.]({{site.baseurl}}/assets/img/Med-diag-bnet1.jpg) +![A Bayes net for the medical diagnosis example.](../assets/img/Med-diag-bnet1.jpg) Simple generative models will have a corresponding graphical model that captures all of the dependencies (and *in*dependencies) of the model, without capturing the precise *form* of these functions. For example, while the graphical model shown above faithfully represents the probability distribution encoded by the WebPPL program, it captures the *noisy-OR* form of the causal dependencies only implicitly. @@ -244,6 +244,13 @@ As we will see, this has important implications for the ability to learn the val More complicated generative models, which can be expressed as probabilistic programs, often don't have such a graphical model (or rather they have many approximations, none of which captures all independencies). Recursive models generally give rise to such ambiguous (or loopy) Bayes nets. +## Mem and Plate Notation + +If the same variable gets reused within a model (e.g., because of a memoized function), it is often useful to use plate notation. For example: + +![A Bayes net with plate notation.](../assets/img/plate_notation.png) + +In this simple model, `cough` depends on `cold` which depends on some prior $$\alpha$$. However, the value of `cough` and `cold` is determined independently for each patient `s`, which is what we want. 
Test your knowledge: [Exercises]({{site.baseurl}}/exercises/dependence.html) diff --git a/chapters/generative-models.md b/chapters/generative-models.md old mode 100644 new mode 100755 diff --git a/chapters/hierarchical-models.md b/chapters/hierarchical-models.md old mode 100644 new mode 100755 diff --git a/chapters/inference-algorithms.md b/chapters/inference-algorithms.md index 5563263..cb6d55f 100644 --- a/chapters/inference-algorithms.md +++ b/chapters/inference-algorithms.md @@ -52,7 +52,7 @@ var time = function(foo, trials) { time(infModel, 10) ~~~~ -Even for this simple program, lowering the baserate by just one order of magnitude, to $$0.01$$, will make rejection sampling impractical. +Even for this simple program, lowering the baserate by just one order of magnitude, to $$0.01$$, dramatically increases the amount of time the program takes (try it). Another option that we've seen before is to enumerate all of the possible executions of the model, using the rules of probability to calculate the conditional distribution: @@ -81,7 +81,7 @@ var infModel = function(){ time(infModel, 10) ~~~~ -Notice that the time it takes for this program to run doesn't depend on the baserate. Unfortunately it does depend critically on the number of random choices in an execution history: the number of possible histories that must be considered grows exponentially in the number of random choices. To see this we modify the model to allow a flexible number of `flip` choices: +Notice that the time it takes for this program to run doesn't depend on the baserate (use the code above to prove this to yourself). Unfortunately it does depend critically on the number of random choices in an execution history: the number of possible histories that must be considered grows exponentially in the number of random choices. 
To see this we modify the model to allow a flexible number of `flip` choices: ~~~~ ///fold: @@ -107,6 +107,8 @@ var infModel = function(){ time(infModel, 10) ~~~~ +Try trippling the number of flips. You should see that this increases the runtime by about 30x. + The dependence on size of the execution space renders enumeration impractical for many models. In addition, enumeration isn't feasible at all when the model contains a continuous distribution (because there are uncountably many value that would need to be enumerated). Try inserting `var x = gaussian(0,1)` in the above model. There are many other algorithms and techniques for probabilistic inference, reviewed below. They each have their own performance characteristics. For instance, *Markov chain Monte Carlo* inference approximates the posterior distribution via a random walk (described in detail below). @@ -135,8 +137,7 @@ var infModel = function(){ time(infModel, 10) ~~~~ -See what happens in the above inference as you lower the baserate. Unlike rejection sampling, inference will not slow down appreciably (but results will become less stable). Unlike enumeration, inference should also not slow down exponentially as the size of the state space is increased. -This is an example of the kind of trade offs that are common between different inference algorithms. +See what happens in the above inference as you lower the baserate. Unlike rejection sampling, inference slows down only moderately (but results will become less stable). Unlike enumeration, tripling the number of flips has only a mild impact on runtime (try it). This is an example of the kind of trade offs that are common between different inference algorithms. The varying performance characteristics of different algorithms for (approximate) inference mean that getting accurate results for complex models can depend on choosing the right algorithm (with the right parameters). 
In what follows we aim to gain some intuition for how and when algorithms work, without being exhaustive. @@ -408,7 +409,7 @@ To construct a Markov chain that converges to a stationary distribution of inter Fortunately, it turns out that for any given (conditional) distribution there are Markov chains with a matching stationary distribution. There are a number of methods for finding an appropriate Markov chain. One particularly common method is *Metropolis Hastings* recipe. To create the necessary transition function, we first create a *proposal distribution*, $$q(x\rightarrow x')$$, which does not need to have the target distribution as its stationary distribution, but should be easy to sample from (otherwise it will be unwieldy to use!). A common option for continuous state spaces is to sample a new state from a multivariate Gaussian centered on the current state. To turn a proposal distribution into a transition function with the right stationary distribution, we either accepting or reject the proposed transition with probability: $$\min\left(1, \frac{p(x')q(x'\rightarrow x)}{p(x)q(x\rightarrow x')}\right).$$ -That is, we flip a coin with that probability: if it comes up heads our next state is $x'$, otherwise our next state is still $$x$$. +That is, we flip a coin with that probability: if it comes up heads our next state is $$x'$$, otherwise our next state is still $$x$$. Such a transition function not only satisfies the *balance condition*, it actually satisfies a stronger condition, *detailed balance*. Specifically, $$p(x)\pi(x \rightarrow x') = p(x')\pi(x' \rightarrow x)$$. (To show that detailed balance implies balance, substitute the right-hand side of the detailed balance equation into the balance equation, replacing the summand, and then simplify.) It can be shown that the *Metropolis-hastings algorithm* gives a transition probability (i.e. $$\pi(x\rightarrow x')$$) that satisfies detailed balance and thus balance. 
(Recommended exercise: prove this fact. Hint: the probability of transitioning depends on first proposing a given new state, then accepting it; if you don't accept the proposal you "transition" to the original state.) @@ -547,6 +548,8 @@ A particle filter -- also known as [Sequential Monte Carlo](http://docs.webppl.o The particles are "re-sampled" upon encountering new evidence, in order to adjust the numbers so that the population will be approximately distributed according to the model. SMC is particularly useful for models where beliefs can be incrementally updated as new observations come in. +It is recommended that you watch [Particle Filters Explained without Equations](https://www.youtube.com/watch?v=aUkBa1zMKv4) before continuing on, in order to develop some intuitions for how particle filters work. + Let's consider another simple model, where five real numbers are constrained to be close to their neighbors: ~~~~ @@ -917,8 +920,11 @@ Again, the actual trajectory is in green, the observations are in grey, and the # Variational Inference -The previous parts of this chapter focused on Monte Carlo methods for approximate inference: algorithms that generate a (large) collection of samples to represent a conditional distribution. -Another way to represent a distribution is by finding the closest approximation among a set (or "family") of simpler distributions. This is the approach taken by *variational inference*. At a high level, we declare a set of models that have the same choices as our target model, but don't have any conditions (i.e. no `condition`, `observe`, or `factor`); we then try to find the member of this set closest to our target model and use it as the result of `Infer`. +The previous parts of this chapter focused on Monte Carlo methods for approximate inference: algorithms that generate a (large) collection of samples to represent a conditional distribution. 
An advantage of this method is that it is guaranteed to give you the right answer in the long run. A disadvantage is that the long run is a very long time (potentially long after the heat death of the universe). Even in the best cases, Monte Carlo methods tend to be computationally intensive and slow. + +*Variational inference* involves reprsenting the distribution you want (the probability distribution) by finding the closest approximation among a set (or "family") of simpler distributions. This is generally much faster, though with the disadvantage that the answer is often guaranteed to be wrong. (Since Monte Carlo is only guaranteed to be correct in the long run, in practice this difference is not always that meaningful.) + +At a high level, we declare a set of models that have the same choices as our target model, but don't have any conditions (i.e. no `condition`, `observe`, or `factor`); we then try to find the member of this set closest to our target model and use it as the result of `Infer`. To search for a good approximating model, we will eventually use gradient-based techniques. For this reason, we don't want a set of isolated models, but a continuous family. In WebPPL we declare parameters of a family with `param()`. For instance, here is a family of Gaussian distributions with fixed variance but different means: diff --git a/chapters/introduction.md b/chapters/introduction.md old mode 100644 new mode 100755 diff --git a/chapters/learning-as-conditional-inference.md b/chapters/learning-as-conditional-inference.md index 0e3d3d5..7320ecb 100644 --- a/chapters/learning-as-conditional-inference.md +++ b/chapters/learning-as-conditional-inference.md @@ -60,9 +60,11 @@ Try varying the number of flips and the number of heads observed. 
You should be When exploring learning as a conditional inference, we are particularly interested in the dynamics of how inferred hypotheses change as a function of amount of data (often thought of as time the learner spends acquiring data). We can map out the *trajectory* of learning by plotting a summary of the posterior distribution as a function of the amount of observed data. Here we plot the expectation that the coin is fair in the above example: ~~~~ +var fairPrior = .999 + var fairnessPosterior = function(observedData) { return Infer({method: 'enumerate'}, function() { - var fair = flip(0.999) + var fair = flip(fairPrior) var coin = Bernoulli({p: fair ? 0.5 : 0.95}) var obsFn = function(datum){observe(coin, datum == 'h')} mapData({data: observedData}, obsFn) diff --git a/chapters/mixture-models.md b/chapters/mixture-models.md old mode 100644 new mode 100755 index 4185833..b822c6b --- a/chapters/mixture-models.md +++ b/chapters/mixture-models.md @@ -483,4 +483,6 @@ Notice that unlike the unbounded mixture model case above, we were able to use M +Reading & Discussion: [Readings]({{site.baseurl}}/readings/mixture-models.html) + Test your knowledge: [Exercises]({{site.baseurl}}/exercises/mixture-models.html) diff --git a/chapters/occams-razor.md b/chapters/occams-razor.md old mode 100644 new mode 100755 diff --git a/chapters/process-models.md b/chapters/process-models.md index cebe2d3..87de786 100644 --- a/chapters/process-models.md +++ b/chapters/process-models.md @@ -1,7 +1,7 @@ --- layout: chapter title: Rational process models -description: The psychological reality of inference algorithms. +description: Inference in the real world. chapter_num: 8 custom_js: - assets/js/box2d.js @@ -13,6 +13,12 @@ custom_css: - /assets/css/draw.css --- +A major challenge to Bayesian models is that Bayesian inference is often computationally difficult. 
The most straightforward interpretation of Bayesian models at the algorithmic level is that explicit probabilities for different states are computed and represented. (This is very much like the 'enumerate' method of Infer.) Attempts have been made to model how neural systems might capture these representations, via ideas such as *probabilistic population codes*. (See [Bayesian inference with probabilistic population codes](https://www.nature.com/articles/nn1790), Ma, Beck, Latham, Pouget (2006).) While such methods work well for simple problems, it is difficult to see how they can scale up to real-world cognition. + + + - # How is uncertainty represented? A signature of probabilistic ("Bayesian") cognitive models is the central role of uncertainty. Generative models, our main notion of knowledge, capture uncertain causal processes. After making observations or assumptions, Infer captures uncertain answers. At the computational level we work with this uncertainty by manipulating distribution objects, without needing to explore (much) how they are created or represented. Yet cognitively there is a key algorithmic question: how is uncertainty represented in the human mind? @@ -43,20 +39,18 @@ We have at least three very different possible answers to this question: - Explicit representation of probabilities. - Parametric representation of distribution families. - Sampling-based representations. +--> + + - one and done. + - drift diffusion? + - mcmc / anchoring. + - amortized inference. + - neural implementation. -## Approximate distribution representations + --> -Another possible representation of uncertainty is via the parameters of a family of distributions. For instance, the mean and covariance of a Gaussian is a flexible and popular (in statistics) way to approximate a complex distirbution. (Indeed, we have seen that a mean-field product of Gaussians can give quick and useful inference result from variational inference.) 
It is thus possible that all uncertainty is represented in the human mind as parameters of some family. A version of this idea can be seen in the *free energy* hypothesis. (See [The free-energy principle: a unified brain theory?](https://www.nature.com/articles/nrn2787), Friston (2010).) ## The sampling hypothesis @@ -87,8 +81,144 @@ viz(repeat(100,sampleAgent)) The maximizing agent chooses the most likely outcome by examining the conditional probability they assign to outcomes -- the result is all such agents choosing 'true'. In contrast, a population of agents that each represents their belief with a single sample will choose 'false' about 30% of the time. This behavioral signature -- *probability matching* -- is in fact a very old and well studied psychological phenomenon. (See for instance, Individual Choice Behavior: A Theoretical Analysis, Luce (1959).) +### How many samples should you take? + Vul, Goodman, Griffiths, Tenenbaum (2014) further ask how many samples a rational agent *should* use, if they are costly. This analysis explores the trade off between expected reward increase from more precise probability estimates (more samples) with resource savings from less work (fewer samples). The, somewhat surprising, result is that for a wide range of cost and reward assumptions it is optimal to decide based on only one, or a few, samples. +Let's use our favorite example: flipping a coin. Suppose this is a trick coin with known weight `w`. Our job is to correctly guess the outcome of the next flip of the coin. 
+ +If we want to maximize, we obviously should just round: if `w >= .5` we should guess `heads`; otherwise, `tails` (although, as we just discussed, humans often probability-match rather than maximize, here we're interested in what would be optimal, so we will maximize): + +~~~~ +Infer({method: "forward", samples: 5000}, function(){ + var w = sample(Uniform({a: 0, b: 1})) //true weight + return (flip(w) == (w >= .5)) +}) +~~~~ + +We can win this bet around 75% of the time. + +However, let's assume for the moment that we can't easily calculate the optimal strategy. (For most non-trivial problems, we can't.) Instead, we sample the distribution of heads for our coin. If most of those samples come up `heads`, then we bet `heads`; otherwise, `tails': + +~~~~ +var takesamples = function(nsamples){ + var w = sample(Uniform({a: 0, b: 1})) //true weight + var samples = Infer({method: "forward", samples:nsamples}, function(){ + return flip(w) + }) + return(flip(w) == samples.MAP().val) +} + +Infer({method: "forward", samples: 1000}, function(){takesamples(1000)}) +~~~~ + +Here, we took 1,000 samples. Not surprisingly, we win our bet nearly 75\% of the time. But what happens if we only take 10 samples? + +~~~~ +var takesamples = function(nsamples){ + var w = sample(Uniform({a: 0, b: 1})) //true weight + var samples = Infer({method: "forward", samples:nsamples}, function(){ + return flip(w) + }) + return(flip(w) == samples.MAP().val) +} + +Infer({method: "forward", samples: 5000}, function(){takesamples(10)}) +~~~~ + +Impressively, we're still very close to 75%. What if we only took 1 sample? + +~~~~ +var takesamples = function(nsamples){ + var w = sample(Uniform({a: 0, b: 1})) //true weight + var samples = Infer({method: "forward", samples:nsamples}, function(){ + return flip(w) + }) + return(flip(w) == samples.MAP().val) +} + +Infer({method: "forward", samples: 5000}, function(){takesamples(1)}) +~~~~ + +We are still winning around 2/3 of the time. 
Obviously, if we have the computational power available and enough time to take the samples, we should take 1,000 samples and maximize our chances of winning. But if samples are costly, it may not be worth taking more than 1. + +### Rational Anchoring + +One well-known bias in human reasoning is the anchoring bias discovered by Kahnamen and Tversky. In the original study, subjects were first asked to judge whether the percentage of African countries in the United Nations was larger or smaller than a number that was randomly generated by spinning a wheel of fortune. Subjects were then asked to guess the correct percentage. What was striking was that subjects were systematically biased towards the random number: their guesses were larger when the random number was large and smaller when the random number was smaller. This same "anchoring" bias has appeared in many other contexts, and it is often given as a paradigmatic case of irrationality on the part of humans. + +In "[The anchoring bias reflects rational use of cognitive resources](https://link.springer.com/article/10.3758/s13423-017-1286-8)", Lieder and colleagues suggest a resource-rational account, based on the dynamics of MCMC. Suppose that in the class study, subjects use the random number as their initial proposal for an MCMC-like process. (There are any number of reasons people might do this.) For instance, perhaps they consider whether it seems correct. If it doesn't, they propose a new number *based on the first number* and check again. And so on. The dynamics of this process look a lot like MCMC. + +As we've seen previously, initial samples from MCMC are strongly biased by the starting point. If you run MCMC long enough, this bias disappears. However, an agent that is being resource-rational may not take a large number of samples, particularly in the context of being asked trivia questions by a random psychologist. 
+ +### Inferring human optimality from data + +In "[Rational Use of Cognitive Resources: Levels of Analysis Between the Computational and the Algorithmic](https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/tops.12142)", Griffiths, Lieder, and Goodman note that Bayesian models provide a convenient way of analyzing behavior. That is, we can ask not just whether human cognition is perfectly optimal (it probably isn't), but *how* optimal is it. + +Let's return to our cross-situational learning model from the exercises for the "Algorithms for Inference" chapter. Below, it has been refactored so that the number of particles used by human cognition is a free variable. Now, we assume that 10 subjects have all participated in the study, and all of them concluded that the word for dog is "dax". If you run the following code (**warning: it will take a few minutes**), it'll infer the number of particles is probably fairly large. + +~~~~ +var names = ["dax", "blicket", "gorper", "greeble", "freeble"] + +var objName = mem(function(obj) { + sample(Categorical({vs: names, ps: [.2, .2, .2, .2, .2]})) +}) + +var nameOne = function(obj1, obj2){ + return flip() ? 
objName(obj1) : objName(obj2) +} + +var clmodel = function() { + var dog = objName("dog") + var cat = objName("cat") + factor(2*(nameOne("dog","cat") == "dax")) + var bird = objName("bird") + factor(2*(nameOne("dog","bird") == "blicket")) + var cow = objName("cow") + factor(2*(nameOne("dog","cow") == "greeble")) + var platypus = objName("platypus") + factor(2*(nameOne("dog","platypus") == "freeble")) + var ostrich = objName("platypus") + factor(2*(nameOne("dog","ostrich") == "dax")) + return objName("dog") +} + +var experiment = Infer({method: "MCMC", samples:250, lag:10}, function(){ + var npart = sample(RandomInteger({n:50}))+1 + + //5 subjects, all conclude dax=dog + var sub1 = Infer({method: "SMC", particles: npart, rejuvSteps: 10}, clmodel).MAP().val + var sub2 = Infer({method: "SMC", particles: npart, rejuvSteps: 10}, clmodel).MAP().val + var sub3 = Infer({method: "SMC", particles: npart, rejuvSteps: 10}, clmodel).MAP().val + var sub4 = Infer({method: "SMC", particles: npart, rejuvSteps: 10}, clmodel).MAP().val + var sub5 = Infer({method: "SMC", particles: npart, rejuvSteps: 10}, clmodel).MAP().val + + factor(5*(sub1 == 'dax')); + factor(5*(sub2 == 'dax')); + factor(5*(sub3 == 'dax')); + factor(5*(sub4 == 'dax')); + factor(5*(sub5 == 'dax')); + + return npart +}) + +viz(experiment) +~~~~ + +Output from one run looked like this: + +![Inferred number of particles](../assets/img/particles_1.svg) + +This seems reasonable. We know that if we are using particle filtering, accuracy goes up the more particles one has. Since all five subjects gave the right answer, that suggests a decent number of particles. + +Suppose our subjects weren't so accuracy. We can rewrite the code above so that each subject comes to a different conclusion about the word for 'dog': one subject concludes 'dax', one concludes 'greeble', one concludes 'freeble', one concludes 'blicket', and one concludes 'gorper'. 
In this case, we'll infer the number of particles is probably a lot smaller: + +![Inferred number of particles](../assets/img/particles_2.svg) + +Suppose we fit this model to a real dataset and found the best estimate for number of particles is 7. That would not necessarily mean that humans in fact use particle filtering with approximately 7 particles to learn vocabulary. Rather, it means that human level of accuracy can be captured by such a model. It further suggests that humans are fairly resource-limited in our ability to do cross-situational learning, since accuracy with only 7 particles is not going to be very high. (One could quantify just how limited it is by seeing how accuracy is affected by number of particles in a realistic learning scenario.) + +## Approximate distribution representations + +Another possible representation of uncertainty is via the parameters of a family of distributions. For instance, the mean and covariance of a Gaussian is a flexible and popular (in statistics) way to approximate a complex distirbution. (Indeed, we have seen that a mean-field product of Gaussians can give quick and useful inference result from variational inference.) It is thus possible that all uncertainty is represented in the human mind as parameters of some family. A version of this idea can be seen in the *free energy* hypothesis. (See [The free-energy principle: a unified brain theory?](https://www.nature.com/articles/nrn2787), Friston (2010).) This would of course come with its own set of tradeoffs between accuracy and cost. diff --git a/exercises/03-conditioning.md b/exercises/03-conditioning.md new file mode 100755 index 0000000..2425d16 --- /dev/null +++ b/exercises/03-conditioning.md @@ -0,0 +1,766 @@ +--- +layout: exercise +title: Conditioning - exercises +custom_js: +- assets/js/box2d.js +- assets/js/physics.js +--- + +## Exercise 1: Fair coins and biased coins + +### a) + +I flip a fair coin. What is the probability that it lands heads? 
+ +~~~~ +var model = function() { + // Your code here +} +var log_prob = Infer({method:'enumerate'}, model).score('H') +Math.exp(log_prob) +~~~~ + +### b) + +I also have a biased coin, with $$P(\text{heads})=0.9$$. +I hand you one of the coins (either biased or fair) without telling you which. +You flip it three times. + +Given that first two coin flips landed on heads, what is the posterior distribution for the next flip? + +~~~~ +var model = function() { + // Your code here +} +viz(Infer({method:'enumerate'}, model)) +~~~~ + +### c) + +Given that all three flips landed on heads, what is the probability that the coin was biased? + +### d) + +Given that the first two flips were different, what is the probability that the third flip will be heads? + +## Exercise 2: Conditioning and Intervention + +In the earlier [Medical Diagnosis]({{site.baseurl}}/chapters/02-generative-models.html#example-causal-models-in-medical-diagnosis) section we suggested understanding the patterns of symptoms for a particular disease by changing the prior probability of the disease such that it is always true (also called the *do* operator). + +~~~~ +var lungCancer = flip(0.01); +var cold = flip(0.2); +var cough = ( + (cold && flip(0.5)) || + (lungCancer && flip(0.3)) +) +cough; +~~~~ + +### a) + +For this example, does intervening on the program in this way (e.g. by setting the value of `lungCancer`) have the same effect as *conditioning* on the disease being true? What about the casual dependency makes this the case? + +### b) + +Why would intervening have a different effect than conditioning for more general hypotheticals? Construct an example where they differ. Then translate this into a WebPPL model and show that manipulating the prior gives different answers than manipulating the observation. *Hint:* think about the effect that intervening vs. conditioning on a variable that has a **causal parent** would have on that parent. 
+ +~~~~ + +~~~~ + +## Exercise 3: Computing marginals + +Use the rules for computing probabilities to compute the marginal distribution on return values from these programs by hand (use `viz()` to check your answers): + +### a) + +~~~~ +Infer({method: "enumerate"}, function() { + var a = flip(); + var b = flip(); + condition(a || b); + return a; +}) +~~~~ + +### b) + +~~~~ +var smilesModel = function() { + var nice = mem(function(person) {return flip(.7)}); + var smiles = function(person) {return nice(person) ? flip(.8) : flip(.5);} + condition(smiles('alice') && smiles('bob') && smiles('alice')); + return nice('alice'); +} + +Infer({method: "enumerate"}, smilesModel) +~~~~ + +## Exercise 4: Extending the smiles model + +### a) + +Describe (using ordinary English) what the second WebPPL program, `smilesModel` above means. + +### b) + +Extend `smilesModel` to create a version of the model that also captures these two intuitions: + +1. people are more likely to smile if they want something and +2. *nice* people are less likely to want something. + +Note: Do not lose the fact that niceness is also a risk factor for smiling. + +*Hint:* Which variables change at different times for the same person? +Which values *depend* on other values? + +~~~~ +var extendedSmilesModel = function() { + var nice = mem(function(person) {return flip(.7)}); + + ... + + var smiles = function(person, ...) { + return nice(person) ? flip(.8) : flip(.5); + } + + return smiles('alice') +} + +Infer({method: "enumerate"}, extendedSmilesModel) +~~~~ + +### c) + +Suppose you've seen Bob five times this week and each time, he was not smiling. But today, you see Bob and he *is* smiling. +Use this `extendedSmilesModel` model to compute the posterior belief that Bob wants something from you today. + +*Hint:* How will you represent the same person (Bob) smiling *multiple times*? +What features of Bob will stay the same each time he smiles (or doesn't) and what features will change? 
+ +In your answer, show the WebPPL inference and a histogram of the answers -- in what ways do these answers make intuitive sense or fail to? + +~~~~ +var extendedSmilesModel = function() { + // copy your code frome above + + // make the appropriate observations + + // return the appropriate query + return ...; +} + + +Infer({method: "enumerate"}, extendedSmilesModel) +~~~~ + + +Question 5: Sprinklers, Rain and mem + +### a) + +I have a particularly bad model of the sprinkler in my garden. +It is supposed to water my grass every morning, but is turns on only half the time (at random, as far as I can tell). +Fortunately, I live in a city where it also rains 30% of days. + +One day I check my lawn and see that it is wet, meaning that either it rained that morning or my sprinkler turned on (or both). + +Answer the following questions, either using the Rules of Probability or by writing your own sprinkler model in webppl. + +* What is the probability that it rained? +* What is the probability that my sprinkler turned on? + +~~~~ + +~~~~ + +### c) + +My neighbour Kelsey, who has the same kind of sprinkler, tells me that her lawn was also wet that same morning. +What is the new posterior probability that it rained? + +~~~~ + +~~~~ + +### d) + +To investigate further we poll a selection of our friends who live nearby, and ask if their grass was wet this morning. +Kevin and Manu and Josh, each with the same sprinkler, all agree that their lawns were wet too. +Using `mem`, write a model to reason about arbitrary numbers of people, and then use it to find the new probability that it rained. + +~~~~ + +~~~~ + + +## Exercise 5: Casino game + +Consider the following game. +A machine randomly gives Bob a letter of the word "game"; it gives a, e (the vowels) with probability 0.45 each and the remaining letters (the consonants g, m) with probability 0.05 each. +The probability that Bob wins depends on which letter he got. 
+Letting $$h$$ denote the letter and letting $$Q(h)$$ denote the numeric position of that letter in the word "game" (e.g., $$Q(\text{g}) = 1, Q(\text{a}) = 2$$, and so on), the probability of winning is $$1/Q(h)^2$$. + +Suppose that we observe Bob winning but we don't know what letter he got. +How can we use the observation that he won to update our beliefs about which letter he got? +Let's express this formally. +Before we begin, a bit of terminology: the set of letters that Bob could have gotten, $$\{g, a, m, e\}$$, is called the *hypothesis space* -- it's our set of hypotheses about the letter. + +### a) + +In English, what does the posterior probability $$p(h \mid \text{win})$$ represent? + +### b) + +Manually compute $$p(h \mid \text{win})$$ for each hypothesis. +Remember to normalize --- make sure that summing all your $$p(h \mid \text{win})$$ values gives you 1. + +| $$h$$ | $$p(h)$$ | $$p(\text{win}\mid h)$$ | $$p(h \mid \text{win})$$ | +| ----- | -------- | ------------------------ |------------------------- | +| g | 0.05 | | | +| a | 0.45 | | | +| m | 0.05 | | | +| e | 0.45 | | | + + + +### d) + + +Now, we're going to write this model in WebPPL using `Infer`. Here is some starter code for you: + +~~~~ +// define some variables and utility functions +var checkVowel = function(letter) {return _.includes(['a', 'e', 'i', 'o', 'u'], letter);} +var letterVals = ['g', 'a', 'm', 'e']; +var letterProbs = map(function(letter) {return checkVowel(letter) ? 0.45 : 0.05;}, letterVals); +var letters = Categorical({vs: letterVals, ps: letterProbs}) + +// Compute p(h | win) +var distribution = Infer({method: 'enumerate'}, function() { + var letter = sample(letters); + var position = letterVals.indexOf(letter) + 1; + var winProb = 1 / Math.pow(position, 2); + condition(...) + return ... +}); +viz.auto(distribution); +~~~~ + +Fill in the `...`'s in the code to compute $$p(h \mid \text{win})$$. +Include a screenshot of the resulting graph. 
+What letter has the highest posterior probability? +In English, what does it mean that this letter has the highest posterior? +It might be interesting to comment out the `condition` statement so you can compare visually the prior (no `condition` statement) to the posterior (with `condition`). + +Make sure that your WebPPL answers and hand-computed answers agree -- note that this demonstrates the equivalence between the program view of conditional probability and the distributional view. + +### e) + +Which is higher, $$p(\text{vowel} \mid \text{win})$$ or $$p(\text{consonant} \mid \text{win})$$? +Answer this using the WebPPL code you wrote *Hint:* use the `checkVowel` function. + +~~~~ +// define some variables and utility functions +var checkVowel = function(letter) {return _.includes(['a', 'e', 'i', 'o', 'u'], letter);} +var letterVals = ['g', 'a', 'm', 'e']; +var letterProbs = map(function(letter) {return checkVowel(letter) ? 0.45 : 0.05;}, letterVals); +var letters = Categorical({vs: letterVals, ps: letterProbs}) + +// Compute p(h | win) +var distribution = Infer({method: 'enumerate'}, function() { + var letter = sample(letters); + var position = letterVals.indexOf(letter) + 1; + var winProb = 1 / Math.pow(position, 2); + condition(...) + return ... +}); +viz.auto(distribution); +~~~~ + +### f) + +What difference do you see between your code and the mathematical notation? +What are the advantages and disadvantages of each? +Which do you prefer? + + + + + + + + diff --git a/exercises/04-patterns-of-inference.md b/exercises/04-patterns-of-inference.md new file mode 100755 index 0000000..a54d04f --- /dev/null +++ b/exercises/04-patterns-of-inference.md @@ -0,0 +1,82 @@ +--- +layout: exercise +title: Patterns of inference - exercises +--- + +## Exercise 1: Causal and statistical dependency. + +For each of the following programs: + +* Draw the dependency diagram (Bayes net), including the probability tables (see example in chapter). 
If you don't have software on your computer for doing this, Google Docs has a decent interface for creating drawings. + +* Use informal evaluation order reasoning and the intervention method to determine causal dependency between A and B. + +* Use conditioning to determine whether A and B are statistically dependent. + +a) + +~~~~ +var a = flip() +var b = flip() +var c = flip(a && b ? .8 : .5) +~~~~ + +b) + +~~~~ +var a = flip() +var b = flip(a ? .9 : .2) +var c = flip(b ? .7 : .1) +~~~~ + +c) + +~~~~ +var a = flip() +var b = flip(a ? .9 : .2) +var c = flip(a ? .7 : .1) +~~~~ + +d) + +~~~~ +var a = flip(.6) +var c = flip(.1) +var z = flip() ? a : c; +var b = z ? 'foo' : 'bar' +~~~~ + +e) + +You do not need to include probability tables for this one. + +~~~~ +var examFairPrior = Bernoulli({p: .8}) +var doesHomeworkPrior = Bernoulli({p: .8}) +var examFair = mem(function(exam) {return sample(examFairPrior)}) +var doesHomework = mem(function(student) {return sample(doesHomeworkPrior)}); + +var pass = function(student, exam) { + return flip(examFair(exam) ? + (doesHomework(student) ? .9 : .5) : + (doesHomework(student) ? .2 : .1)); +} +var a = pass('alice', 'historyExam'); +var b = pass('bob', 'historyExam'); +~~~~ + +## Exercise 2: Epidemiology + +Imagine that you are an epidemiologist and you are determining people's cause of death. In this simplified world, there are two main diseases, cancer and the common cold. People rarely have cancer, $$p( \text{cancer}) = 0.00001$$, but when they do have cancer, it is often fatal, $$p( \text{death} \mid \text{cancer} ) = 0.9$$. People are much more likely to have a common cold, $$p( \text{cold} ) = 0.2$$, but it is rarely fatal, $$p( \text{death} \mid \text{cold} ) = 0.00006$$. Very rarely, people also die of other causes $$p(\text{death} \mid \text{other}) = 0.000000001$$. 
+ +Write this model in WebPPL and use `Infer` to answer these questions (Be sure to include your code in your answer): + +~~~~ +Infer({method: 'enumerate'}, function() { + ... +}); +~~~~ + +a) Compute $$p( \text{cancer} \mid \text{death} , \text{cold} )$$ and $$p( \text{cancer} \mid \text{death} , \text{no cold} )$$. How do these probabilities compare to $$p( \text{cancer} \mid \text{death} )$$ and $$p( \text{cancer} )$$? Using these probabilities, give an example of explaining away. + +b) Compute $$p( \text{cold} \mid \text{death} , \text{cancer} )$$ and $$p( \text{cold} \mid \text{death} , \text{no cancer} )$$. How do these probabilities compare to $$p( \text{cold} \mid \text{death} )$$ and $$p( \text{cold} )$$? Using these probabilities, give an example of explaining away. diff --git a/exercises/04.1-agents-as-programs.md b/exercises/04.1-agents-as-programs.md old mode 100644 new mode 100755 diff --git a/exercises/05.1-sequential-decisions.md b/exercises/05.1-sequential-decisions.md old mode 100644 new mode 100755 diff --git a/exercises/06-inference-about-inference.md b/exercises/06-inference-about-inference.md new file mode 100755 index 0000000..1bbe18b --- /dev/null +++ b/exercises/06-inference-about-inference.md @@ -0,0 +1,116 @@ +--- +layout: exercise +title: Inference about inference - exercises +--- + +## Exercise 1: Tricky Agents + +What would happen if Sally knew you were watching her and wanted to deceive you? + +a) Complete the code below so that `chooseAction` chooses a misdirection if Sally is deceptive. Then describe and show what happens if you knew Sally was deceptive and chose action "b". + +~~~~ +var actionPrior = Categorical({vs: ['a', 'b', 'c'], ps: [1/3, 1/3, 1/3]}); +var foodPrior = Categorical({vs: ['bagel', 'cookie', 'doughnut'], ps: [1/3, 1/3, 1/3]}); + +var vendingMachine = function(state action) { + return (action == 'a' ? categorical({vs: ['bagel', 'cookie', 'doughnut'], ps: [.8, .1, .1]}) : + action == 'b' ? 
categorical({vs: ['bagel', 'cookie', 'doughnut'], ps: [.1, .8, .1]}) : + action == 'c' ? categorical({vs: ['bagel', 'cookie', 'doughnut'], ps: [.1, .1, .8]}) : + 'nothing'); + +var chooseAction = function(goal, transition, state, deceive) { + return Infer({method: 'enumerate'}, function() { + var action = sample(actionPrior); + condition(...) + return action; + }) +}; + +var goalPosterior = Infer({method: 'enumerate'}, function() { + var deceive = flip(); + var goalFood = sample(foodPrior); + var goal = function(outcome) {return outcome == goalFood}; + var sallyActionDist = chooseAction(goal, vendingMachine, 'state', deceive); + condition(...) + return goalFood; +}); + +viz.auto(goalPosterior); +~~~~ + +b) What happens if you don't know Sally is deceptive and she chooses "b" and then "b". What if she chooses "a" and then "b." Show the models and describe the difference in behavior. Is she deceptive in each case? + +## Exercise 2: Monty Hall. + +Here, we will use the tools of Bayesian inference to explore a classic statistical puzzle -- the Monty Hall problem. Here is one statement of the problem: + +> Alice is on a game show and she's given the choice of three doors. Behind one door is a car; behind the others, goats. She picks door 1. The host, Monty, knows what's behind the doors and opens another door, say No. 3, revealing a goat. He then asks Alice if she wants to switch doors. Should she switch? + +Intuitively, it may seem like switching doesn't matter. However, the canonical solution is that you *should* switch doors. We'll explore (a) the intuition that switching doesn't matter, (b) the canonical solution, and more. This is the starter code you'll be working with: + +~~~~ +// Here's a function that might be handy: it removes some set of badItems from a list l +// e.g. 
removeBadItems(['nut', 'cake', 'nut', 'bagel'], ['cake', 'bagel']) => ['nut', 'nut'] +var removeBadItems = function(l, badItems) { + return reduce(function(badItem, remainingL) { + return remove(badItem, remainingL) + }, l, badItems); +} + +var doors = [1,2,3] + +var montyRandom = function(aliceDoor, prizeDoor) { + return Infer({method: 'enumerate'}, function() { + ... + }; +}; + +// var montyAvoidBoth = function(aliceDoor, prizeDoor) { +// return Infer({method: 'enumerate'}, function() { +// ... +// }; +// }; + +// var montyAvoidAlice = function(aliceDoor, prizeDoor) { +// return Infer({method: 'enumerate'}, function() { +// ... +// }; +// }; + +// var montyAvoidPrize = function(aliceDoor, prizeDoor) { +// return Infer({method: 'enumerate'}, function() { +// ... +// }; +// }; + +Infer({method: 'enumerate'}, function() { + var aliceDoor = ... + var prizeDoor = ... + var montyFunction = montyRandom; + + var montyDoorDist = montyFunction(aliceDoor, prizeDoor); + + factor(...) + + return ... +}); +~~~~ + +a) Whether you should switch depends crucially on how you believe Monty chooses doors to pick. First, write the model such that the host *randomly* picks doors (for this, fill in `montyRandom`). In this setting, should Alice switch? Or does it not matter? Hint: it is useful to condition on the exact doors that we discussed in the problem description. + +b) Now, fill in `montyAvoidBoth` (make sure you switch your `var montyFunction = ...` alias to use `montyAvoidBoth`). Here, Monty randomly picks a door that is *neither* the prize door *nor* Alice's door. For both-avoiding Monty, you'll find that Alice *should* switch. This is unintuitive -- we know that Monty picked door 3, so why should the process he used to arrive at this choice matter? By hand, compute the probability table for $$P(\text{Prize } \mid \text{Alice picks door 1}, \text{Monty picks door 3}, \text{Door 3 is not the prize})$$ under both `montyRandom` and `montyAvoidBoth`. 
Your tables should look like: + +Alice's door| Prize door| Monty's Door| P(Alice, Prize, Monty) +-------------| -----------| -------------| ----------------------- +1| 1| 1| ... +1| 1| 2| ... +...| ...| ...| ... + +Using these tables, explain why Alice should switch for both-avoiding Monty but why switching doesn't matter for random Monty. Hint: you will want to compare particular *rows* of these tables. + +c) Fill in `montyAvoidAlice`. Here, Monty randomly picks a door that is simply not Alice's door. Should Alice switch here? + +d) Fill in `montyAvoidPrize`. Here, Monty randomly picks a door that is simply not the prize door. Should Alice switch here? + +e) An interesting cognitive question is: why do we have the initial intuition that switching shouldn't matter? Given your explorations, propose an answer. diff --git a/exercises/08-learning-as-conditional-inference.md b/exercises/08-learning-as-conditional-inference.md new file mode 100755 index 0000000..f17ec57 --- /dev/null +++ b/exercises/08-learning-as-conditional-inference.md @@ -0,0 +1,387 @@ +--- +layout: exercise +title: learning - exercises +--- + +## 1. Calculating learning curves + +#### a) + +How does a *learning curve* differ from a *learning trajectory*? + +#### b) + +In the chapter, we graphed *learning trajectories* for a number of models. Below is one of these models (the one with the Beta(10,10) prior). In the chapter, we observed how the model's best guess as to the weight of the coin changed across a sequence of sucessive heads. See what happens if instead we see heads and tails in alternation: + +(Notice that we make use of [globalStore](https://webppl.readthedocs.io/en/master/globalstore.html) to create our data set.) + +~~~~js +///fold: +var makeCoin = function(weight) { + return function() { + return flip(weight) ? 
'h' : 't'; + } +}; +/// + +var pseudoCounts = {a: 10, b: 10}; + +var weightPosterior = function(observedData){ + return Infer({method: 'MCMC', burn:1000, samples: 1000}, function() { + var coinWeight = sample(Beta({a: pseudoCounts.a, b: pseudoCounts.b})) + var coinDist = Bernoulli({p: coinWeight}) + var obsFn = function(datum){observe(coinDist, datum=='h')} + mapData({data: observedData}, obsFn) + return coinWeight + }) +} + +//creating 50 pairs of 'h' and 't' alternating +globalStore.fullDataSet = ['h', 't'] +var ignore = repeat(49, function(){ + globalStore.fullDataSet = globalStore.fullDataSet.concat(['h','t']) +}); + +var observedDataSizes = [0,2,4,6,8,10,20,30,40,50,70,100]; +var estimates = map(function(N) { + return expectation(weightPosterior(globalStore.fullDataSet.slice(0,N))) +}, observedDataSizes); +viz.line(observedDataSizes, estimates); +~~~~ + +It looks like we haven't learned anything! Indeed, since our best estimate for the coin's weight was 0.5 *prior* to observing anything, our best estimate is hardly going to change when we get data consistent with that prior. + +The problem is that we've been looking at the MAP (maximum a posteriori) estimate. Edit the code below to see whether our posterior *distribution* is at all changed by observing this data set. (You only need to compare the prior and the posterior after all 100 observations): + +~~~~js +///fold: +var makeCoin = function(weight) { + return function() { + return flip(weight) ? 
'h' : 't'; + } +}; + +var pseudoCounts = {a: 10, b: 10}; + +//creating 50 pairs of 'h' and 't' alternating +globalStore.fullDataSet = ['h', 't'] +var ignore = repeat(49, function(){ + globalStore.fullDataSet = globalStore.fullDataSet.concat(['h','t']) +}); +/// + +var weightPosterior = function(observedData){ + return Infer({method: 'MCMC', burn:1000, samples: 1000}, function() { + var coinWeight = sample(Beta({a: pseudoCounts.a, b: pseudoCounts.b})) + var coinDist = Bernoulli({p: coinWeight}) + var obsFn = function(datum){observe(coinDist, datum=='h')} + mapData({data: observedData}, obsFn) + return coinWeight + }) +} + +var prior = //your code here +var post = //your code here + +viz(prior); //should graph the prior distribution on weights +viz(post); //should graph the posterior distribution on weights +~~~~ + +You should see a much sharper peak in the posterior. Note that the bounds on the x-axis are likely to be different in the two graphs, which could obscure this. (The `viz` package doesn't appear to allow you to adjust the bounds on the axes.) + +#### c) + +Ideally, we'd like to see how our belief distribution shifts as more data comes in. A particularly good measure would be entropy. Unfortunately, calculating entropy for a Beta distribution is [somewhat involved](https://en.wikipedia.org/wiki/Beta_distribution#Quantities_of_information_(entropy)). + +A somewhat hacky alternative we can use is variance: the expected squared difference between a sample from the distribution and the distribution mean. This is hacky because it doesn't take into account the shape of the distribution, and so won't give us quite what we want if the distribution is non-symmetric. + +Edit the code below to see how variance changes as more data is observed. + +~~~~js +///fold: +var makeCoin = function(weight) { + return function() { + return flip(weight) ? 
'h' : 't'; + } +}; + +var pseudoCounts = {a: 10, b: 10}; + +var weightPosterior = function(observedData){ + return Infer({method: 'MCMC', burn:1000, samples: 1000}, function() { + var coinWeight = sample(Beta({a: pseudoCounts.a, b: pseudoCounts.b})) + var coinDist = Bernoulli({p: coinWeight}) + var obsFn = function(datum){observe(coinDist, datum=='h')} + mapData({data: observedData}, obsFn) + return coinWeight + }) +} + +//creating 256 pairs of 'h' and 't' alternating +globalStore.fullDataSet = ['h', 't'] +var ignore = repeat(499, function(){ + globalStore.fullDataSet = globalStore.fullDataSet.concat(['h','t']) +}); +/// + + +var observedDataSizes = [0,2,4,8,16,32,64,128,256,512]; +var posts = map(function(N) { + return weightPosterior(globalStore.fullDataSet.slice(0,N)) +}, observedDataSizes); +// returns an array of posteriors of length observedDataSizes.length + +var variances = mapN(function(i){ + // your code here +}, observedDataSizes.length) + +viz.line(observedDataSizes, variances); +~~~~ + +You may need to look up how to use [mapN()](https://webppl.readthedocs.io/en/master/functions/arrays.html?highlight=mapN). + +HINT: notice how the variable `posts` differs from `estimates` in the code above. + +## 2. Causal Power + +Consider our model of causal power from the chapter: + +~~~~js +var observedData = [{C:true, E:false}] + +var causalPowerPost = Infer({method: 'MCMC', samples: 10000, lag:2}, function() { + // Causal power of C to cause E + var cp = uniform(0, 1) + + // Background probability of E + var b = uniform(0, 1) + + var obsFn = function(datum) { + // The noisy causal relation to get E given C + var E = (datum.C && flip(cp)) || flip(b) + condition( E == datum.E) + } + + mapData({data: observedData}, obsFn) + + return {causal_power: cp, background: b} +}); + +viz.marginals(causalPowerPost); +~~~~ + +#### a) + +Find a set of observations that result in inferring a fairly high causal power for C and a low background probability of E. 
Explain why this works. + +#### b) + +Find a set of observations that result in inferring a fairly low causal power for C and a high background probability of E. Explain why this works. + +#### c) + +Find a set of observations that result in inferring a fairly high causal power for C and a high background probability of E. Explain why this works. + +#### d) + +Suppose every time C is present, so is the effect E. Suppose C is present at least 5 times. Is there a way to nonetheless fail to infer a high causal power for C? + +## 3. Inferring Functions + +Consider our model of function inference from the chapter: + +~~~~js +///fold: +// make expressions easier to look at +var prettify = function(e) { + if (e == 'x' || _.isNumber(e)) { + return e + } else { + var op = e[0] + var arg1 = prettify(e[1]) + var prettyarg1 = (!_.isArray(e[1]) ? arg1 : '(' + arg1 + ')') + var arg2 = prettify(e[2]) + var prettyarg2 = (!_.isArray(e[2]) ? arg2 : '(' + arg2 + ')') + return prettyarg1 + ' ' + op + ' ' + prettyarg2 + } +} + +var plus = function(a,b) { + return a + b; +} + +var multiply = function(a,b) { + return Math.round(a * b,0); +} + +var divide = function(a,b) { + return Math.round(a/b,0); +} + +var minus = function(a,b) { + return a - b; +} + +var power = function(a,b) { + return Math.pow(a,b); +} + +// make expressions runnable +var runify = function(e) { + if (e == 'x') { + return function(z) { return z } + } else if (_.isNumber(e)) { + return function(z) { return e } + } else { + var op = (e[0] == '+') ? plus : + (e[0] == '-') ? minus : + (e[0] == '*') ? multiply : + (e[0] == '/') ? 
divide : + power; + var arg1Fn = runify(e[1]) + var arg2Fn = runify(e[2]) + return function(z) { + return op(arg1Fn(z),arg2Fn(z)) + } + } +} + +var randomConstantFunction = function() { + return uniformDraw(_.range(10)) +} + +var randomCombination = function(f,g) { + var op = uniformDraw(['+','-','*','/','^']); + return [op, f, g]; +} + +// sample an arithmetic expression +var randomArithmeticExpression = function() { + if (flip(0.3)) { + return randomCombination(randomArithmeticExpression(), randomArithmeticExpression()) + } else { + if (flip()) { + return 'x' + } else { + return randomConstantFunction() + } + } +} +/// + +viz.table(Infer({method: 'enumerate', maxExecutions: 100}, function() { + var e = randomArithmeticExpression(); + var s = prettify(e); + var f = runify(e); + + condition(f(0) == 0) + condition(f(2) == 4) + + return {s: s}; +})) +~~~~ + +Why does this think the probability of `x * 2` is so much lower than `x * x`? + +HINT: Think about the probability assigned to `x ^ 2`. + +#### b) + +Let's reconceptualize of our program as a sequence-generator. Suppose that the first number in the sequence ($$f(1)$$) is `1` and the second number ($$f(2)$$) is `4`. What number comes next? + +~~~~js +///fold: +// make expressions easier to look at +var prettify = function(e) { + if (e == 'x' || _.isNumber(e)) { + return e + } else { + var op = e[0] + var arg1 = prettify(e[1]) + var prettyarg1 = (!_.isArray(e[1]) ? arg1 : '(' + arg1 + ')') + var arg2 = prettify(e[2]) + var prettyarg2 = (!_.isArray(e[2]) ? 
arg2 : '(' + arg2 + ')') + return prettyarg1 + ' ' + op + ' ' + prettyarg2 + } +} + +var plus = function(a,b) { + return a + b; +} + +var multiply = function(a,b) { + return Math.round(a * b,0); +} + +var divide = function(a,b) { + return Math.round(a/b,0); +} + +var minus = function(a,b) { + return a - b; +} + +var power = function(a,b) { + return Math.pow(a,b); +} + +// make expressions runnable +var runify = function(e) { + if (e == 'x') { + return function(z) { return z } + } else if (_.isNumber(e)) { + return function(z) { return e } + } else { + var op = (e[0] == '+') ? plus : + (e[0] == '-') ? minus : + (e[0] == '*') ? multiply : + (e[0] == '/') ? divide : + power; + var arg1Fn = runify(e[1]) + var arg2Fn = runify(e[2]) + return function(z) { + return op(arg1Fn(z),arg2Fn(z)) + } + } +} + +var randomConstantFunction = function() { + return uniformDraw(_.range(10)) +} + +var randomCombination = function(f,g) { + var op = uniformDraw(['+','-','*','/','^']); + return [op, f, g]; +} + +// sample an arithmetic expression +var randomArithmeticExpression = function() { + if (flip(0.3)) { + return randomCombination(randomArithmeticExpression(), randomArithmeticExpression()) + } else { + if (flip()) { + return 'x' + } else { + return randomConstantFunction() + } + } +} +/// + +viz.table(Infer({method: 'enumerate', maxExecutions: 10000}, function() { + var e = randomArithmeticExpression(); + var s = prettify(e); + var f = runify(e); + + condition(f(1) == 1) + condition(f(2) == 4) + + return {'f(3)':f(3)}; +})) +~~~~ + +Not surprisingly, the model predicts `9` as the most likely next number. However, it also puts significant probability on `27`. Why does this happen? + +#### c) + +Many people find the high probability assignmed by our model in (b) to `27` to be unintuitive. This suggests our model is an imperfect model of human intuitions. How could we decrease the probability of inferring `27`? (HINT: Consider the priors). 
\ No newline at end of file diff --git a/exercises/10-occam's-razor.md b/exercises/10-occam's-razor.md new file mode 100755 index 0000000..c0339e7 --- /dev/null +++ b/exercises/10-occam's-razor.md @@ -0,0 +1,48 @@ +--- +layout: exercise +title: Occam's razor - exercises +--- + +## Exercise 1: Causal induction + +Write the causal support model from Griffiths and Tenenbaum's [-@Griffiths2005], "Structure and strength in causal induction" (GT05) in Church. You don't need to compute the log likelihood ratio for $$P(\text{data} \mid \text{Graph 1})/P(\text{data} \mid \text{Graph 0})$$ but can simply estimate the posterior probability $$P(\text{Graph 1} \mid \text{data})$$. + +a) Replicate the model predictions from Fig. 1 of GT05. + +b) Show samples from the posteriors over the causal strength and background rate +parameters, as in Fig 4 of GT05. + +c) Try using different parameterizations of the function that relates the cause and the background to the effect, as described in a later 2009 paper [@Griffiths2009]: noisy-or for generative causes, noisy-and-not for preventive causes, generic multinomial parameterization for causes that have an unknown effect. Show their predictions for a few different data sets, including the Delta-P = 0 cases. + +## Exercise 2 + +Try an informal behavioral experiment with several friends as experimental subjects to see whether the Bayesian approach to curve fitting given on the wiki page corresponds with how people actually find functional patterns in sparse noisy data. Your experiment should consist of showing each of 4-6 people 8-10 data sets (sets of x-y values, illustrated graphically as points on a plane with x and y axes), and asking them to draw a continuous function that interpolates between the data points and extrapolates at least a short distance beyond them (as far as people feel comfortable extrapolating). 
Explain to people that the data were produced by measuring y as some function of x, with the possibility of noise in the measurements. + +The challenge of this exercise comes in choosing the data sets you will show people, interpreting the results and thinking about how to modify or improve a probabilistic program for curve fitting to better explain what people do. Of the 8-10 data sets you use, devise several ("type A") for which you believe the church program for polynomial curve fitting will match the functions people draw, at least qualitatively. Come up with several other data sets ("type B") for which you expect people to draw qualitatively different functions than the church polynomial fitting program does. Does your experiment bear out your guesses about type A and type B? If yes, why do you think people found different functions to best explain the type B data sets? If not, why did you think they would? There are a number of factors to consider, but two important ones are the noise model you use, and the choice of basis functions: not all functions that people can learn or that describe natural processes in the world can be well described in terms of polynomials; other types of functions may need to be considered. + +Can you modify the church program to fit curves of qualitatively different forms besides polynomials, but of roughly equal complexity in terms of numbers of free parameters? Even if you can't get inference to work well for these cases, show some samples from the generative model that suggest how the program might capture classes of human-learnable functions other than polynomials. + +You should hand in the data sets you used for the informal experiment, discussion of the experimental results, and a modified church program for fitting qualitatively different forms from polynomials plus samples from running the program forward. 
+ +## Exercise 3: Number game + +Write the *number game* model from Tenenbaum's [-@Tenenbaum2000] "Rules and similarity in concept learning" in Church. + +Replicate the model predictions in Fig. 1b. You may want to start by writing out the hypotheses by hand. + +How might you generate the hypothesis space more compactly? + +How would you change the model if the numbers were sequences instead of sets? + +Hint: to draw from a set of integers, you may want to use this `noisy-draw` function: + +~~~~ +;;the total possible range is 0 to total-range - 1 +(define total-range 10) + +;;draw from a set of integers with some chance of drawing a different integer in the possible range: +(define (noisy-draw set) (sample-discrete (map (lambda (x) (if (member x set) 1.0 0.01)) (iota total-range)))) + +;;for example: +(noisy-draw '(1 3 5)) +~~~~ diff --git a/exercises/14-bayesian-data-analysis.md b/exercises/14-bayesian-data-analysis.md new file mode 100755 index 0000000..55ea52d --- /dev/null +++ b/exercises/14-bayesian-data-analysis.md @@ -0,0 +1,378 @@ +--- +layout: exercise +title: Bayesian Data Analysis - exercises +custom_js: +- assets/js/towData.js +- assets/js/towConfigurations.js +--- + +## Exercise 1: Experimenting with priors and predictives + +In [our simple binomial model]({{site.baseurl}}/chapters/14-bayesian-data-analysis.html#a-simple-illustration), we compared the parameter priors and posteriors to the corresponding **predictives** which tell us what data we should expect given our prior and posterior beliefs. 
For convenience, we've reproduced that model here: + +~~~~ +// observed data +var k = 1 // number of successes +var n = 20 // number of attempts +var priorDist = Uniform({a: 0, b: 1}); + +var model = function() { + var p = sample(priorDist); + + // Observed k number of successes, assuming a binomial + observe(Binomial({p : p, n: n}), k); + + // sample from binomial with updated p + var posteriorPredictive = binomial(p, n); + + // sample fresh p (for visualization) + var prior_p = sample(priorDist); + // sample from binomial with fresh p (for visualization) + var priorPredictive = binomial(prior_p, n); + + return { + prior: prior_p, priorPredictive : priorPredictive, + posterior : p, posteriorPredictive : posteriorPredictive + }; +} + +var opts = {method: "MCMC", samples: 2500, lag: 50}; +var posterior = Infer(opts, model); + +viz.marginals(posterior) +~~~~ + +a. Notice that we used a uniform distribution over the interval [0,1] as our prior, reflecting our assumption that a probability must lie between 0 and 1 but otherwise remaining agnostic to which values are most likely to be the case. +While this is convenient, we may want to represent other assumptions. +The [Beta distribution](https://en.wikipedia.org/wiki/Beta_distribution), expressed in WebPPL as `Beta({a:..., b:...})`' is a more general way of expressing beliefs over the interval [0,1]. + +Try different beta priors on `p`, by changing `priorDist = Uniform(...)` to `p = Beta({a: 10,b: 10})`, `Beta({a: 1, b: 5})` and `Beta({a: 0.1, b: 0.1})`. +(Note that `beta(1,1)` is mathematically the same as `uniform(0,1)`.) +Use the figures produced to describe the assumptions these priors capture, and how they interact with the same data to produce posterior inferences and predictions. + +b. Predictive distributions are not restricted to exactly the same experiment as the observed data, and can be used in the context of any experiment where the inferred model parameters make predictions. 
+In the current simple binomial setting, for example, predictive distributions could be found by an experiment that is different because it has `n' != n` observations. +Change the model to implement an example of this. + +## Exercise 2: Parameter fitting vs. Parameter integration + +One of the strongest motivations for using Bayesian techniques for model-data evaluation is in how "nuisance" parameters are treated. +"Nuisance" parameters are parameters of no theoretical interest; their only purpose is to fill in a necessary slot in the model. +Classically, the most prominant technique (from the frequentist tradition) for dealing with these parameters is to fit them to the data, i.e., to set their value equal to whatever value maximizes the model-data fit (or, equivalently, minimizes some cost function). + +The Bayesian approach is different. +Since we have *a priori* uncertainty about the value of our parameter, we will also have *a posteriori* uncertainty about the value (though hopefully the uncertainty will be reduced). +What the Bayesian does is *integrate over* her posterior distribution of parameter values to make predictions. +Intuitively, rather than taking the value corresponding to the peak of the distribution (i.e., the maximum), she's considering all values with their respective probabilites. + +Why might this be important for model assessment? +Imagine the following situation. +You are piloting a task and want to use Bayesian Data Analysis because you hear it is useful when you have few data points. +You think that the task you've designed is a little too difficult for subjects. +(Let's imagine that you're a psychophysicist, and your task pertains to contrast discriminiation in the peripheral visual field.) +You think the current task design is too difficult, but you're not sure. +It may well be that it's fine for subjects. + +Here is your prior. 
+ +~~~~ +// Prior on task difficulty is uniform on [0, ..., 0.9], with a spike on 0.9 +// i.e., you think it's likely that the task is too difficult +var sampleTaskDifficulty = function() { + return flip() ? .9 : randomInteger(10) / 10; +}; + +var model = function() { + return sampleTaskDifficulty(); +}; + +viz.hist(Infer({method: 'enumerate'}, model), {numBins: 9}) +~~~~ + +You have a model of how subjects perform on your task. +You could have a structured, probabilistic model here. +For simplicity, let's assume you have the simplest model of task performance. +It is a direct function of task-difficulty: subjects perform well if the task isn't too difficult. + +~~~~norun +var subjectPerformWell = !flip(taskDifficulty) +~~~~ + +There's a lot of training involved in your task and that it's very time consuming for you to collect data. +You run one subject through your training regime and have them do the task. +The subject performs well! +Soon after, your adviser drops by and wants you to make a decision to collect more data or tweak your experimental paradigm. +You thought beforehand that your task was too difficult. +Do you still think your task is too hard? + +Since you wrote down your prior beliefs, we can examine how much the data update those beliefs about the `taskDifficulty` parameter. +How does your degree of belief in task difficult change as a result of your one pilot subject performing well? + +~~~~ +// Prior on task difficulty is uniform on [0, ..., 0.9], with a spike on 0.9 +var sampleTaskDifficulty = function() { + return flip() ? .9 : randomInteger(10) / 10; +}; + +// Compute posterior after seeing one subject perform well on the task +var taskDifficultyPosterior = Infer({method: 'enumerate'}, function(){ + var taskDifficulty = sampleTaskDifficulty(); + + // subject will perform well if the task is not too difficult + var subjectPerformsWell = !flip(taskDifficulty) + + // observe that they perform well (i.e. 
this value is true) + condition(subjectPerformsWell) + return taskDifficulty; +}) + +// Most likely task-difficulty is still .9 +taskDifficultyPosterior.MAP().val + +// But a lot of probability mass is on lower values +viz.hist(taskDifficultyPosterior, {numBins: 9}) + +// Indeed, the expected subject ability is around .4 +expectation(taskDifficultyPosterior) +~~~~ + +A. Would you proceed with more data collection or would you change your paradigm? +How did you come to this conclusion? + +B. In part A, you probably used either a value of task-difficulty or the full distribution of values to decide about whether to continue data collection or tweak the paradigm. +We find ourselves in a similar situation when we have models of psychological phenomena and want to decide whether or not the model has fit the data (or, equivalently, whether our psychological theory is capturing the phenomenon). +The traditional approach is the value (or "point-wise estimate") approach: take the value that corresponds to the best fit (e.g., by using least-squares or maximum-likelihood estimation; here, you would have taken the Maximum A Posteriori (or, MAP) estimate, which would be 0.9). +Why might this not be a good idea? +Provide two answers. +One that applies to the data collection situation above, and one that applies to the metaphor of model or theory evaluation. + +## Exercise 3: BDA of Bayesian Cognitive Models + +We saw in this chapter how to analyze our models of cognition by using Bayesian statistical techniques. +Compare and contrast the results of our cognitive model of tug-of-war with our regression models. +Some questions to ponder: + +* What phenomena in the data was it better able to capture? + +* What, if anything, did it fail to capture? + +* Are there other aspects of the model you could 'lift' into the Bayesian Data Analysis (i.e. fixed parameters that you could put a prior on and include in your joint inference)? 
+ +* How does WebPPL expose commonalities between these two models? + +## Exercise 4 + +Let's continue to explore the inferences you (as a scientist) can draw from the posterior over parameter values. This posterior can give you an idea of whether or not your model is well-behaved. In other words, do the predictoins of your model depend heavily on the exact parameter value? + +To help us understand how to examine posteriors over parameter settings, we're going to revisit the example of the blicket detector from Chapter 4. + +Here is the model, with slightly different names than the original example, and written in a parameter-friendly way. It is set up to display the "backwards blocking" phenomenon. + +~~~~ +var blicketBaseRate = 0.4 +var blicketPower = 0.9 +var nonBlicketPower = 0.05 +var machineSpontaneouslyGoesOff = 0.05 + +var blicketPosterior = function(evidence) { + return Infer({method: 'enumerate'}, function() { + var blicket = mem(function(block) {return flip(blicketBaseRate)}) + var power = function(block) {return blicket(block) ? blicketPower : nonBlicketPower} + var machine = function(blocks) { + return (blocks.length == 0 ? + flip(machineSpontaneouslyGoesOff) : + flip(power(first(blocks))) || machine(rest(blocks))) + } + // Condition on each of the pieces of evidence making the machine go off + map(function(blocks){condition(machine(blocks))}, evidence) + return blicket('A') + }); +}; + +// A&B make the blicket-detector go off +viz(blicketPosterior([['A', 'B']])) + +// A&B make the blicket-detector go off, and then B makes the blicket detector go off +viz(blicketPosterior([['A', 'B'], ['B']])) +~~~~ + +A. What are the parameters of this model? In the plainest English you can muster, interpret the current values of the parameters. What do they mean? + +Let's analyze this model with respect to some data. 
First, we'll put priors on these parameters, and then we'll do inference, conditioning on some data we might have collected in an experiment on 4 year olds, a la Sobel, Tenenbaum, & Gopnik (2004). [The data used in this exercise is schematic data]. + +~~~~ +///fold: + +// alternative proposal distribution for metropolis-hastings algorithm +var uniformKernel = function(prevVal) { + return Uniform({a: prevVal - 0.2, b: prevVal + 0.2}); +}; + +var toProbs = function(predictions) { + return _.object(map(function(i) {return "predictive: cond" + i + " P(true)";}, _.range(1, predictions.length + 1)), + map(function(model) {return Math.exp(model.score(true))}, predictions)) +} + +var dataSummary = function(data) { + return map(function(condData) { + return filter(function(d) {return d}, condData).length/11 + }, data) +}; + +var predictiveSummary = function(model) { + var labels = map(function(i) {return "predictive: cond" + i + " P(true)"}, _.range(1, 6)); + return map(function(label) { + return expectation(model, function(s) { + return s[label] + }); + }, labels); +}; +/// + +// 5 experiment conditions / stimuli +var possibleEvidenceStream = [ + [['A']], + [['A', 'B']], + [['A', 'B'], ['B']], + [['A', 'B'], ['A', 'B']], + [[]] +]; + +// for each condition. +// note: always the question "is A a blicket?" 
+var data = [ + repeat(10, function(){return true}).concat(false), + repeat(6 , function(){return true}).concat(repeat(5, function(){return false})), + repeat(4, function(){return true}).concat(repeat(7, function(){return false})), + repeat(8, function(){return true}).concat(repeat(3, function(){return false})), + repeat(2, function(){return true}).concat(repeat(9, function(){return false})) +]; + +// Same model as above, but parameterized +var detectingBlickets = mem(function(evidence, params) { + return Infer({method: 'enumerate'}, function() { + var blicket = mem(function(block) {return flip(params.blicketBaseRate)}) + var power = function(block) {return blicket(block) ? params.blicketPower : params.nonBlicketPower} + var machine = function(blocks) { + return (blocks.length == 0 ? + flip(params.machineSpontaneouslyGoesOff) : + flip(power(first(blocks))) || machine(rest(blocks))) + } + map(function(blocks){condition(machine(blocks))}, evidence) + return blicket('A') + }) +}) + +var dataAnalysis = Infer({method: 'MCMC', samples: 5000, callbacks: [editor.MCMCProgress()]}, function() { + var params = { + blicketBaseRate: sample(Uniform({a: 0, b: 1}), {driftKernel: uniformKernel}), + blicketPower: sample(Uniform({a: 0, b: 1}), {driftKernel: uniformKernel}), + nonBlicketPower: sample(Uniform({a: 0, b: 1}), {driftKernel: uniformKernel}), + machineSpontaneouslyGoesOff: sample(Uniform({a: 0, b: 1}), {driftKernel: uniformKernel}) + } + + var cognitiveModelPredictions = map(function(evidence) { + return detectingBlickets(evidence,params); + }, possibleEvidenceStream); + + // observe each data point under the model's predictions + map2(function(dataForStim, modelPosterior) { + map(function(dataPoint) { + observe(modelPosterior, dataPoint); + }, dataForStim) + }, data, cognitiveModelPredictions) + + var predictives = toProbs(cognitiveModelPredictions) + return _.extend(params, predictives) +}) + +viz.marginals(dataAnalysis); +viz.scatter(predictiveSummary(dataAnalysis), 
dataSummary(data), + {xLabel: 'model', yLabel: 'data'}) +~~~~ + +Before running this program, answer the following question: + +B. What does the `Infer` statement in `dataAnalysis` return? What does the `Infer` statement in `detectingBlickets` return? Why are there two queries in this program? + +C. Now, run the program. [Note: This will take between 15 and 30 seconds to run.] Interpret each of the resulting plots. + +D. How do your interpretations relate to the parameter values that were set in the original program? + +E. Look carefully at the priors (in the code) and the posteriors (in the plots) over blicketPower and nonBlicketPower. Did we impose any a priori assumptions about the relationship between these parameters? Think about the experimental setup. Do you think we would be justified in imposing any assumptions? Why or why not? What do the posteriors tell you? How was the data analysis model able to arrive at this conclusion? + +F. Do you notice anything about the scatter plot? How would you interpret this? Is there something we could add to the data analysis model to account for this? + +G. Now, we're going to examine the predictions of the model if we had done a more traditional analysis of point-estimates of parameters (i.e. fitting parameters). Examine your histograms and determine the "maximum a posteriori" (MAP) value for each parameter. Plug those into the code below and run it.
+ +~~~~ +///fold: +var toProbs = function(predictions) { + return _.object(map(function(i) {return "predictive: cond" + i + " P(true)";}, _.range(1, predictions.length + 1)), + map(function(model) {return Math.exp(model.score(true))}, predictions)) +} + +var dataSummary = function(data) { + return map(function(condData) { + return filter(function(d) {return d}, condData).length/11 + }, data) +}; + +// 5 experiment conditions / stimuli +var possibleEvidenceStream = [ + [['A']], + [['A', 'B']], + [['A', 'B'], ['B']], + [['A', 'B'], ['A', 'B']], + [[]] +]; + +var data = [ + repeat(10, function(){return true}).concat(false), + repeat(6 , function(){return true}).concat(repeat(5, function(){return false})), + repeat(4, function(){return true}).concat(repeat(7, function(){return false})), + repeat(8, function(){return true}).concat(repeat(3, function(){return false})), + repeat(2, function(){return true}).concat(repeat(9, function(){return false})) +]; + +// for each condition. +// note: always the question "is A a blicket?" +var data = [ + repeat(10, function(){return true}).concat(false), + repeat(6 , function(){return true}).concat(repeat(5, function(){return false})), + repeat(4, function(){return true}).concat(repeat(7, function(){return false})), + repeat(8, function(){return true}).concat(repeat(3, function(){return false})), + repeat(2, function(){return true}).concat(repeat(9, function(){return false})) +]; + +// Same model as above, but parameterized +var detectingBlickets = mem(function(evidence, params) { + return Infer({method: 'enumerate'}, function() { + var blicket = mem(function(block) {return flip(params.blicketBaseRate)}) + var power = function(block) {return blicket(block) ? params.blicketPower : params.nonBlicketPower} + var machine = function(blocks) { + return (blocks.length == 0 ? 
+ flip(params.machineSpontaneouslyGoesOff) : + flip(power(first(blocks))) || machine(rest(blocks))) + } + map(function(blocks){condition(machine(blocks))}, evidence) + return blicket('A') + }) +}) +/// + +var params = { + blicketBaseRate : ..., + blicketPower: ..., + nonBlicketPower: ..., + machineSpontaneouslyGoesOff: ... +}; + +var bestFitModelPredictions = map(function(evidence) { + return Math.exp(detectingBlickets(evidence, params).score(true)); +}, possibleEvidenceStream) + +viz.scatter(bestFitModelPredictions, dataSummary(data)) +~~~~ + +H. What can you conclude about the two ways of looking at parameters in this model's case? Do you think the model is relatively robust to different parameter settings? diff --git a/exercises/appendix-js-basics.md b/exercises/appendix-js-basics.md old mode 100644 new mode 100755 diff --git a/exercises/conditional-dependence.md b/exercises/conditional-dependence.md index 2b9f021..66a2c6d 100644 --- a/exercises/conditional-dependence.md +++ b/exercises/conditional-dependence.md @@ -81,3 +81,206 @@ viz.table(Infer({method: 'enumerate'}, function() { ... })); ~~~~ + +## Exercise 2: Monty Hall. + +Here, we will use the tools of Bayesian inference to explore a classic statistical puzzle -- the Monty Hall problem. +Here is one statement of the problem: + +> Alice is on a game show, and she's given the choice of three doors. +> Behind one door is a car; behind the others, goats. +> She picks door 1. The host, +> Monty, knows what's behind the doors and opens another door, say No. 3, revealing a goat. +> He then asks Alice if she wants to switch doors. +> Should she switch? + +Intuitively, it may seem like switching doesn't matter. +However, the canonical solution is that you *should* switch doors. +We will explore why this is the case. + +For this problem, we will assume (condition) that we observe Monty opening the door that +is neither Alice's door nor the prize door. 
+ +### Exercise 2.1 + +The decision to switch depends crucially on how you believe Monty chooses doors to pick. +First, write the model such that the host *randomly* picks doors (for this, fill in `montyRandom`). +In this setting, should Alice switch, or does it not matter? + +~~~~ +///fold: +var removeBadItems = function(l, badItems) { + return reduce(function(badItem, remainingL) { + return remove(badItem, remainingL) + }, l, badItems); +} + +var doors = [1, 2, 3]; +/// + +var montyRandom = function(aliceDoor, prizeDoor) { + return Infer({method: 'enumerate'}, function() { + return ... + }) +}; + +var model = function(switch_cond) { + var aliceDoor = ... + var prizeDoor = ... + var montyDoor = ... + + condition(montyDoor != prizeDoor); + condition(montyDoor != aliceDoor); + + return ... +} + +display("P(win) if Alice doesn't switch"); +viz.auto(Infer({method: 'enumerate'}, function() {model(false)})); +display("P(win) if Alice does switch"); +viz.auto(Infer({method: 'enumerate'}, function() {model(true)})); +~~~~ + + +### Exercise 2.2 + +This time, fill in the code so that Monty behaves according to the original Monty Hall problem, +i.e. picking the door that is neither the prize door nor Alice's door. +For both-avoiding Monty, you'll find that Alice *should* switch. + +~~~~ +///fold: +var removeBadItems = function(l, badItems) { + return reduce(function(badItem, remainingL) { + return remove(badItem, remainingL) + }, l, badItems); +} + +var doors = [1, 2, 3]; +/// + +var montyAvoidBoth = function(aliceDoor, prizeDoor) { + return Infer({method: 'enumerate'}, function() { + return ... + }) +}; + +var model = function(switch_cond) { + var aliceDoor = ... + var prizeDoor = ... + var montyDoor = ... + + condition(montyDoor != prizeDoor); + condition(montyDoor != aliceDoor); + + return ... 
+} + +display("P(win) if Alice doesn't switch"); +viz.auto(Infer({method: 'enumerate'}, function() {model(false)})); +display("P(win) if Alice does switch"); +viz.auto(Infer({method: 'enumerate'}, function() {model(true)})); +~~~~ + + +### Exercise 2.3 + +This is unintuitive -- we know that Monty picked door 3, so why should the process he used to arrive at this choice matter? +By hand, complete the probability table for P(Alice, Prize, Monty) under both `montyRandom` and `montyAvoidBoth`. +Your tables should look like: + +Alice's door| Prize door| Monty's Door| P(Alice, Prize, Monty) +-------------| -----------| -------------| ----------------------- +1| 1| 1| ... +1| 1| 2| ... +...| ...| ...| ... + +Using these tables, explain why Alice should switch for both-avoiding Monty but why switching doesn't matter for random Monty. +Hint: you will want to compare particular *rows* of these tables. + + +### Exercise 2.4 + +This time, fill in the code so that Monty randomly chooses between the two doors that aren't Alice's door. +What should Alice do now? + +~~~~ +///fold: +var removeBadItems = function(l, badItems) { + return reduce(function(badItem, remainingL) { + return remove(badItem, remainingL) + }, l, badItems); +} + +var doors = [1, 2, 3]; +/// + +var montyAvoidAlice = function(aliceDoor, prizeDoor) { + return Infer({method: 'enumerate'}, function() { + return ... + }) +}; + +var model = function(switch_cond) { + var aliceDoor = ... + var prizeDoor = ... + var montyDoor = ... + + condition(montyDoor != prizeDoor); + condition(montyDoor != aliceDoor); + + return ... +} + +display("P(win) if Alice doesn't switch"); +viz.auto(Infer({method: 'enumerate'}, function() {model(false)})); +display("P(win) if Alice does switch"); +viz.auto(Infer({method: 'enumerate'}, function() {model(true)})); +~~~~ + + +### Exercise 2.5 + +This time, fill in the code so that Monty randomly chooses between the two doors that aren't the prize door. +What should Alice do now? 
+ +~~~~ +///fold: +var removeBadItems = function(l, badItems) { + return reduce(function(badItem, remainingL) { + return remove(badItem, remainingL) + }, l, badItems); +} + +var doors = [1, 2, 3]; +/// + +var montyAvoidPrize = function(aliceDoor, prizeDoor) { + return Infer({method: 'enumerate'}, function() { + return ... + }) +}; + +var model = function(switch_cond) { + var aliceDoor = ... + var prizeDoor = ... + var montyDoor = ... + + condition(montyDoor != prizeDoor); + condition(montyDoor != aliceDoor); + + return ... +} + +display("P(win) if Alice doesn't switch"); +viz.auto(Infer({method: 'enumerate'}, function() {model(false)})); +display("P(win) if Alice does switch"); +viz.auto(Infer({method: 'enumerate'}, function() {model(true)})); +~~~~ + + +### Exercise 2.6 + +The psychological question is why do people have the initial intuition that switching shouldn’t matter? +Given your explorations, propose a hypothesis. +Can you think of an experiment that would test this hypothesis? diff --git a/exercises/conditioning.md b/exercises/conditioning.md index 0fc5503..8c0dc5d 100644 --- a/exercises/conditioning.md +++ b/exercises/conditioning.md @@ -149,7 +149,7 @@ Describe (using ordinary English) the `smilesModel` program in Exercise 3b. Extend `smilesModel` to create a version of the model considers two additional factors: -1. People will smile 80% of the time if they want something from you and 50% if they do not. +1. Wanting something causes people to smile 80\% of the time. 2. *Nice* people will only want something from you 20% of the time; non-nice people 50% of the time. Don't forget that nice people also smile more often! diff --git a/exercises/dependence.md b/exercises/dependence.md index a2cfa50..5e1a4bb 100644 --- a/exercises/dependence.md +++ b/exercises/dependence.md @@ -9,7 +9,7 @@ For each of the following programs: * Draw the dependency diagram (Bayes net). 
If you don't have software on your computer for doing this, Google Docs has a decent interface for creating drawings. -* Use informal evaluation order reasoning and the intervention method to determine causal dependency between A and B. +* Use informal causal dependence order reasoning and the intervention method to determine causal dependency between A and B. * Use conditioning to determine whether A and B are statistically dependent. diff --git a/exercises/generative-models.md b/exercises/generative-models.md old mode 100644 new mode 100755 diff --git a/exercises/hierarchical-models.md b/exercises/hierarchical-models.md index 86c8f26..7d931c6 100644 --- a/exercises/hierarchical-models.md +++ b/exercises/hierarchical-models.md @@ -6,7 +6,7 @@ description: The power of abstraction. ## Exercise 1: Pseudocounts and the Dirichlet distribution -In the Bayesian Data Analysis exercises, we explored the Beta distribution by varying its parameters. +In the Bayesian Data Analysis (BDA) exercises, we explored the Beta distribution by varying its parameters. The Dirichlet is a generalization of the Beta distribution to more than two categories (see [Appendix](http://probmods.org/chapters/appendix-useful-distributions.html)) Instead of Beta parameters $$(a, b)$$ governing the probabilities of two categories $$(false/true)$$, @@ -180,7 +180,7 @@ How many of these apples are likely to be rotten? ~~~~ ~~~~ -## Exercise 3: Hierarchical models for BDA +## Exercise 3: Hierarchical models for Bayesian Data Analysis (BDA) Imagine that you have conducted an experiment on word reading times to test the hypothesis that words starting with vowels take longer to read. Each data point includes whether the word starts with a vowel or a consonant, the word itself, the participant id, and the response time you measured ("rt"). 
diff --git a/exercises/inference-algorithms.md b/exercises/inference-algorithms.md index e45b342..546da98 100644 --- a/exercises/inference-algorithms.md +++ b/exercises/inference-algorithms.md @@ -38,7 +38,7 @@ var post = Infer({method: 'rejection', samples: 1000}, model); viz.auto(post); ~~~~ -### Exercise 1.1) +### Exercise 1.1 Try using MCMC with Metropolis-Hastings instead of rejection sampling. You'll notice that it does not fare as well as rejection sampling. Why not? @@ -71,13 +71,13 @@ var post = Infer({method: 'rejection', samples: 1000}, model); viz.auto(post); ~~~~ -### Exercise 1.2) +### Exercise 1.2 Change the *model* to make MH successfully trace the curves. Your solution should result in a graph that clearly traces a heart-shaped figure -- though it need not do quite as well as rejection sampling. Why does this work better? -You may find the following piece of code useful. +You may find the following piece of code useful. ~~~~ var a = diagCovGaussian({mu: Vector([0, 100]), @@ -86,6 +86,14 @@ display(T.get(a, 0)); display(T.get(a, 1)); ~~~~ +Note that `T.get()` is just a helper function to get specific values out of the return value of diagCovGaussian(). See what that output looks like: + +~~~~ +var a = diagCovGaussian({mu: Vector([0, 100]), + sigma: Vector([1, 10])}); +a +~~~~ + ~~~~ ///fold: var onCurve = function(x, y) { @@ -295,6 +303,82 @@ var posterior = Infer({method: 'MCMC', verbose: true}, model); ~~~~ +# Exercise 3: Cross-situational learning + +When children hear an object being named, the data is often ambiguous. There are multiple things the parent could be talking about. Which one does the word belong to? + +A common paradigm for studying this problem is the cross-situational learning study. On the first trial, the subject may see a dog and a cat and hear the word `dax`. Does `dax` refer to dogs or cats? There's no way to know. + +Suppose on the second trial, however, the subject sees a dog and a bird and hears the word `dax`. 
Now, your intuition is probably that `dax` refers to dogs. + +#### Exercise 3.1 + +Implement a simple model that achieves this result. + +~~~~ +var names = ["dax", "blicket", "gorper", "greeble", "freeble"] + +var objName = mem(function(obj) { + sample(Categorical({vs: names, ps: [.2, .2, .2, .2, .2]})) +}) + +var nameOne = function(obj1, obj2){ + return flip() ? objName(obj1) : objName(obj2) +} + +var clmodel = function() { + // your model goes here + return objName("dog") +} + +var posterior = Infer(clmodel) +viz(posterior) +~~~~ + +#### Exercise 3.2 + +An obvious concern about cross-situational learning is that it may require a lot of memory. Suppose the following trial structure: + +1. objects: dog, cat, word: dax +2. objects: dog, bird, word: blicket +3. objects: dog, cow, word: greeble +4. objects: dog, platypus, word: freeble +5. objects: dog, ostrich, word: dax + +You should still place very high probability on a dog being called a "dax". Show that this holds in your model. Is the probability as high as it was previously? If not, why not? + +~~~~ +var names = ["dax", "blicket", "gorper", "greeble", "freeble"] + +var objName = mem(function(obj) { + sample(Categorical({vs: names, ps: [.2, .2, .2, .2, .2]})) +}) + +var nameOne = function(obj1, obj2){ + return flip() ? objName(obj1) : objName(obj2) +} + +var clmodel = function() { + // your model goes here + return objName("dog") +} + +var posterior = Infer(clmodel) +viz(posterior) +~~~~ + +#### Exercise 3.3 + +In a thought-provoking paper titled "[Propose but verify](https://www.sciencedirect.com/science/article/pii/S0010028512000795?casa_token=nz-cJhc201oAAAAA:R2uj-uguW3RBr37sqNuHw9FaooZio0UL787yJmqI5nGlwc89nd-tMabrBszCZtYNHyHLNcbzqQ)", John Trueswell, Tamara Medina, Alon Hafri, and Lila Gleitman argue that cross-situational models like the one above require too much memory. It's unrealistic to suppose that learners remember all prior encounters with objects and words! 
+ +Instead, they argue that at any given time, learners are entertaining a single possible meaning for any given word. If later evidence disproves their working definition, they throw it out and start over. + +Rewrite your model from Exercise 3.2 to implement this proposal. Hint: Consider how you could do this by changing the inference algorithm, not the model itself. (You may need to change the model, though, for instance to change `condition` statements to `factor` statements just in order to get the model to run.) + +~~~~ +// FUBAR +~~~~ + @@ -71,9 +91,9 @@ Now imagine you hear a noise from inside a crater but you cannot see the alien t This problem is adapted from Section 6.5 of [Lee \& Wagenmakers (2013)](https://faculty.washington.edu/jmiyamot/p548/leemd%20bayesian%20cog%20modeling%20-%20practical%20crs.pdf). -Consider the practical challenge of detecting if people cheat on a test. For example, people who have been in a car accident may seek financial compensation from insurance companies by feigning cognitive impairment such as pronounced memory loss. When these people are confronted with a memory test that is intended to measure the extent of their impairment, they may deliberately under-perform. This behavior is called malingering, and it may be accompanied by performance much worse than that displayed by real amnesiacs. Sometimes, for example, malingerers may perform substantially below chance. +In a psychology experiment, not all the subjects are necessarily doing their best. Some just want their payment or credit and to get out as quickly as possible. Consider the practical challenge of detecting which subjects are actually taking the task seriously. -Malingering is not always easy to detect, but is naturally addressed by a mixture model. 
Using this approach, it is possible to infer which of two categories -- those who malinger, and those who are truthful or bona fide -- each person belongs to, and quantify the confidence in each of these classifications. +Malingering (not taking the task seriously) is not always easy to detect, but is naturally addressed by a mixture model. Using this approach, it is possible to infer which of two categories -- those who malinger, and those who are truthful or bona fide -- each person belongs to, and quantify the confidence in each of these classifications. We consider an experimental study on malingering, in which each of p = 22 participants completed a memory test (Ortega, Wagenmakers, Lee, Markowitsch, & Piefke, 2012). One group of participants was told to do their best. These are the bona fide participants. The other group of participants was told to under-perform by deliberately simulating amnesia. These are the malingerers. Out of a total of n = 45 test items, the participants get 45, 45, 44, 45, 44, 45, 45, 45, 45, 45, 30, 20, 6, 44, 44, 27, 25, 17, 14, 27, 35, and 30 correct. Because this was an experimental study, we know that the first 10 participants were bona fide and the next 12 were instructed to malinger. ### a) diff --git a/exercises/process-models.md b/exercises/process-models.md index 66d133f..c241455 100644 --- a/exercises/process-models.md +++ b/exercises/process-models.md @@ -3,6 +3,8 @@ layout: exercise title: Rational process models - exercises --- +## Exercise 1 + Consider once again the simple blicket detector experiment from the Conditional Dependence chapter and Bayesian Data Analysis exercises. Here, we have simplified the model such that the only free parameter is the base rate of being a blicket and the participant only sees one data point of evidence at a time (i.e. one set of blocks that makes the machine beep). 
@@ -86,14 +88,14 @@ viz.marginals(Infer(opts, dataAnalysis)); ~~~ -## Exercise 1 +#### a) Complete the code to infer the posterior distributions of the base rate such that the model is conditioned on both the participants' responses and response times. HINT: The `observe()` function requires a distribution as its first parameter. -## Exercise 2 +#### b) How do your inferences about the base rates change with the following modifications? @@ -103,7 +105,7 @@ How do your inferences about the base rates change with the following modificati What does this say about the information provided about the base rate from each source? -## Exercise 3 +#### c) Note that there is some subject variability in RT. Modify your model to allow the two subjects to have different base rates in mind. @@ -113,14 +115,14 @@ What do you notice about the base rates? What makes their base rates different? -## Exercise 4 +#### d) Suppose we went to survey another group of aliens on Venus and collected another data set. Run this same BDA on these subjects. How do the Venusians compare to the Martians? -## Exercise 5 +#### e) Suppose you want to compare the hypotheses that the aliens use rejection sampling versus enumeration to estimate probabilities. Modify your code to infer the posterior probabilities of each method for each planet. @@ -129,7 +131,32 @@ Which algorithm is each kind of alien most likely to be using? Hint: Make `method` a random variable. -## Exercise 6 +#### f) Do you think any of these algorithms are good descriptions of how people intuitively do the Blicket task? Explain what aspects of the inference may or may not be analogous to what people do. + +## Exercise 2 + +Consider the particle filter example from the chapter that we used to infer the number of hypotheses vocabulary-learners are entertaining. It's straightforward to apply this model to experimental datasets where subjects are learning names of novel objects.
What's one problem you might run into in trying to apply it to understand children's actual learning in the real world? + +## Exercise 3 + +In the chapter, we investigated how many samples we should take when deciding whether to guess `heads` or `tails` for a coin of known weight. Let's consider a related problem. In this case, all we know is the weight of the coin is drawn from a uniform distribution from 0 to 1. We are allowed to flip the coin as many times as we want before guessing the outcome of the next flip. How many flips should we take? + +#### a) + +What's the best-case scenario? That is, suppose you know the actual weight of the coin. How often can you guess the next flip? + +~~~~ +// your code here +~~~~ + +#### b) + +Now figure out how often you could guess the next flip based on first flipping it 10 times. (Keep in mind that in this scenario, you can do as much inference as you want; no need to restrict samples during inference. It's the number of observations you can make about the coin that we are restricting.) + + +~~~~ +// your code here +~~~~ \ No newline at end of file diff --git a/exercises/social-cognition.md b/exercises/social-cognition.md index dc980d8..76c1b10 100644 --- a/exercises/social-cognition.md +++ b/exercises/social-cognition.md @@ -54,205 +54,197 @@ Explain how deceptiveness and preferences interact to produce her actions. ~~~~ ~~~~ -## Exercise 2: Monty Hall. +## Exercise 2: Factors -Here, we will use the tools of Bayesian inference to explore a classic statistical puzzle -- the Monty Hall problem. -Here is one statement of the problem: +The `factor` function can be very helpful. The WebPPL manual has this to say about `factor`: -> Alice is on a game show, and she's given the choice of three doors. -> Behind one door is a car; behind the others, goats. -> She picks door 1. The host, -> Monty, knows what's behind the doors and opens another door, say No. 3, revealing a goat. -> He then asks Alice if she wants to switch doors. 
-> Should she switch? +> `factor(score)` adds `score` to the log probability of the current distribution. -Intuitively, it may seem like switching doesn't matter. -However, the canonical solution is that you *should* switch doors. -We will explore why this is the case. +Let's try an example: + +~~~~ +var dist1 = Infer({method: 'enumerate'}, + function () { + var A = flip() + return A +}); + +var dist2 = Infer({method: 'enumerate'}, + function () { + var A = flip() + A ? factor(1) : factor(0) + return A +}); + +viz(dist1) +viz(dist2) +~~~~ + +Consider that the probability of heads and tails in `dist1` are both .5. Adding 1 to the log probability of heads means + +$$log(P(H)) + 1 = log(.5) + 1 \approx .307$$ + +Adding 0 to the log probability of tails means + +$$log(P(T)) + 0 = log(.5) \approx -.693$$ + +Of course, these two probabilities no longer sum to 1, so we need to normalize: + +$$P(H) = \frac{P(H)}{P(H) + P(T)} \approx \frac{e^.307}{e^.307 + e^{-.693}} \approx .731$$ + +If you run the code above, you should see that our numbers match. -For this problem, we will assume (condition) that we observe Monty opening the door that -is neither Alice's door nor the prize door. -### Exercise 2.1 +### a) -The decision to switch depends crucially on how you believe Monty chooses doors to pick. -First, write the model such that the host *randomly* picks doors (for this, fill in `montyRandom`). -In this setting, should Alice switch, or does it not matter? 
+Try to use factor to get approximately 95\% probability of heads (this does not need to be exact; just get close): ~~~~ -///fold: -var removeBadItems = function(l, badItems) { - return reduce(function(badItem, remainingL) { - return remove(badItem, remainingL) - }, l, badItems); -} +var dist = Infer({method: 'enumerate'}, + function () { + var A = flip() + factor(A) //edit this line + return A +}); +viz(dist) +~~~~ -var doors = [1, 2, 3]; -/// +### b) -var montyRandom = function(aliceDoor, prizeDoor) { - return Infer({method: 'enumerate'}, function() { - return ... - }) -}; +In this model, we flip 3 coins. Use `factor` to favor an outcome of 2 heads and 1 tails: -var model = function(switch_cond) { - var aliceDoor = ... - var prizeDoor = ... - var montyDoor = ... - - condition(montyDoor != prizeDoor); - condition(montyDoor != aliceDoor); - - return ... -} +~~~~ +var softHeads = Infer({ + model() { + var a = flip(0.5); + var b = flip(0.5); + var c = flip(0.5); + factor( \\your code here ); + return a; + } +}); -display("P(win) if Alice doesn't switch"); -viz.auto(Infer({method: 'enumerate'}, function() {model(false)})); -display("P(win) if Alice does switch"); -viz.auto(Infer({method: 'enumerate'}, function() {model(true)})); +viz(softHeads); ~~~~ +## Exercise 3: The Ultimatum Game + +### a) -### Exercise 2.2 +The ultimatum game requires two players: A proposer and a responder. The proposer has to decide how to allocate \$10 between the two players in \$1 increments. Once this proposal is made, the responder decides whether to accept the proposal. If the responder accepts, both players are awarded the money according to the proposal. If the responder rejects, neither player gets anything. -This time, fill in the code so that Monty behaves according to the original Monty Hall problem, -i.e. picking the door that is neither the prize door nor Alice's door. -For both-avoiding Monty, you'll find that Alice *should* switch. 
+If the responder was a strict utilitarian, s/he would accept any offer of \$1 or more. Assume the proposer is a soft maximizer who wants to keep as much of the \$10 as possible. Complete the code below to find out how much the proposer will offer: ~~~~ -///fold: -var removeBadItems = function(l, badItems) { - return reduce(function(badItem, remainingL) { - return remove(badItem, remainingL) - }, l, badItems); + +var responder = function(offer) { + + // your code here + } -var doors = [1, 2, 3]; -/// +var proposer = Infer({method: "enumerate"}, function(){ + + // your code here + + factor(reward) + return(offer) + }) -var montyAvoidBoth = function(aliceDoor, prizeDoor) { - return Infer({method: 'enumerate'}, function() { - return ... - }) -}; +viz(proposer); +~~~~ -var model = function(switch_cond) { - var aliceDoor = ... - var prizeDoor = ... - var montyDoor = ... - - condition(montyDoor != prizeDoor); - condition(montyDoor != aliceDoor); - - return ... -} +### b) + +People, it turns out, act very differently than the model above suggests. Responders will often reject low offers as "unfair", even though this means they get nothing. Assume that the responder decides whether to accept in proportion to the percentage of the \$10 allocated to her, raised to some power `alpha` (you can think of `alpha` as "spitefulness"). 
Complete the code below to determine how much the proposer should offer: -display("P(win) if Alice doesn't switch"); -viz.auto(Infer({method: 'enumerate'}, function() {model(false)})); -display("P(win) if Alice does switch"); -viz.auto(Infer({method: 'enumerate'}, function() {model(true)})); ~~~~ +var responder = function(offer, alpha) { + var p = Math.pow(offer/10,alpha) + return(flip(p)); +} + +var proposer = Infer({method: "enumerate"}, function(){ + + // your code here + + factor(reward) + return(offer) + }) -### Exercise 2.3 +viz(proposer); +~~~~ -This is unintuitive -- we know that Monty picked door 3, so why should the process he used to arrive at this choice matter? -By hand, complete the probability table for P(Alice, Prize, Monty) under both `montyRandom` and `montyAvoidBoth`. -Your tables should look like: +### c) -Alice's door| Prize door| Monty's Door| P(Alice, Prize, Monty) --------------| -----------| -------------| ----------------------- -1| 1| 1| ... -1| 1| 2| ... -...| ...| ...| ... +You can think of the variable `alpha` in the code above as encoding spitefulness: the degree to which the responder is willing to forego a reward in order to prevent the proposer from having a reward. See how setting `alpha` to 4, 6, 10, 25, and 50 affects what the proposer does. Explain the results. -Using these tables, explain why Alice should switch for both-avoiding Monty but why switching doesn't matter for random Monty. -Hint: you will want to compare particular *rows* of these tables. +### d) -### Exercise 2.4 +The models above assume the proposer knows the responder's decision function. Let's soften that assumption: the proposer knows that the responder's value of `alpha` is somewhere on the range [0.5, 5]. Suppose the proposer offer \$2 and the responder rejects it. What is the most likely level of `alpha`? How does that change if the first offer was \$8? 
-This time, fill in the code so that Monty randomly chooses between the two doors that aren't Alice's door. -What should Alice do now? +(Hint: you may find it helpful to find a different place for `alpha` than within the definition of `responder`.) ~~~~ -///fold: -var removeBadItems = function(l, badItems) { - return reduce(function(badItem, remainingL) { - return remove(badItem, remainingL) - }, l, badItems); +var responder = function(offer, alpha) { + + // your code here + } -var doors = [1, 2, 3]; -/// +var proposer = Infer({method: "MCMC", samples:50000}, function(){ -var montyAvoidAlice = function(aliceDoor, prizeDoor) { - return Infer({method: 'enumerate'}, function() { - return ... - }) -}; + // your code here -var model = function(switch_cond) { - var aliceDoor = ... - var prizeDoor = ... - var montyDoor = ... - - condition(montyDoor != prizeDoor); - condition(montyDoor != aliceDoor); - - return ... -} +]}) -display("P(win) if Alice doesn't switch"); -viz.auto(Infer({method: 'enumerate'}, function() {model(false)})); -display("P(win) if Alice does switch"); -viz.auto(Infer({method: 'enumerate'}, function() {model(true)})); +viz(proposer) ~~~~ +### e) + +Extend the model in (d) as follows: Suppose the proposer and responder are going to play twice. Does it ever make sense for the responder to reject the first proposal in order to increase the total expected payoff across the two games? (If you cannot figure out how to write the model, a verbal description is OK.) -### Exercise 2.5 +## Exercise 4: The Prisoner's Dilemma -This time, fill in the code so that Monty randomly chooses between the two doors that aren't the prize door. -What should Alice do now? +### a + +In the prisoner's dilemma, two thieves work together on a bank heist. Afterwards, they are apprehended by the police. The police interrogate the thieves separately. They tell each thief that if they confess they'll get a lenient sentence. 
If one confesses and the other doesn't, though, the one who doesn't confess will get the maximum sentence of 10 years. If neither confesses, the prosecutors will charge them with some other crime (probably resisting arrest) and they'll each get 5 years. + +What's the longest the lenient sentence can be (in round years) such that it makes sense for the thief to confess (that is, where she has a greater than 50% chance of confessing)? Use `factor(percentYearsFreedom)` where `percentYearsFreedom` is the percentage of the next 10 years the thief will not be in jail. (Assume that this incident has scared her straight and she will not commit any other crimes.) ~~~~ -///fold: -var removeBadItems = function(l, badItems) { - return reduce(function(badItem, remainingL) { - return remove(badItem, remainingL) - }, l, badItems); +var thiefRats = function(){ + return (flip()? true: false) } -var doors = [1, 2, 3]; -/// +var thief = Infer({}, function(){ -var montyAvoidPrize = function(aliceDoor, prizeDoor) { - return Infer({method: 'enumerate'}, function() { - return ... - }) -}; + // your code here -var model = function(switch_cond) { - var aliceDoor = ... - var prizeDoor = ... - var montyDoor = ... - - condition(montyDoor != prizeDoor); - condition(montyDoor != aliceDoor); - - return ... -} +}) -display("P(win) if Alice doesn't switch"); -viz.auto(Infer({method: 'enumerate'}, function() {model(false)})); -display("P(win) if Alice does switch"); -viz.auto(Infer({method: 'enumerate'}, function() {model(true)})); +viz(thief) ~~~~ +### b + +Try using `factor` to make the thieves more maximizing (they are even more likely to make the choice that maximizes their years of freedom). How does this affect the answer to part (a)? + +## Exercise 5: Exploring RSA + +In this exercise, we'll look at the final model of scalar implicature from the main text a bit more. Modify it as necessary.
+ +### a) + +How does increasing the optimality of the speaker affect the pragmatic listener's inferences? Try a couple values and report the results. + +### b) + +Increase the depth to 2. How does that compare to a model with depth of 1? -### Exercise 2.6 +### c) -The psychological question is why do people have the initial intuition that switching shouldn’t matter? -Given your explorations, propose a hypothesis. -Can you think of an experiment that would test this hypothesis? +Is there any way to get ``some'' to refer to 0? Why or why not? \ No newline at end of file diff --git a/index.md b/index.md old mode 100644 new mode 100755 diff --git a/package.json b/package.json old mode 100644 new mode 100755 diff --git a/readings/04.1-agents-as-programs.md b/readings/04.1-agents-as-programs.md old mode 100644 new mode 100755 diff --git a/readings/05-observing-sequences.md b/readings/05-observing-sequences.md old mode 100644 new mode 100755 diff --git a/readings/05.1-sequential-decisions.md b/readings/05.1-sequential-decisions.md old mode 100644 new mode 100755 diff --git a/readings/conditional-dependence.md b/readings/conditional-dependence.md new file mode 100755 index 0000000..f56984f --- /dev/null +++ b/readings/conditional-dependence.md @@ -0,0 +1,67 @@ +--- +layout: exercise +title: Conditional Dependence - Readings +description: Conditional dependence. +--- + +## 1. Models of causal reasoning + +Read "[Hierarchical Bayesian inference in the visual cortex](https://web.archive.org/web/20170810151948id_/http://cvcl.mit.edu/SUNSeminar/LeeMumford2003.pdf)" by Tai Sing Lee and David Mumford. + +#### Reading questions: + +a) Which of Marr's levels are Lee and Mumford targeting? + +Here's what ChatGPT says: + +>In the paper "Hierarchical Bayesian inference in the visual cortex" by Tai Sing Lee and David Mumford, the authors are targeting the computational level of Marr's levels of analysis. 
+ +b) Describe one or two ways in which the model relates to the content from the chapter. + +With a little bit of help, ChatGPT can produce this: + +>The "Hierarchical Bayesian inference in the visual cortex" model by Tai Sing Lee and David Mumford relates to the content in Chapter 4 of Probabilistic Models of Cognition in the following ways: +> +>Bayesian models of cognition: Chapter 4 of Probabilistic Models of Cognition covers Bayesian models of cognition, which are mathematical models that use Bayesian probability theory to explain how people process and make inferences about sensory information. Lee and Mumford's model of hierarchical Bayesian inference in the visual cortex fits within this framework, as it proposes that the visual cortex performs Bayesian inference to infer the most likely explanation of sensory input based on prior knowledge and context. +> +>Hierarchical models: The hierarchical aspect of Lee and Mumford's model is also relevant to the content in Chapter 4, as the chapter covers the use of hierarchical models in probabilistic models of cognition. Hierarchical models enable the modeling of complex, multi-level processes by breaking them down into simpler, more manageable components. In the case of Lee and Mumford's model, the hierarchy refers to the levels of abstraction in the visual cortex, from lower-level features to higher-level object representations. +> +>These similarities highlight how Lee and Mumford's model fits within the broader framework of probabilistic models of cognition, and how it contributes to our understanding of how the visual cortex performs inferences and processes sensory information. + +c) Lee and Mumford's model involves both bottom-up and top-down reasoning. What computational problem does this pose, and how do particle filters help? 
+ +From ChatGPT: + +>The "Hierarchical Bayesian inference in the visual cortex" model by Tai Sing Lee and David Mumford involves both bottom-up and top-down reasoning, which poses a computational challenge. Bottom-up reasoning refers to the processing of sensory information starting from the most basic features, while top-down reasoning refers to the use of prior knowledge and context to guide processing. This combination of bottom-up and top-down reasoning creates a highly complex inference problem, as the visual cortex must balance the influence of sensory data with prior knowledge and context to make accurate inferences about the visual scene. +> +>Particle filters help to address this computational problem by providing a probabilistic framework for combining bottom-up and top-down information. Particle filters are a type of Monte Carlo method that generates a set of particle estimates of the state of a system. In the case of the visual cortex, these particles represent different hypotheses about the explanation of the sensory input, taking into account both bottom-up and top-down information. The visual cortex then uses Bayesian inference to update the particle estimates based on the new sensory information and refine its inferences about the visual scene. +> +>In this way, particle filters provide a way to balance bottom-up and top-down information in a computationally efficient manner, enabling the visual cortex to perform complex inferences about the visual scene based on a combination of sensory data and prior knowledge. + +## Optional + +To learn more about how discussions of causal, statistical, and conditional dependence have informed developmental psychology, read "[A theory of causal learning in children: causal maps and Bayes nets](https://pages.ucsd.edu/~rlevy/gopnik-etal-2004.pdf)" by Alison Gopnik and colleagues. 
+ +## Extras + +* If you are unfamiliar with the terms "top-down" and "bottom-up" processing, read [this](https://www.simplypsychology.org/top-down-processing.html#:~:text=Top%2Ddown%20processing%20is%20perceiving,beliefs%2C%20values%20and%20social%20influences.) and [this](https://www.simplypsychology.org/bottom-up-processing.html#:~:text=Bottom%2Dup%20processing%20is%20the%20process%20of%20'sensation'%2C,organize%2C%20and%20interpret%20these%20sensations). + +* To get a basic overview of the parts of the brain involved in vision, read [the Wikipedia page on the visual cortex](https://en.wikipedia.org/wiki/Visual_cortex) + +If you tell ChatGPT you don't have enough background to understand this paper, here's what it suggests. I can't say that it's wrong, but it's not very helpful: + +>If you don't have a strong background in the topics covered in the paper "Hierarchical Bayesian inference in the visual cortex" by Tai Sing Lee and David Mumford, I recommend starting with the following resources: +> +>Introduction to Bayesian Statistics: Bayesian statistics is the mathematical framework used in the paper, and understanding the basics of Bayesian statistics is essential for comprehending the model. There are many resources available online to help you learn about Bayesian statistics, such as online courses, tutorials, and introductory texts. +> +>Vision and the Visual Cortex: The paper focuses on how the visual cortex performs inferences about visual scenes, so understanding the basics of vision and the visual cortex is also important. You can start by reading introductory texts on vision and the visual cortex, or taking online courses on these topics. +> +>Particle Filters: Particle filters are a specific type of algorithm used in the paper to perform Bayesian inference. Understanding particle filters is key to understanding the computational problem the paper addresses and how it solves it. 
You can learn about particle filters by reading introductory texts, watching videos, or taking online courses. +> +>Once you have a solid understanding of these basics, you will be better equipped to understand the paper "Hierarchical Bayesian inference in the visual cortex" and its contributions to the field. + + + + + + diff --git a/readings/conditioning.md b/readings/conditioning.md old mode 100644 new mode 100755 index 3988592..9b7cdde --- a/readings/conditioning.md +++ b/readings/conditioning.md @@ -17,4 +17,6 @@ b) At which of Marr's levels does Theory Theory operate? * **Theory of Mind**. Theory of Mind is a paradigmatic example of a Theory Theory theory. Readers not familiar with Theory of Mind may find [this short overview useful](https://pdfs.semanticscholar.org/7f87/7e8b42c2a0ef210edff65f572dd43b2e00a8.pdf). -* **Bayesian Inference**. For those finding the notion of conditioning confusing, [this video](https://www.youtube.com/watch?v=5NMxiOGL39M) may be helpful. \ No newline at end of file +* **Bayesian Inference**. For those finding the notion of conditioning confusing, [this video](https://www.youtube.com/watch?v=5NMxiOGL39M) may be helpful. + +* **ProbLang.org: Introduction to WebPPL**. For those still struggling with WebPPL, [this overview](http://www.problang.org/chapters/app-06-intro-to-webppl.html) provides a complementary look at the structure of WebPPL programs. Chapters 1-3 are probably the most useful. \ No newline at end of file diff --git a/readings/dependence.md b/readings/dependence.md old mode 100644 new mode 100755 index f9635a5..fb86c1b --- a/readings/dependence.md +++ b/readings/dependence.md @@ -1,17 +1,31 @@ --- layout: exercise -title: Patterns of inference - Readings -description: Causal and statistical dependence. Conditional dependence. +title: Causal and statistical dependence - Readings +description: Causal and statistical dependence. --- ## 1. 
Models of causal reasoning -In the chapter, we discussed how to reason about causation using generative models. There is a separate question as to how humans actually do reason about causality, and what they mean when they say *A caused B*. Read out ["How, whether, and why"](http://web.mit.edu/tger/www/papers/How,%20whether,%20why%20Causal%20judgments%20as%20counterfactual%20contrasts,%20Gerstenberg%20et%20al.,%202015.pdf) by Tobi Gerstenberg, Noah Goodman, David Lagnado, & Joshua Tenenbaum. (You may find it useful to play around with [this causality sandbox](https://scripts.mit.edu/~tger/csm_interface/). +In the chapter, we discussed how to reason about causation using generative models. There is a separate question as to how humans actually do reason about causality, and what they mean when they say *A caused B*. Read ["A counterfactual simulation model of causal judgments for physical events"](https://cicl.stanford.edu/papers/gerstenberg2021csm.pdf) by Tobi Gerstenberg, Noah Goodman, David Lagnado, & Joshua Tenenbaum. (You may find it useful to play around with [this causality sandbox](https://scripts.mit.edu/~tger/csm_interface/). #### Reading questions: a) According to Gerstenberg and colleagues, how do human notions of causality match the notion of causal dependence discussed in the chapter? How do they differ? -b) Gerstenberg and colleagues rely on a notion of counter-factual simulation. In what ways is this similar to embodiment or other classic simulation-based theories? +For the record, here is what ChatGPT says: + +>Gerstenberg and colleagues proposed a counterfactual simulation model to account for human causal judgments for physical events. According to their model, human notions of causality align with the notion of causal dependence in that both involve considering the necessary and sufficient conditions for an effect to occur. 
+> +>However, the model also suggests that human causal judgments are influenced by a range of factors beyond strict causal dependence, such as heuristics, biases, and prior beliefs. These additional factors can lead to differences between human notions of causality and the formal definition of causal dependence. +> +>For example, humans may rely on heuristics such as temporal proximity or covariation, rather than causal relationships, to make causal judgments. Additionally, humans may make causal judgments based on limited or biased information, whereas the definition of causal dependence requires complete and accurate information about the causal relationships between events. +> +>In conclusion, Gerstenberg and colleagues’ model suggests that human notions of causality match the notion of causal dependence to some extent, but also differ due to the influence of heuristics, biases, and prior beliefs. + +b) Gerstenberg and colleagues rely on reasoning through simulation. In what ways is this similar to embodiment or other classic simulation-based theories? + +Here is what ChatGPT thinks: + +>The "counterfactual simulation model of causal judgments" by Gerstenberg et al. is similar to embodiment and other classic simulation-based theories in that it posits that our understanding of the physical world is based on mental simulations. The authors argue that our causal judgments, or beliefs about what would happen in a hypothetical scenario, are based on simulating the consequences of counterfactual events in our minds. This is similar to embodiment theories, which suggest that our perception and understanding of the world is shaped by the way our body interacts with the environment, and that mental simulations play a key role in this process. Additionally, the simulation-based approach used by Gerstenberg et al. 
is similar to other classic simulation-based theories, such as mental models theory, which suggest that we build internal representations, or mental models, of the world and use these models to understand and predict the outcomes of events. ## Extras diff --git a/readings/generative-models.md b/readings/generative-models.md old mode 100644 new mode 100755 index 196a042..85a4f33 --- a/readings/generative-models.md +++ b/readings/generative-models.md @@ -17,4 +17,6 @@ a) How do Bayesian computational models differ from other kinds of reasoning sys * **Javascript**. Experienced programmers who need an introduction to Javascript may find the [appendix on Javascript](13-appendix-js-basics.html) sufficient. Students who have limited experience programming are encouraged to check out [*Introduction to JavaScript*](https://www.codecademy.com/learn/introduction-to-javascript) from Codeacademy (esp. Chapters 1-8). -* **Background in Mathematics**. This textbook requires a basic understanding of probability theory. The Khan Academy has a [gentle introduction](https://www.khanacademy.org/math/statistics-probability/probability-library) that should be sufficient for most students. \ No newline at end of file +* **Background in Mathematics**. This textbook requires a basic understanding of probability theory. The Khan Academy has a [gentle introduction](https://www.khanacademy.org/math/statistics-probability/probability-library) that should be sufficient for most students. + +* **More WebPPL**. There is also a textbook for WebPPL itself: [The Design and Implementation of Probabilistic Programming Languages](http://dippl.org/). If you are struggling with WebPPL, the first three chapters may be helpful.
\ No newline at end of file diff --git a/readings/hierarchical-models.md b/readings/hierarchical-models.md old mode 100644 new mode 100755 index fe43ba2..3eb4104 --- a/readings/hierarchical-models.md +++ b/readings/hierarchical-models.md @@ -13,7 +13,50 @@ description: The power of abstraction. a) What is a linguistic alternation, such as the dative alternation? +See if ChatGPT's answer roughly matches what you learned from the paper. If not, what would you correct? + +> A linguistic alternation is a phenomenon in language where different syntactic structures are used to express the same meaning. One example of a linguistic alternation is the dative alternation, also known as the double object construction. + +>The dative alternation involves the use of two different syntactic structures to convey the same meaning, specifically the transfer of an object from one person or thing to another. In one structure, the indirect object (the recipient) appears as a prepositional phrase introduced by the preposition "to", while the direct object (the transferred object) appears as a noun phrase. In the other structure, the indirect object appears as a noun phrase, while the direct object appears as a prepositional phrase introduced by the preposition "to". + +>For example, in the sentence "I gave the book to Mary," "Mary" is the indirect object introduced by the preposition "to," while "book" is the direct object. In the dative alternation, the same meaning can be expressed using the sentence "I gave Mary the book," where "Mary" is the indirect object expressed as a noun phrase, and "book" is the direct object expressed as a prepositional phrase. + +>The dative alternation is one of many syntactic alternations in English, and it is found in many other languages as well. Understanding these alternations is important for language learners and researchers alike, as they provide insight into the underlying structure and organization of language. 
+ b) What is the Problem of No Negative Evidence? +Based on what you read in the paper, can you improve on ChatGPT's answer? Or is it basically right? + +> The problem of no negative evidence, also known as the poverty of the stimulus problem, is a well-known issue in language acquisition and linguistic theory. + +> In essence, the problem arises from the fact that children seem to be able to learn language incredibly quickly and accurately, even though they are not provided with explicit negative feedback on incorrect language usage. That is, children are not explicitly told which sentences or grammatical structures are incorrect, yet they are still able to learn the rules of their language with remarkable speed and accuracy. + +>This has led some researchers to argue that there must be some innate knowledge or ability in the human brain that allows us to learn language so easily. Others have suggested that children are able to learn language because they are exposed to a rich, complex input of language data, which provides them with enough positive evidence to induce the grammatical rules of their language. + +>Despite ongoing research and debate, the problem of no negative evidence remains unsolved. However, it continues to be a topic of great interest and importance in the fields of linguistics, cognitive psychology, and artificial intelligence, as it sheds light on the mechanisms of language acquisition and the nature of human cognition more broadly. + c) How does a hierarchical Bayesian model ameliorate the Problem of No Negative Evidence? +ChatGPT definitely bungles this one. Can you help? + +> A hierarchical Bayesian model is a statistical model that can be used to explain how humans learn language in the absence of explicit negative feedback. In the context of the problem of no negative evidence, a hierarchical Bayesian model can ameliorate this problem by incorporating prior knowledge about language and using this knowledge to guide the learning process. 
+ +>The model assumes that the child has prior beliefs about the structure of their language, which are represented in the form of a probabilistic grammar. This grammar encodes the child's beliefs about the distribution of different syntactic structures in their language. When the child is exposed to language data, the model updates its beliefs about the grammar based on the observed data. + +>The key innovation of the hierarchical Bayesian model is that it allows the child to learn from both positive and negative evidence in a principled way. The model assumes that the child is uncertain about the true structure of their language, and this uncertainty is captured by a probability distribution over possible grammars. When the child observes a sentence, the model evaluates the probability of that sentence under each possible grammar, and updates the probability distribution over grammars accordingly. + +>Importantly, the model can use negative evidence, or evidence that a particular sentence is ungrammatical, to rule out certain grammars and narrow down the set of possible hypotheses. This is because ungrammatical sentences will have low probability under some grammars, while grammatical sentences will have high probability under all grammars. By ruling out certain grammars, the model can gradually converge on the correct grammar for the child's language. + +>Overall, the hierarchical Bayesian model provides a principled way to learn language in the absence of explicit negative feedback. By incorporating prior knowledge and using probabilistic reasoning, the model can learn from both positive and negative evidence and converge on the correct grammar for the child's language. + +# Extras + +## Psychology / Linguistics + +* While the problem of negative evidence is often associated with Nativist theories, it was first popularized by Melissa Bowerman, a dyed-in-the-wool Empiricist. 
Her original paper remains a classic: [The 'no negative evidence' problem: How do children avoid constructing an overly general grammar?](https://pure.mpg.de/rest/items/item_468143/component/file_532427/content). +* To get a sense of verb alternations, there's really nothing better than looking at [VerbNet](https://uvi.colorado.edu/). VerbNet is an online database that categorizes verbs into classes depending on which syntactic frames they can use (the two forms of the dative alternation are examples of two frames). + +## Computer Science + +* Perfors and colleagues also wrote a tutorial on [Bayesian models of cognition development](https://www.sciencedirect.com/science/article/pii/S001002771000291X), which includes a useful explanation of overhypotheses / hierarchical models. +* Also useful is [Learning overhypotheses with hierarchical Bayesian models](https://onlinelibrary.wiley.com/doi/full/10.1111/j.1467-7687.2007.00585.x), by many of the same authors. \ No newline at end of file diff --git a/readings/inference-algorithms.md b/readings/inference-algorithms.md old mode 100644 new mode 100755 index d547057..25e77d9 --- a/readings/inference-algorithms.md +++ b/readings/inference-algorithms.md @@ -4,28 +4,48 @@ title: "Algorithms for Inference - readings" description: "MCMC, Gibbs, Metropolis Hastings, Particle Filters, Variational Bayes" --- -## 1. Discussion of MCMC +## 1. Algorithmic-level theories -@T.L.Griffiths:2008:dd194 - Sec. 5.0 Markov Chain Monte Carlo (pp. 31-34) +Read "[Evolution in mind: Evolutionary dynamics, cognitive processes, and Bayesian inference](https://suchow.io/assets/docs/suchow2017tics.pdf)" by Jordan Suchow, David Bourgin, and Tom Griffiths. #### Reading questions: -a) Under what conditions is it *not* necessary to use an approximate sampling method to solve a Bayesian equation? 
+a) Suchow and colleagues suggest particle filters are a useful way of thinking about maintenance in working memory, that in fact particle filters are simply a good engineering design for working memory. Why would rejection sampling not work? What about Metropolis-Hastings? -b) What are the major differences between Gibbs sampling and Metropolis-Hastings sampling? +ChatGPT actually makes some useful points about the three sampling methods, but doesn't really tie the argument together. Still, you might find it a useful place to start: + +> Suhcow and colleagues suggest that particle filters, which are commonly used in the field of engineering for state estimation and tracking, can be seen as a useful model for understanding maintenance in working memory. In this model, working memory is represented as a set of particles, or possible states, that are continuously updated based on incoming sensory information. + +> Rejection sampling is a technique used in statistics to generate samples from a probability distribution. It involves sampling from a proposal distribution and then rejecting samples that do not meet certain criteria. While this approach could potentially be used to model working memory maintenance, it is less efficient than particle filters because it requires a large number of samples to be generated in order to obtain an accurate representation of the underlying distribution. + +> Metropolis-Hastings is another statistical technique used to generate samples from a probability distribution. It involves iteratively generating new samples based on a proposal distribution and accepting or rejecting them based on a specified acceptance criterion. While this approach could be used to model working memory maintenance, it can be computationally expensive and requires careful tuning of the proposal distribution and acceptance criterion to ensure accurate sampling. 
+ +> In contrast, particle filters are designed to efficiently track and estimate the state of a system over time, and are well-suited to modeling the continuous updates and maintenance processes involved in working memory. They are computationally efficient and do not require as much fine-tuning as other statistical techniques, making them a useful engineering design for working memory. + +PS ChatGPT really did misspell "Suchow". It is not entirely clear what to make of that. -## 2. Particle filters +b) Suchow and colleagues suggest that Metropolis-Hastings may be a useful way of thinking about creativity, that in fact Metropolis-Hastings may be a useful way of *instantiating* creativity. Why would it work better than rejections sampling? Than particle filters? -[Particle Filters Explained without Equations](https://www.youtube.com/watch?v=aUkBa1zMKv4) +ChatGPT's answer was pretty similar in form to the one for (a), so it is not copied here. -#### Viewing questions: +## 2. Discussion of MCMC -a) As the number of particles increases, what happens to a particle filter's accuracy? What happens to its run-time? Would you want an infinite number of particles? Why or why not? +Read Sec. 5 (``Markov Chain Monte Carlo'') of [Bayesian models of cognition](https://kilthub.cmu.edu/articles/journal_contribution/Bayesian_models_of_cognition/6613682/1/files/12106358.pdf) by Tom Griffiths, Charles Kemp, and Josh Tenenbaum. -b) Describe a phenomenon that particle filters be particularly good for modeling. Why do you think a particle filter would be helpful? +#### Reading questions: + +a) Under what conditions is it *not* necessary to use an approximate sampling method to solve a Bayesian equation? + +b) What are the major differences between Gibbs sampling and Metropolis-Hastings sampling? 
## Extras +### Extra modeling +* **[Empirical evidence for Markov Chain Monte Carlo in Memory Search](https://escholarship.org/content/qt72r6n6cn/qt72r6n6cn.pdf)** A short paper describing a model closely related to the memory model discussed by Suchow and colleagues. Although it's short, it goes into the math in a bit more detail, which may be helpful. + ### Extra math -**Algorithms for Inference** For a somewhat longer, mathier disucssion of MCMC algorithms, see @andrieu2003introduction. +* **Algorithms for Inference** For a somewhat longer, mathier discussion of MCMC algorithms, see @andrieu2003introduction. + +* **[Gibbs sampling for the uninitiated](http://users.umiacs.umd.edu/~resnik/pubs/LAMP-TR-153.pdf)** Gibbs sampling is not really covered in this chapter, but it is important. Readers who want to know more can consult this text. +* **More WebPPL**. The WebPPL textbook, [*The Design and Implementation of Probabilistic Programming Languages*](http://dippl.org/), has chapters on [Particle Filtering](http://dippl.org/chapters/05-particlefilter.html) as well as [Markov Chain Monte Carlo](http://dippl.org/chapters/06-mcmc.html) (MCMC). \ No newline at end of file diff --git a/readings/introduction.md b/readings/introduction.md old mode 100644 new mode 100755 diff --git a/readings/learning-as-conditional-inference.md b/readings/learning-as-conditional-inference.md index d91a647..2db9c05 100644 --- a/readings/learning-as-conditional-inference.md +++ b/readings/learning-as-conditional-inference.md @@ -3,16 +3,23 @@ layout: exercise title: Learning as Infernece - readings -## 1. Learning numbers +## 1. Word-learning -@piantadosi2012bootstrapping +*Note that if you aren't familiar with plate notation (you'd know if you were), you'll want to read the "extra" on plate notation below before starting the paper.* -This is a long paper. Focus on the first 11 pages (through and including Sec. 4.1). 
+Read "[Using speakers' referential intentions to model early cross-situational word learning](https://journals.sagepub.com/doi/pdf/10.1111/j.1467-9280.2009.02335.x?casa_token=d6XWPJFydQsAAAAA:jT9BMvSjSZtFbvQRhrBr_pAUdkMsb_lkHHpZyHJad1om08ddsksv7qR8snr1ySEOjUfUuRZDsEKo)" by Frank, Goodman, & Tenenbaum. + #### Reading questions: -a) What stages do children go through in learning numbers? +a) What is the implicit model of the speaker? First, the speaker does what? Then what? Etc. -b) Why does the model go through several stages before finally learning the number system? +b) What are the key differences between the intentional model and the association frequency model? The conditional probability model? - \ No newline at end of file +## Extras + +* **Plate Notation**: The model is described using plate notation. If this is unfamiliar, read this [description from Wikipedia](https://en.wikipedia.org/wiki/Plate_notation). + +* **Word-learning biases**: The paper does a pretty good job of briefly explaining basic phenomena in word-learning like mutual exclusivity. For a bit more detail, see the Wikipedia article on [Word learning biases](https://en.wikipedia.org/wiki/Word_learning_biases). + +* **More Word-learning biases**: For a deeper dive into the theoretical and empirical background informing this paper, read Markman (1990) [Constraints children place on word meanings](https://onlinelibrary.wiley.com/doi/pdf/10.1207/s15516709cog1401_4). \ No newline at end of file diff --git a/readings/lot-learning.md b/readings/lot-learning.md new file mode 100644 index 0000000..ecd8593 --- /dev/null +++ b/readings/lot-learning.md @@ -0,0 +1,24 @@ +--- +layout: exercise +title: Learning with a Language of Thought - readings +--- + +## 1. Learning numbers + +[Bootstrapping a language of thought](https://www.sciencedirect.com/science/article/pii/S0010027711002769) by Piantadosi and colleagues. + +This is a long paper. Focus on the first 11 pages (through and including Sec. 4.1). 
+ +#### Reading questions: + +a) What stages do children go through in learning numbers? + +b) Why does the model go through several stages before finally learning the number system? What aspects of the model allow it to recapitulate the same stages humans go through? + +## Extras + +a) For a more thorough review of how children learn numbers, see "[Learning to represent exact numbers](https://link.springer.com/article/10.1007/s11229-015-0854-6)" by Barbara Sarnecka. + +b) A highly related paper is ["Human-level concept learning through probabilistic program induction"](https://www.science.org/doi/full/10.1126/science.aab3050). This isn't a language of thought model but applies very similar ideas to the problem of reading handwriting. + + \ No newline at end of file diff --git a/readings/mixture-models.md b/readings/mixture-models.md new file mode 100755 index 0000000..0ef17da --- /dev/null +++ b/readings/mixture-models.md @@ -0,0 +1,24 @@ +--- +layout: exercise +title: "Mixture Models - readings" +description: "Introduction" +--- + +## 1. A Mixture Model of Visual Working Memory + +Read "[Contextual effects in visual working memory reveal hierarchically structured memory representations](https://jov.arvojournals.org/article.aspx?articleid=2471226)" by Timothy Brady and George Alvarez. This paper was a substantial improvement on prior theories of working memory. In those theories, working memory consisted of a discrete number of slots. Using working memory involved sticking pieces of information into a slot until you ran out of slots. Slots could sometimes lose the piece of information (either through failure to maintain or by being kicked out by some new piece of information; the story varied by theory). This theory worked well for some things but failed to capture some really obvious facts about memory. + +#### Reading questions: +a) What kinds of phenomena can this model account for that a simple item-based model cannot? 
+ +b) Figure 2A models subjects' responses as a mixture of what? (Don't just copy-and-paste what it says in the figure caption. Explain it in your own words. You may find it helpful to give an example or a metaphor.) + +c) According to the description of the "structure representations/hierarchical Bayesian model" in the appendix, that model is both hierarchical and a mixture model. What is the hierarchical part doing? What is the mixture part doing? (That is, if you eliminated the hierarchical component, what would happen? If you eliminated the mixture part, what would happen?) Your answer here can be broad strokes; it need not be technical. + +d) Which of Marr's levels is this paper investigating? Justify your answer. + +## Extras + +* **Background in Psychology**. If you aren't familiar with the classic distinction between sensory/iconic memory, short-term/working memory, and long-term memory, read [this wikipedia article](https://en.wikipedia.org/w/index.php?title=Memory&oldid=1146564225) up until you reach the section "Types". "[Discrete capacity limits in visual working memory](https://www.sciencedirect.com/science/article/pii/S0959438810000437)" provides some context on the debate that the Brady paper is embedded in. If you want a deeper dive into the science of visual memory, "[Visual memory, the long and the short of it: A review of visual working memory and long-term memory](https://link.springer.com/article/10.3758/s13414-018-1522-y)" by Mark W. Schurgin is useful. + +* **Math and Statistics**. For a more general discussion of mixture models, the [Wikipedia page](https://en.wikipedia.org/w/index.php?title=Mixture_model&oldid=1141109201) is a little dense but has useful links to many related articles and some nice (if succinctly described) examples. 
\ No newline at end of file diff --git a/readings/process-models.md b/readings/process-models.md index 7a2dfab..8264a5c 100644 --- a/readings/process-models.md +++ b/readings/process-models.md @@ -4,6 +4,43 @@ title: "Rational process models - readings" description: "The psychological reality of inference algorithms." --- +## 1. [Partial mental simulation explains fallacies in physical reasoning](https://par.nsf.gov/servlets/purl/10350504) by Ilona Bass, Kevin A. Smith, Elizabeth Bonawitz, \& Tomer D. Ullman + +##### a) What is the "physical conjunction fallacy"? In what way is it a fallacy? + +[chatGPT misreads "physical conjunction fallacy" as just "conjunction fallacy" and gives a basic explanation of what a conjunction fallacy is. It's not an interesting answer, so it's not included here.] + +##### b) The authors argue that humans have limited cognitive resources and so must rely on carefully-optimized partial simulation when answering physics questions. How does this help explain the physical conjunction fallacy? + +Here, chatGPT actually starts off well and then goes off the rails: + +> Partial simulation involves mentally representing only a subset of the features or variables relevant to a physical event, rather than a complete and accurate representation of all the variables involved. This allows humans to quickly and efficiently reason about complex physical situations, but it can also lead to errors in judgment when the partial simulation omits important variables or interactions. +> +>In the case of the physical conjunction fallacy, people may rely on partial simulations that emphasize the similarity between two events or the salience of their conjunction, rather than their individual probabilities. 
For example, a person may mentally simulate the characteristics of a vegetarian marathon runner and find it a more plausible scenario than a vegetarian who is not a marathon runner, because the conjunction of the two events seems more coherent or vivid in their minds. +> +>Thus, the physical conjunction fallacy can be seen as a result of the limitations of human cognitive resources and the heuristics that people use to reason about complex physical situations. By understanding these limitations and biases, we can work to improve our reasoning skills and avoid common errors in judgment. + +(If you are wondering what happened here, try asking chatGPT about the conjunction fallacy.) + +## 2. [One and Done? Optimal Decisions From Very Few Samples](https://onlinelibrary.wiley.com/doi/full/10.1111/cogs.12101). Vul, Goodman, Griffiths, Tenenbaum (2014). Cognitive Science. + +##### a) A basic fact about sampling methods is that the more samples one takes, the more accurate the answer. Why would it ever be *rational* to not take large numbers of samples? + +chatGPT is again sometimes spot-on, and sometimes deeply confused: + +> In Bayesian models of cognition, it is true that taking more samples generally leads to more accurate answers. However, it is not always rational or feasible to take large numbers of samples. There are several reasons why taking a large number of samples may not be rational or necessary: +> +> Time and resource constraints: In some situations, it may simply not be feasible to take a large number of samples due to time or resource constraints. For example, if you are making a decision in a high-pressure, time-sensitive situation, you may not have the luxury of taking a large number of samples. +> +> Diminishing returns: Taking additional samples beyond a certain point may not result in a significant improvement in accuracy, as the additional samples may not contain much new information. 
In other words, there may be diminishing returns to taking more samples. +> +> Prior knowledge: In some cases, prior knowledge about the situation can be used to inform the sampling process, allowing for a more efficient use of resources. For example, if you are making a decision about whether to invest in a particular stock, you may be able to use your prior knowledge of the stock market to narrow down the range of possible outcomes, and thus require fewer samples to arrive at a decision. +> +> Risk tolerance: Taking more samples may involve more risk, as it may expose you to more uncertainty and variability in the data. In some cases, it may be more rational to make a decision based on a smaller number of samples in order to reduce the risk of being exposed to extreme or outlier data points. +> +> Overall, while taking more samples can improve the accuracy of Bayesian models of cognition, it is not always rational or necessary to take large numbers of samples. The decision of how many samples to take should be based on a careful consideration of the specific situation, including time and resource constraints, the potential for diminishing returns, prior knowledge, and risk tolerance. + + - -# Evolutionary algorithms, Bayesian inference, and the mind - -@suchow2017evolution - -## Reading questions: - -a) In what way is evolutionary dynamics like Bayesian inference? - -b) A number of different inference algorithms are discussed. What are the consequences of one of them being used for a particular process (like working memory) as opposed to another one? +## Extras +#### Extra Psychology +* Wikipedia has a brief discussion of the [conjunction fallacy](https://en.wikipedia.org/wiki/Conjunction_fallacy) +* Wikipedia also has a [reasonably complete list of commonly-discussed biases](https://en.wikipedia.org/wiki/List_of_cognitive_biases), with links to more complete discussions. 
+* The two papers above are part of a long-running debate in the literature about just how rational human cognition is. To get a sense of the other side in this debate, read "[How robust are probabilistic models of higher-level cognition](https://journals.sagepub.com/doi/pdf/10.1177/0956797613495418?casa_token=az-oW__aiZcAAAAA:efPp3X1NEL8Us-vsyVorN2FD3Nmh1fFby3VeTwN1CygfpYxVgkCtbK3pdHQWV5eqwh0IS5Fre9S4)" by Ernie Davis and Gary Marcus. \ No newline at end of file diff --git a/readings/social-cognition.md b/readings/social-cognition.md old mode 100644 new mode 100755 index cb2ea2c..fc18992 --- a/readings/social-cognition.md +++ b/readings/social-cognition.md @@ -1,9 +1,39 @@ --- layout: exercise -title: Inference about inference - readings +title: Social Cognition - readings --- -## 1. Natural pedagogy +## 1. Generics & Rational Speech Acts + +Read [this chapter](http://www.problang.org/chapters/07-generics.html) from another WebPPL textbook that extends the scalar implicature model we saw in the chapter to handle generics. + +#### Reading questions: +a) H. Paul Grice argued that listeners believe speakers will be as informative as possible. One way to measure informativity is in terms of the degree to which the listener's beliefs would be updated (presumably correctly). How does the notion of 'informativity' help the model determine a meaning for generics? + +From ChatGPT: + +> Informativity refers to the amount of new information conveyed by a statement. In the context of generics, informativity is used to determine the likelihood of a statement being true. A more informative generic statement is one that is true for a higher percentage of the members of the category being referred to, while a less informative statement is one that is only true for a smaller percentage of the category. +> +> The scalar implicature model uses informativity to determine the meaning of generics by considering the probability of a statement being true based on its informativity. 
The model reasons that a more informative generic statement is more likely to be true and uses this information to determine the meaning of the statement. For example, if the statement "Dogs are loyal" is more informative, meaning that a higher percentage of dogs are indeed loyal, the model will use this information to determine the meaning of the statement. + +b) How does this model reinterpret the relationship between what a sentence "literally" means, and what it implies? + +ChatGPT tries really, really hard on this one: + +> In the scalar implicature model, the relationship between the literal meaning of a sentence and what it implies is reinterpreted in terms of probabilistic reasoning. In this model, the literal meaning of a sentence is represented as a probability distribution over the space of possible meanings, and the implicatures are computed based on these probabilities and the context in which the sentence is used. + +> For example, consider the sentence "Some dogs are loyal." The literal meaning of this sentence is represented as a probability distribution over the set of all possible worlds, with each world being a possible scenario in which the sentence is true. The model would use this probability distribution to compute the scalar implicatures of the sentence, which represent the additional information that is conveyed by the sentence beyond its literal meaning. + +> In this way, the scalar implicature model reinterprets the relationship between the literal meaning of a sentence and its implicatures as a probabilistic relationship. The literal meaning of a sentence is represented as a probability distribution over possible meanings, and the implicatures are computed based on this distribution and the context in which the sentence is used. 
The result is a more nuanced understanding of the relationship between what a sentence "literally" means and what it implies, based on probabilistic reasoning and taking into account the context in which the sentence is used. + +c) To understand "birds lay eggs," the model assumes that people know the actual distributions of various properties across the animal kingdom. This is not realistic. Does that invalidate the model? + +ChatGPT's thoughts on the matter: + +> This limitation of the model does not necessarily invalidate the model, but rather highlights the fact that models of language understanding, like any other models, make simplifying assumptions that may not perfectly reflect reality. The purpose of models like this is to provide a simplified representation of reality that allows for testing and exploration of certain questions and theories related to language understanding. + + +## 2. Natural pedagogy @Shafto:2012by @@ -11,6 +41,48 @@ title: Inference about inference - readings a) The model in the reading invokes a theory of mind (learners are reasoning about teachers' mental states) but a relatively simple theory of mind (the kinds of mental states ascribed are not complex). Would this be sufficient for learning more complex ideas (like mathematics or how to tune a bicycle)? If not, what might you want to add to the learner's theory of mind? +As usual, ChatGPT has some thoughts: + +> The model in "Learning From Others: The Consequences of Psychological Reasoning for Human Learning" by Pat Shafto and colleagues uses a relatively simple theory of mind, which focuses on reasoning about teachers' mental states to facilitate learning. However, this simple theory of mind may not be sufficient for learning more complex ideas such as mathematics or how to tune a bicycle. 
To enhance the learner's theory of mind in these cases, additional components may be necessary, such as the ability to infer the teacher's goals, beliefs, and intentions, as well as an understanding of how the teacher's mental states relate to the task at hand. Additionally, the ability to reason about one's own mental states, such as self-awareness, may also play a role in learning complex ideas. + +## 3. Adjectives & Rational Speech Acts + +Start by reading @schmidt2009tall, which provides some background on the basic problem. Then read @lassiter2017adjectival to see how the Rational Speech Act model can be applied to this problem. + +#### Reading questions: +a) How does the notion of 'informativity' help Lassiter and Goodman's model determine the meaning for gradable adjectives? + +b) Do you think Lassiter and Goodman's model should be understood as a processing model that explains how people interpret gradable adjectives in real time? Why or why not? + +## 4. Making decisions about ethics. + +In the main chapter, the scenarios the agent had to consider were very simple. Real-life decision-making is much more complex. Read "[Inference of intention and permissibility in moral decision making](https://mindmodeling.org/cogsci2015/papers/0199/paper0199.pdf)" by Max Kleiman-Weiner, Tobias Gerstenberg, Sydney Levine, and Joshua Tenenbaum. + +#### Reading questions: +a) How do influence diagrams allow the agent to reason about actions? What do these allow that goes beyond what we saw in the chapter? + +ChatGPT provides an excellent example of an answer that is both more-or-less true and completely useless: + +> Influence diagrams are a graphical representation of the probabilistic relationships between variables in a decision-making process. + +> In this paper, the authors use influence diagrams to model an agent's reasoning about actions in moral decision-making scenarios. 
The influence diagrams allow the agent to reason about the relationships between different factors that influence its decision-making process, such as the intention behind an action, the permissibility of an action, and the outcomes of an action. By using influence diagrams, the agent can take into account multiple factors and their probabilistic relationships when making a decision, leading to a more nuanced understanding of the decision-making process. + +> Influence diagrams allow the agent to reason about actions in a way that goes beyond what was presented in the Social Cognition chapter of Probabilistic Models of Cognition. While the scenarios in the Social Cognition chapter were relatively simple, influence diagrams provide a more flexible and expressive way of modeling the relationships between variables in more complex decision-making scenarios. This allows the agent to take into account multiple factors and their probabilistic relationships when making a decision, leading to a more nuanced understanding of the decision-making process. + +> In conclusion, the use of influence diagrams in moral decision making allows an agent to reason about actions in a more complex and nuanced way, taking into account multiple factors and their probabilistic relationships. This provides a more flexible and expressive way of modeling the relationships between variables in decision-making scenarios, beyond what was presented in the Social Cognition chapter of Probabilistic Models of Cognition. + +b) What theoretical questions was the model intended to explore? What modeling assumptions are tangential to the investigation? + +ChatGPT's answer: + +> To explore these questions, the authors made a number of modeling assumptions. First, they assumed that people make decisions based on the influence of multiple factors, such as the intention behind an action, the permissibility of an action, and the outcomes of an action. 
Second, they assumed that the relationships between these factors are probabilistic in nature. Third, they assumed that the decision-making process is influenced by prior beliefs and biases. + +> These assumptions are tangential to the investigation as they form the basis for the model and help to capture the complexity of moral decision making. By taking into account multiple factors and their probabilistic relationships, the model provides a more nuanced understanding of the decision-making process, beyond what would be possible with simpler models. + ## Extras -### Extra math -**Theory of Mind**. For more on Theory of Mind, see @carlson:2013theory. \ No newline at end of file +### Extra psychology +* **Theory of Mind**. For more on Theory of Mind, see @carlson:2013theory. + +* **Gricean Theory**. The scalar implicature theory (and RSA in general) are derived from earlier philosophical work by H. Paul Grice. The wikipedia article on the [Cooperative Principle](https://en.wikipedia.org/wiki/Cooperative_principle) provides an excellent overview of Gricean Theory. "[Experimental pragmatics: A Gricean turn in the study of language](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.233.1679&rep=rep1&type=pdf)" by Ira Noveck and Anne Reboul provides a very accessible discussion of psychological research up through 2008. + +* **Expected Utility**. The Stanford Encyclopedia of Philosophy provides a [useful introduction](https://plato.stanford.edu/entries/rationality-normative-utility/) to the notion of "expected utility", which plays a major role in many approaches to decision-making. 
diff --git a/solutions/04.1-agents-as-programs.md b/solutions/04.1-agents-as-programs.md deleted file mode 100644 index 7d69e7f..0000000 --- a/solutions/04.1-agents-as-programs.md +++ /dev/null @@ -1,424 +0,0 @@ ---- -layout: exercise -title: Agents as Probabilistic Programs - exercises -custom_js: -- assets/js/box2d.js -- assets/js/physics.js ---- - -## Exercise 1: Factors - -### a) - -*Take our standard coin-flipping model. Use `factor` to create a "soft" condition on the outcome being heads, such that there is an approx. 95% chance of heads.* - -```js -var dist = Infer({method: 'enumerate'}, - function () { - var A = flip() - factor(A*3) //edit this line - return A -}); -viz(dist) -``` - -![](Figures/agents-as-programs-1.png) - -This is actually quite close to 95%: - -`{"probs":[0.04742587317756678,0.9525741268224333],"support":[false,true]}` - - -### b) - -In this model, we flip 3 coins. Use `factor` to favor an outcome of 2 heads and 1 tails: - -```js -var softHeads = Infer({}, function() { - var a = flip(0.5); - var b = flip(0.5); - var c = flip(0.5); - factor(1*((a+b+c)==2)); - return a; - } -}); - -viz(softHeads); -``` - -![](Figures/agents-as-programs-2.png) - -## Exercise 2: The Ultimatum Game - -### a) - -*The ultimatum game requires two players: A proposer and a responder. The proposer has to decide how to allocate \$10 between the two players in \$1 increments. Once this proposal is made, the responder decides whether to accept the proposal. If the responder accepts, both players are awarded the money according to the proposal. If the responder rejects, neither player gets anything.* - -*If the responder was a strict utilitarian, s/he would accept any offer of \$1 or more. Assume the proposer is a soft maximizer who wants to keep as much of the \$10 as possible. Complete the code below to find out how much the proposer will offer:* - -~~~~ -var responder = function(offer) { - - return (offer>0 ? 
true : false); - -} - -var proposer = Infer({method: "enumerate"}, function(){ - - var offer = uniformDraw([0,1,2,3,4,5,6,7,8,9,10]); - var reward = responder(offer) ? (10 - offer) : 0; - - factor(reward) - return(offer) - }) - -viz(proposer); -~~~~ - -![](Figures/agents-as-programs-3.png) - -### b) - -*People, it turns out, act very differently than the model above suggests. Responders will often reject low offers as "unfair", even though this means they get nothing. Assume that the responder decides whether to accept in proportion to the percentage of the \$10 allocated to her, raised to some power `alpha` (you can think of `alpha` as "spitefulness"). Complete the code below to determine how much the proposer should offer:* - -```js -var alpha = 2 - -var responder = function(offer, alpha) { - var p = Math.pow(offer/10,alpha) - return(flip(p)); -} - -var proposer = Infer({method: "enumerate"}, function(){ - var offer = uniformDraw([0,1,2,3,4,5,6,7,8,9,10]); - var reward = responder(offer,alpha) ? (10 - offer) : 0; - factor(reward) - return(offer) - }) - -viz(proposer); -``` - -![](Figures/agents-as-programs-4.png) - -### c) - -*You can think of the variable `alpha` in the code above as encoding spitefulness: the degree to which the responder is willing to forego a reward in order to prevent the proposer from having a reward. See how setting `alpha` to 4, 6, 10, 25, and 50 affects what the proposer does. Explain the results.* - -~![](Figures/agents-as-programs-5-1.png) -~![](Figures/agents-as-programs-5-2.png) -~![](Figures/agents-as-programs-5-3.png) -~![](Figures/agents-as-programs-5-4.png) -~![](Figures/agents-as-programs-5-5.png) - -As alpha increases, the responder becomes increasingly unlikely to accept any offer less than \$10. Thus, no matter what the proposer offers, she'll probably end up with \$0. This makes her indifferent to the choice. - -### d) - -*The models above assume the proposer knows the responder's decision function. 
Let's soften that assumption: the proposer knows that the responder's value of `alpha` is somewhere on the range [0.5, 5]. Suppose the proposer offered \$2 and the responder rejects it. What is the most likely level of `alpha`?* - -(Hint: you may find it helpful to find a different place for `alpha` than within the definition of `responder`.) - -```js -var responder = function(offer, alpha) { - var p = Math.pow(offer/10,alpha) - return(flip(p)); -} - -var proposer = Infer({method: "MCMC", samples:50000}, function(){ - var alpha = uniform(0.5,5) - var offer = 2; - var reward = responder(offer, alpha) ? (10 - offer) : 0; - condition(reward==0) - return(alpha) -}) - -viz(proposer) -``` - -![](Figures/agents-as-programs-6.png) - - -### e) - -*Again, suppose the proposer offered \$2 and the responder rejected it. Suppose they are going to play a second round. How much should the proposer offer? How does this change if the first (rejected) offer was \$8?* - -Here is a straight-forward if not especially computationally-efficient model: - -```js -var responder = function(offer, alpha) { - var p = Math.pow(offer/10,alpha) - return(flip(p)); -} - -var proposer1 = Infer({method: "MCMC", samples:50000}, function(){ - var alpha = uniform(0.5,5) - var offer1 = 2 - var reward1 = responder(offer1, alpha) ? (10 - offer1): 0; - condition(reward1==0) - return(alpha) -}) - -var makeoffer = Infer({method: "forward", samples:1000}, function(){ - - var alpha2 = sample(proposer1) - - var proposer2 = Infer({method: "MCMC", samples:5000}, function(){ - var offer2 = uniformDraw([0,1,2,3,4,5,6,7,8,9,10]); - var reward2 = responder(offer2, alpha2) ? (10 - offer2) : 0 - factor(reward2) - return(offer2) - }) - - return sample(proposer2) -}); - -viz(makeoffer) -``` - -With offer1 = 2: - -![](Figures/agents-as-programs-7-1.png) - -With offer1 = 8: - -![](Figures/agents-as-programs-7-2.png) - -The differences are underwhelming. 
The reason is `factor(reward2)` actually puts a lot of pressure on the proposer getting a large payout. If we change `factor(reward2)` to `factor(Math.pow(reward2,1))`, we get more impressive differences. - -With offer1 = 2: - -![](Figures/agents-as-programs-7-3.png) - -With offer1 = 8: - -![](Figures/agents-as-programs-7-4.png) - -## Exercise 3: The Prisoner's Dilemma - -*In the prisoner's dilemma, two thieves work together on a bank heist. Afterwards, they are apprehended by the police. The police interrogate the thieves separately. They tell each thief that if she confesses, she will get a lenient sentence. If not, she will get 10 years. However, the thieves know that the police need at least one of them to confess; if neither of them confesses, the police don't have enough evidence to charge them, and they will both go free.* - -*What's the longest the lenient sentence can be (in round years) such that it makes sense for the thief to confess (that is, where she has a greater than 50% chance of confessing)? Use `factor(percentYearsFreedom)` where `percentYearsFreedom` is the percentage of the next 10 years the thief will not be in jail. (Assume that this incident has scared her straight and she will not commit any other crimes.)* - -```js -var thiefRats = function(){ - return (flip()? true: false) -} - -var lenient = 6 - -var thief = Infer({}, function(){ - var otherThiefRats = thiefRats(); - var IRat = thiefRats(); - var years = (otherThiefRats? - (IRat? lenient : 10) : - (IRat? lenient : 0)); - var percentYearsFreedom = (10-years)/10 - factor(percentYearsFreedom) - return(IRat) -}) - -viz(thief) -``` - -From trial-and-error, if the lenient sentence is 6 years, the thief should be indifferent. - -![](Figures/agents-as-programs-11.png) - -Alternatively, you can infer the correct answer as follows: - -```js -var sentences = RandomInteger({n:10}) - -var thiefRats = function(){ - return (flip()? 
true: false) -} - -var thief = Infer({}, function(){ - var LenientSentence = sample(sentences); - var iRat = thiefRats() - var uRat = thiefRats() - var percentYearsFreedom = 1 - (iRat ? LenientSentence/10 : (uRat ? LenientSentence/10 : 0)) - factor (1*(percentYearsFreedom > .5)) - return LenientSentence -}) - -viz(thief) -``` - -![](Figures/agents-as-programs-12.png) - -As you can see, we end up prefering lenient sentences no longer than 4 years. - -## Exercise 4: Exploring RSA - -For this exercise, modify the RSA model introduced in the main text as necessary. - -### a) - -*How does increasing the optimality of the speaker affect the pragmatic listener's inferences? Try a couple values and report the results.* - -For convenience, we turn `alpha` into a parameter: - -```js -// Here is the code from the Frank and Goodman RSA model - -// possible objects of reference -var meaningPrior = function() { - uniformDraw([ - {shape: "square", color: "blue"}, - {shape: "circle", color: "blue"}, - {shape: "square", color: "green"} - ]) -} - -// possible one-word utterances -var utterances = ["blue","green","square","circle"] - -// meaning function to interpret the utterances -var meaning = function(utterance, obj){ - (utterance === "blue" || utterance === "green") ? utterance === obj.color : - (utterance === "circle" || utterance === "square") ? 
utterance === obj.shape : - true -} - -// literal listener -var literalListener = function(utterance){ - Infer({model: function(){ - var obj = meaningPrior(); - condition(meaning(utterance, obj)) - return obj - }}) -} - -// pragmatic speaker -var speaker = function(obj,alpha){ - Infer({model: function(){ - var utterance = uniformDraw(utterances) - factor(alpha * literalListener(utterance).score(obj)) - return utterance - }}) -} - -// pragmatic listener -var pragmaticListener = function(utterance,alpha){ - Infer({model: function(){ - var obj = meaningPrior() - observe(speaker(obj,alpha),utterance) - return obj - }}) -} - - -print("pragmatic listener's interpretation of 'blue', given alpha = 0.01:") -viz.table(pragmaticListener("blue", 0.01)) - -print("pragmatic listener's interpretation of 'blue', given alpha = 1:") -viz.table(pragmaticListener("blue", 1)) - -print("pragmatic listener's interpretation of 'blue', given alpha = 4:") -viz.table(pragmaticListener("blue", 4)) - -print("pragmatic listener's interpretation of 'blue', given alpha = 10:") -viz.table(pragmaticListener("blue", 10)) -``` - -![](Figures/agents-as-programs-8.png) - -As `alpha` increases, the pragmatic listener is increasingly likely to interpret `blue` as referring to the blue square. - -### b) - -*How do the inferences of $$L_{2}$$ compare to those of $$L_{1}$$?* - -```js -// Here is the code from the Frank and Goodman RSA model - -// possible objects of reference -var meaningPrior = function() { - uniformDraw([ - {shape: "square", color: "blue"}, - {shape: "circle", color: "blue"}, - {shape: "square", color: "green"} - ]) -} - -// possible one-word utterances -var utterances = ["blue","green","square","circle"] - -// meaning function to interpret the utterances -var meaning = function(utterance, obj){ - (utterance === "blue" || utterance === "green") ? utterance === obj.color : - (utterance === "circle" || utterance === "square") ? 
utterance === obj.shape : - true -} - -var alpha = 1 - -// literal listener -var literalListener = function(utterance){ - Infer({model: function(){ - var obj = meaningPrior(); - condition(meaning(utterance, obj)) - return obj - }}) -} - -// pragmatic speaker -var speaker = function(obj){ - Infer({model: function(){ - var utterance = uniformDraw(utterances) - factor(alpha * literalListener(utterance).score(obj)) - return utterance - }}) -} - -// pragmatic listener -var pragmaticListener = function(utterance){ - Infer({model: function(){ - var obj = meaningPrior() - observe(speaker(obj),utterance) - return obj - }}) -} - -// pragmatic speaker2 -var speaker2 = function(obj){ - Infer({model: function(){ - var utterance = uniformDraw(utterances) - factor(alpha * pragmaticListener(utterance).score(obj)) - return utterance - }}) -} - -// pragmatic listener #2 -var listener3 = function(utterance){ - Infer({model: function(){ - var obj = meaningPrior() - observe(speaker2(obj),utterance) - return obj - }}) -} - -print("L1's interpretation of 'blue'") -viz.table(pragmaticListener("blue")) - -print("L2's interpretation of 'blue'") -viz.table(listener3("blue")) -``` - -![](Figures/agents-as-programs-9.png) - -There is little additional effect. - -### c) - -*Add a blue circle to the scenario. What happens to the interpretion of "blue"? Why?* - -It becomes 50/50 between 'blue circle' and 'blue square'. This is because 'blue' is now useful for distinguishing between the two circles as well. - -### d) - -*Is there any way to get “blue” to refer to something green? Why or why not?* - -In this model, the literal listener expects the speaker to tell the literal truth, albeit with some noise. So there is no way to prefer an interpretation that is literally false to one that is literally true. So we'd need to relax the assumption that the literal listener expects the speaker to always tell the truth. 
\ No newline at end of file diff --git a/solutions/05-observing-sequences.md b/solutions/05-observing-sequences.md deleted file mode 100644 index 1c3c377..0000000 --- a/solutions/05-observing-sequences.md +++ /dev/null @@ -1,346 +0,0 @@ ---- -layout: exercise -title: Observing sequences - exercises ---- - - -## Exercise 1: What word comes next? - -a) *In human languages, certain words are more likely to follow others. "The" is more likely to be followed by "dog" than "rhino", and even less likely to be followed by "sings". * - -*Let's consider a fragment of English consisting of only the words "dogs", "cats", "chase", and "sleep". This fragment does not contain punctuation or capital letters. Now, suppose that somebody says, "dogs chase cats". Determine how likely "chase" is to be followed by each word in the vocabulary.* - -```js -//Helper function to compare arrays -var comparray = function(arr1,arr2){ - return (JSON.stringify(arr1) === JSON.stringify(arr2)) -} - -var mm = Infer({method:'MCMC', burn:10000, samples: 50000, onlyMAP:false}, function() { - - let vocab = ['dogs', 'cats', 'chase', 'sleep', 'stop']; - - var wordToDistribution = mem(function(word) { - return dirichletDrift({alpha:ones([vocab.length,1]), concentration:10}) - }) - - var transition = function(word) { - return categorical({ps: wordToDistribution(word), vs: vocab}) - } - - - let obs = ['dogs', 'chase', 'cats']; - - let generateSentence = function(lastState, sentence) { - let word = transition(lastState); - if (word == 'stop') return []; - return [word].concat(generateSentence(word, sentence)); - } - - condition(comparray(obs, generateSentence('start'))) - - return transition('chase'); - -}) - -viz(mm) -``` - -![](Figures/sequences-of-observations-1.png) - -b) *Assume now that in addition to saying "dogs chase cats", your interlocutor said a second sentence. However, you only heard the first word, which again was "dogs". What is the distribution across likely second words in this sentence? 
NOTE: If you are not careful, you will end up assigning some probability to "undefined". Be careful.* - -```js -//Helper function to compare arrays -var comparray = function(arr1,arr2){ - return (JSON.stringify(arr1) === JSON.stringify(arr2)) -} - -var mm = Infer({method:'MCMC', burn:10000, samples: 50000, onlyMAP: false}, function() { - - let vocab = ['dogs', 'cats', 'chase', 'sleep', 'stop']; - - var wordToDistribution = mem(function(word) { - return dirichletDrift({alpha:ones([vocab.length,1]), concentration:10}) - }) - - var transition = function(word) { - return categorical({ps: wordToDistribution(word), vs: vocab}) - } - - let generateSentence = function(lastState, sentence) { - let word = transition(lastState); - if (word == 'stop') return ['stop']; //to avoid probabilities on 'undefined' - return [word].concat(generateSentence(word, sentence)); - } - - let obs = ['dogs', 'chase', 'cats', 'stop']; - condition(comparray(obs, generateSentence('start'))) - - let newSentence = generateSentence('start'); - condition(newSentence[0] == 'dogs'); - return newSentence[1]; -}) - -viz(mm) -``` - -![](Figures/sequences-of-observations-2.png) - -c) *Suppose again that somebody said "dogs chase cats". Now suppose they spoke another sentence, where again the second word was "chase". Show that the most likely first word was "dogs". 
* - -```js -var comparray = function(arr1,arr2){ - return (JSON.stringify(arr1) === JSON.stringify(arr2)) -} - -var mm = Infer({method:'MCMC', burn:10000, samples: 50000, onlyMAP: false}, function() { - - let vocab = ['dogs', 'cats', 'chase', 'sleep', 'stop']; - - var wordToDistribution = mem(function(word) { - return dirichletDrift({alpha:ones([vocab.length,1]), concentration:10}) - }) - - var transition = function(word) { - return categorical({ps: wordToDistribution(word), vs: vocab}) - } - - let generateSentence = function(lastState, sentence) { - let word = transition(lastState); - if (word == 'stop') return ['stop']; //to avoid probabilities on 'undefined' - return [word].concat(generateSentence(word, sentence)); - } - - let obs = ['dogs', 'chase', 'cats', 'stop']; - condition(comparray(obs, generateSentence('start'))) - - let newSentence = generateSentence('start'); - condition(newSentence[1] == 'chase'); - return newSentence[0]; -}) - -viz(mm) -``` - -![](Figures/sequences-of-observations-3.png) - -## Exercise 2: Hidden Markov Model - -a) *Return to the model from Exercise 1b. Suppose that the second sentence, instead of beginning with "dogs", began with "cats". 
Provide the marginal distribution on the second word of that sentence.* - -```js -//Helper function to compare arrays -var comparray = function(arr1,arr2){ - return (JSON.stringify(arr1) === JSON.stringify(arr2)) -} - -var mm = Infer({method:'MCMC', burn:10000, samples: 50000, onlyMAP: false}, function() { - - let vocab = ['dogs', 'cats', 'chase', 'sleep', 'stop']; - - var wordToDistribution = mem(function(word) { - return dirichletDrift({alpha:ones([vocab.length,1]), concentration:10}) - }) - - var transition = function(word) { - return categorical({ps: wordToDistribution(word), vs: vocab}) - } - - let generateSentence = function(lastState, sentence) { - let word = transition(lastState); - if (word == 'stop') return ['stop']; //to avoid probabilities on 'undefined' - return [word].concat(generateSentence(word, sentence)); - } - - let obs = ['dogs', 'chase', 'cats', 'stop']; - condition(comparray(obs, generateSentence('start'))) - - let newSentence = generateSentence('start'); - condition(newSentence[0] == 'cats'); - return newSentence[1]; -}) - -viz(mm) -``` - -![](Figures/sequences-of-observations-4.png) - -b) *In Exercise 2a, you should have found that an ungrammatical sequence like "cats cats" is as likely as a grammatical sequence like "cats sleep". Why is this?* - -The model hasn't observed anything other than 'stop' as following the word 'cats'. This implies that 'stop' is the most likely option, but also that the algorithm is totally indifferent towards all the other words -- since this is a language without grammar, all words are treated the same (they literally coexist as entries in a single list). - -c) *Let's try a hidden Markov model instead. Note that two of the words in our fragment of English are nouns ("dogs", "cats") and two are verbs ("chase", "sleep").* - -*Model sentence generation as involving Markov transitions between parts of speech, rather than between the words themselves. 
* - -```js -//Helper function to compare arrays -var comparray = function(arr1,arr2){ - return (JSON.stringify(arr1) === JSON.stringify(arr2)) -} - -var drawWord = function(pos){ - return (pos=="N") ? uniformDraw(['dogs','cats']) : - (pos=="V") ? uniformDraw(['chase','sleep']) : - 'stop' -} -var POS = ["N", "V", "stop"] - -var posToDistribution = mem(function(pos) { - return dirichletDrift({alpha:ones([POS.length,1]), concentration:10}) - }) - -var transition = function(pos) { - return categorical({ps: posToDistribution(pos), vs: POS}) - } - -let generateSentence = function(lastPOS) { - let nextPOS = transition(lastPOS); - let word = drawWord(nextPOS); - return (word == 'stop') ? [word] : [word].concat(generateSentence(nextPOS)); -} - -var sentence = generateSentence("start"); -print(sentence) -``` - -d) *Try Exercise 2a, but using our new hidden Markov model. Show that we are now more likely to get the grammatical phrases "cats chase" or "cats sleep" than "cats cats" or "cats dogs".* - -```js -//Helper function to compare arrays -var comparray = function(arr1,arr2){ - return (JSON.stringify(arr1) === JSON.stringify(arr2)) -} - -var drawWord = function(pos){ - return (pos=="N") ? uniformDraw(['dogs','cats']) : - (pos=="V") ? uniformDraw(['chase','sleep']) : - 'stop' -} -var POS = ["N", "V", "stop"] - -var hmm = Infer({method:'MCMC', burn:10000, samples: 1000, lag:10, onlyMAP: false}, function() { - var posToDistribution = mem(function(pos) { - return dirichletDrift({alpha:ones([POS.length,1]), concentration:10}) - }) - - var transition = function(pos) { - return categorical({ps: posToDistribution(pos), vs: POS}) - } - - let generateSentence = function(lastPOS) { - let nextPOS = transition(lastPOS); - let word = drawWord(nextPOS); - return (word == 'stop') ? 
[word] : [word].concat(generateSentence(nextPOS)); - } - let obs = ['dogs', 'chase', 'cats', 'stop']; - condition(comparray(obs, generateSentence('start'))) - - let newSentence = generateSentence('start'); - condition(newSentence[0] == 'cats'); - return newSentence[1]; -}) - -viz(hmm) -``` - -![](Figures/sequences-of-observations-5.png) - -## Exercise 3: Phrase structure grammars - -a) *Extend your hidden Markov model from Exercise 2 so that our fragment of English additionally includes the determiners "the" and "a" as well as the adverb "diligently". Make "dogs", "cats", "chase", and "sleep" singular ("dog", "cat", "chases", "sleeps"). Condition on "The dog chases a cat" being a sentence in the language and generate some additional sentences.* - -*Note that for the solution used here, it's convenient (but not necessary) to set* `onlyMAP: true`. - - -```js -//Helper function to compare arrays -var comparray = function(arr1,arr2){ - return (JSON.stringify(arr1) === JSON.stringify(arr2)) -} - -var drawWord = function(pos){ - return (pos=="N") ? uniformDraw(['dog','cat']) : - (pos=="V") ? uniformDraw(['chases','sleeps']) : - (pos=="D") ? uniformDraw(['the','a']) : - (pos=="A") ? 'dilligently' : - 'stop' -} -var POS = ["N", "V", "D", "A", "stop"] - -var hmm = Infer({method:'MCMC', burn:10000, samples: 1000, lag:10, onlyMAP: true}, function() { - var posToDistribution = mem(function(pos) { - return dirichletDrift({alpha:ones([POS.length,1]), concentration:10}) - }) - - var transition = function(pos) { - return categorical({ps: posToDistribution(pos), vs: POS}) - } - - let generateSentence = function(lastPOS) { - let nextPOS = transition(lastPOS); - let word = drawWord(nextPOS); - return (word == 'stop') ? 
[word] : [word].concat(generateSentence(nextPOS)); - } - let obs = ['the', 'dog', 'chases', 'a', 'cat', 'stop']; - - factor(comparray(obs, generateSentence('start'))*5) - - var sent1 = generateSentence('start'); - var sent2 = generateSentence('start'); - var sent3 = generateSentence('start'); - var sent4 = generateSentence('start'); - var sent5 = generateSentence('start'); - - return {sent1: sent1, sent2: sent2, sent3: sent3, sent4: sent4, sent5: sent5} -}) - -print(hmm) -``` - -NOTE: This may take several tries to get it to run. Using `factor` instead of `condition` will work much better. We return to this in [Algorithms for Inference](07-inference-process.md). - -b) *Let us consider a phrase structure grammar for our English fragment instead, modeled on the one in Chapter 5. Again, condition on "The dog chases a cat" being a sentence in the language and generate some additional sentences.* - -*Note that for the solution used here, it's convenient (but not necessary) to set* `onlyMAP: true`. 
- -```js -//Helper function to compare arrays -var comparray = function(arr1,arr2){ - return (JSON.stringify(arr1) === JSON.stringify(arr2)) -} - -var uniformDraw = function (xs) {return xs[randomInteger(xs.length)]}; - -var D = function() {return uniformDraw(['the', 'a'])}; -var N = function() {return uniformDraw(['cat', 'dog'])}; -var V = function() {return uniformDraw(['chases', 'sleeps'])} -var A = function() {return uniformDraw(['diligently'])} -var AP = function() {return uniformDraw([A()])} -var NP = function() {return [D(), N()]} -var VP = function() {return uniformDraw([[V(), AP()], - [V(), NP()]])} -var S = function() {return [NP(), VP()]} - -var psg = Infer({method:'MCMC', burn:10000, samples: 1000, onlyMAP: true}, function() { - let obs = [['the', 'dog'], ['chases', ['a', 'cat']]] - condition(comparray(obs, S())) - - - var sent1 = S(); - var sent2 = S(); - var sent3 = S(); - var sent4 = S(); - var sent5 = S(); - - return {sent1: sent1, sent2: sent2, sent3: sent3, sent4: sent4, sent5: sent5} -}) - -print(psg) -``` - -c) *Which model produced better English sentences, the hidden Markov model in Exercise 3a or the phrase structure grammar model in Exercise 3b? Why do you suppose that is?* - -The phrase structure grammar produces much more sensible sentences, because it has a lot of prior knowlege about sentence structure. For instance, it is not capable of producing sentences with two articles in a row. 
- diff --git a/solutions/05.1-sequential-decisions.md b/solutions/05.1-sequential-decisions.md deleted file mode 100644 index 9df0e00..0000000 --- a/solutions/05.1-sequential-decisions.md +++ /dev/null @@ -1,407 +0,0 @@ ---- -layout: exercise -title: "Sequential decisions" -description: "Markov Decision Processes and Partially-Observable Markof Decision Processes" ---- - -## Exercise 1 - -Consider our "line-world" example from the chapter: - -'''js -var ___ = ' '; -var D = { name: 'Donut' }; - -var grid = [ - ['___', '___', '___', '___', D] -]; - -var mdp = makeGridWorldMDP({ grid, start: [0, 0] }); - -var transition = function(state, action) { - return state + action; -}; - -var utility = function(state) { - if (state === 4) { - return 1; - } else { - return 0; - } -}; - -var makeAgent = function() { - var act = function(state, timeLeft) { - return Infer({ model() { - var action = uniformDraw([-1, 0, 1]); - var eu = expectedUtility(state, action, timeLeft); - factor(100 * eu); - return action; - }}); - }; - - var expectedUtility = function(state, action, timeLeft) { - var u = utility(state, action); - var newTimeLeft = timeLeft - 1; - if (newTimeLeft === 0) { - return u; - } else { - return u + expectation(Infer({ model() { - var nextState = transition(state, action); - var nextAction = sample(act(nextState, newTimeLeft)); - return expectedUtility(nextState, nextAction, newTimeLeft); - }})); - } - }; - - return { act }; -} - - -var act = makeAgent().act; - -var simulate = function(state, timeLeft){ - if (timeLeft === 0){ - return []; - } else { - var action = sample(act(state, timeLeft)); - var nextState = transition(state, action); - return [state].concat(simulate(nextState, timeLeft - 1)) - } -}; - -var startState = 0; -var totalTime = 5; -viz.gridworld(mdp.world, { trajectory : [mdp.startState] }); -print("Agent's trajectory: " + simulate(startState, totalTime)); -''' - -### a) -*Change the world such that it is a loop, i.e. 
moving right from state `4` moves to state `0`, and moving left from state `0` moves to state `4`. How does this change the agent's sequence of actions?* - -Edit `transition()` to: - -```js -var transition = function(state, action) { - var nextstate = state + action - return (nextstate < 0) ? 4 : - (nextstate > 4) ? 0 : - nextstate; -}; -``` - -Agent now moves left to arrive at Donut shopt in a single move. - -![](Figures/sequential-decisions-1.PNG) - - -### b) -*Change the agent's action space such that the agent can also move two steps at a time. How does this change the agent's sequence of actions?* - -Edit `act()` as follows: - -```js - var act = function(state, timeLeft) { - return Infer({ model() { - var action = uniformDraw([-2, -1, 0, 1, 2]); - var eu = expectedUtility(state, action, timeLeft); - factor(100 * eu); - return action; - }}); - }; -``` - -Agent now only requires two moves to reach donut shop. - -![](Figures/sequential-decisions-2.PNG) - -### c) -*Change the agent's utility function such that the agent moves as far as possible to the right, given its available total time.* - -Edit `utility()` as follows: - -```js -var utility = function(state) { - return state; -}; -``` - -Agent now moves right on every time step. This is easiest to see if we increase the total amount of time (e.g., `var totalTime = 7`): - -![](Figures/sequential-decisions-3.PNG) - -## Exercise 2 - -*Consider this "line-world" involving a cookie shop and a donut shop. Bob starts out in between the donut shop and the cookie shop. Assume you observe Bob go to the donut shop in 3 time steps. Edit the code above to write a model to *infer* Bob's utility function for cookies and donuts. 
Use any reasonable prior.* - -~~~~ -// Anything that doesn't involve random choices can be put outside of the model - -var ___ = ' '; -var D = { name: 'Donut' }; -var C = { name: 'Cookie' }; - - var grid = [ - [C, '___', '___', '___', '___', '___', D] - ]; - -var mdp = makeGridWorldMDP({ grid, start: [3, 0] }); - -var transition = function(state, action) { - return state + action; - }; - -var model = function() { - - let utilities = [sample(Uniform({a: 0, b: 10})), sample(Uniform({a: 0, b: 10}))] - var utility = function(state) { - return (state == 0) ? utilities[0] : - (state == 6) ? utilities[1] : - 0; - }; - - var makeAgent = function() { - var act = function(state, timeLeft) { - return Infer({ model() { - var action = uniformDraw([-1, 0, 1]); - var eu = expectedUtility(state, action, timeLeft); - factor(100 * eu); - return action; - }}); - }; - - var expectedUtility = function(state, action, timeLeft) { - var u = utility(state, action); - var newTimeLeft = timeLeft - 1; - if (newTimeLeft === 0) { - return u; - } else { - return u + expectation(Infer({ model() { - var nextState = transition(state, action); - var nextAction = sample(act(nextState, newTimeLeft)); - return expectedUtility(nextState, nextAction, newTimeLeft); - }})); - } - }; - - return { act }; - } - - var act = makeAgent().act; - - var simulate = function(state, timeLeft){ - if (timeLeft === 0){ - return []; - } else { - var action = sample(act(state, timeLeft)); - var nextState = transition(state, action); - return [state].concat(simulate(nextState, timeLeft - 1)) - } - }; - - var startState = 3; - var totalTime = 4; - let path = simulate(startState, totalTime); - condition(path[3] == 6); - return { - Cookie: utilities[0], - Donut: utilities[1] - } - } - -var post = Infer({method: 'MCMC', samples: 10000}, model) -viz(post); -~~~~ - -![](Figures/sequential-decisions-4.PNG) - -Rejection sampling also works pretty well. 
This is with only 1,000 samples: - -![](Figures/sequential-decisions-5.PNG) - -Either way, we infer that the utility for Donut is likely to be at least slightly higher than that of Cookie. - - -## Exercise 3 - -*Use the codebox below to explore different levels of softmax noise. Find a setting of `utilityTable` and `alpha` such that the agent goes to West and East equally often and nearly always takes the most direct route to both East and West. Included below is code for simulating many trajectories and returning the trajectory length. You may find it helpful to extend this code to measure whether the route taken by the agent is direct or not.* - -The following code is useful for iteratively adjusting the parameters until the desired result is found. - -```js -///fold: -var makeHikeMDP = function(options) { - var H = { name: 'Hill' }; - var W = { name: 'West' }; - var E = { name: 'East' }; - var ___ = ' '; - var grid = [ - [___, ___, ___, ___, ___], - [___, '#', ___, ___, ___], - [___, '#', W , '#', E ], - [___, ___, ___, ___, ___], - [ H , H , H , H , H ] - ]; - return makeGridWorldMDP(_.assign({ grid }, options)); -}; - -var mdp = makeHikeMDP({ - start: [0, 1], - totalTime: 13, - transitionNoiseProbability: 0.1 -}); - -var world = mdp.world; -var startState = mdp.startState; -var makeUtilityFunction = mdp.makeUtilityFunction; -viz.gridworld(world) -/// - - -var utilityTable = { - East: 10, - West: 5.91, - Hill: -10, - timeCost: -1 -} - -var alpha = 5; // <- SOFTMAX NOISE - -// Create parameterized agent -var utility = makeUtilityFunction(utilityTable); -var agent = makeMDPAgent({ utility, alpha }, world); - -var trajectories = Infer({model() { - var trajectory = simulateMDP(startState, world, agent); - var locs = map(function(v){return(v.loc)}, trajectory) - return {locs} - }, - method: 'forward', - samples: 100000 -}); -viz.table(trajectories) -``` - -Note that the parameters given provide a nice result: - - - - -So we can definitely pick some values by trial 
and error. But that's boring. Let's infer it instead. The utility of West has to be less than the utility of East, or we'd never go to east. So let's fix the utility of East at 10 and find a value for West that is smaller. We'll also pick an alpha. Let's constrain it to between 0.1 and 6.0, just so we don't have too large of a space to search. - -Now, we'll factor an equal number of times on having gone straight to West and having gone straight to East. - -```js -var makeHikeMDP = function(options) { - var H = { name: 'Hill' }; - var W = { name: 'West' }; - var E = { name: 'East' }; - var ___ = ' '; - var grid = [ - [___, ___, ___, ___, ___], - [___, '#', ___, ___, ___], - [___, '#', W , '#', E ], - [___, ___, ___, ___, ___], - [ H , H , H , H , H ] - ]; - return makeGridWorldMDP(_.assign({ grid }, options)); -}; - -var mdp = makeHikeMDP({ - start: [0, 1], - totalTime: 13, - transitionNoiseProbability: 0.1 -}); - -var world = mdp.world; -var startState = mdp.startState; -var makeUtilityFunction = mdp.makeUtilityFunction; - -viz.gridworld(world) -var vals = Infer({ - model() { - var West = uniform({a: 1, b: 10}) - var utilityTable = { - East: 10, - West: West, - Hill: -10, - timeCost: -.1 - } - - // Create parameterized agent - var utility = makeUtilityFunction(utilityTable); - var alpha = uniform(0.1, 5); // <- SOFTMAX NOISE - var agent = makeMDPAgent({ utility, alpha }, world); - repeat(10, function(){ - var trajectory = simulateMDP(startState, world, agent); - var locs = map(function(v){return(v.loc)}, trajectory) - factor(1*(locs == [[0,1],[1,1],[2,1],[2,2]])) - var trajectory = simulateMDP(startState, world, agent); - var locs = map(function(v){return(v.loc)}, trajectory) - factor(1*(locs == [[0,1],[1,1],[2,1],[3,1],[4,1],[4,2]])) - }) - return {West: West, alpha: alpha} - }, - method: 'MCMC', - samples: 5000 -}); -repeat(10,function(){print(sample(vals))}) -``` - -![](Figures/sequential-decisions-6.PNG) - -We can see that a value of West near 9.0 and alpha 
near 0.4 tends to work. Let's confirm this through forward simulation - -```js -///fold: -var makeHikeMDP = function(options) { - var H = { name: 'Hill' }; - var W = { name: 'West' }; - var E = { name: 'East' }; - var ___ = ' '; - var grid = [ - [___, ___, ___, ___, ___], - [___, '#', ___, ___, ___], - [___, '#', W , '#', E ], - [___, ___, ___, ___, ___], - [ H , H , H , H , H ] - ]; - return makeGridWorldMDP(_.assign({ grid }, options)); -}; - -var mdp = makeHikeMDP({ - start: [0, 1], - totalTime: 13, - transitionNoiseProbability: 0.1 -}); - -var world = mdp.world; -var startState = mdp.startState; -var makeUtilityFunction = mdp.makeUtilityFunction; -viz.gridworld(world) -/// - - -var utilityTable = { - East: 10, - West: 3, - Hill: -10, - timeCost: -.1 -} - -var alpha = 0.4; // <- SOFTMAX NOISE - -// Create parameterized agent -var utility = makeUtilityFunction(utilityTable); -var agent = makeMDPAgent({ utility, alpha }, world); - -var trajectories = Infer({model() { - var trajectory = simulateMDP(startState, world, agent); - var locs = map(function(v){return(v.loc)}, trajectory) - return {locs} - }, - method: 'forward', - samples: 10000 -}); -viz.table(trajectories) -``` \ No newline at end of file diff --git a/solutions/Figures/agents-as-programs-1.png b/solutions/Figures/agents-as-programs-1.png deleted file mode 100644 index 18a1fb0..0000000 Binary files a/solutions/Figures/agents-as-programs-1.png and /dev/null differ diff --git a/solutions/Figures/agents-as-programs-10.png b/solutions/Figures/agents-as-programs-10.png deleted file mode 100644 index a96db4c..0000000 Binary files a/solutions/Figures/agents-as-programs-10.png and /dev/null differ diff --git a/solutions/Figures/agents-as-programs-11.png b/solutions/Figures/agents-as-programs-11.png deleted file mode 100644 index 8cfe0ac..0000000 Binary files a/solutions/Figures/agents-as-programs-11.png and /dev/null differ diff --git a/solutions/Figures/agents-as-programs-12.png 
b/solutions/Figures/agents-as-programs-12.png deleted file mode 100644 index 24b514f..0000000 Binary files a/solutions/Figures/agents-as-programs-12.png and /dev/null differ diff --git a/solutions/Figures/agents-as-programs-2.png b/solutions/Figures/agents-as-programs-2.png deleted file mode 100644 index ecc171f..0000000 Binary files a/solutions/Figures/agents-as-programs-2.png and /dev/null differ diff --git a/solutions/Figures/agents-as-programs-3.png b/solutions/Figures/agents-as-programs-3.png deleted file mode 100644 index 142817f..0000000 Binary files a/solutions/Figures/agents-as-programs-3.png and /dev/null differ diff --git a/solutions/Figures/agents-as-programs-4.png b/solutions/Figures/agents-as-programs-4.png deleted file mode 100644 index d3e1eef..0000000 Binary files a/solutions/Figures/agents-as-programs-4.png and /dev/null differ diff --git a/solutions/Figures/agents-as-programs-5-1.png b/solutions/Figures/agents-as-programs-5-1.png deleted file mode 100644 index dcb3c6e..0000000 Binary files a/solutions/Figures/agents-as-programs-5-1.png and /dev/null differ diff --git a/solutions/Figures/agents-as-programs-5-2.png b/solutions/Figures/agents-as-programs-5-2.png deleted file mode 100644 index ce72cf5..0000000 Binary files a/solutions/Figures/agents-as-programs-5-2.png and /dev/null differ diff --git a/solutions/Figures/agents-as-programs-5-3.png b/solutions/Figures/agents-as-programs-5-3.png deleted file mode 100644 index 0442c00..0000000 Binary files a/solutions/Figures/agents-as-programs-5-3.png and /dev/null differ diff --git a/solutions/Figures/agents-as-programs-5-4.png b/solutions/Figures/agents-as-programs-5-4.png deleted file mode 100644 index ecd4c74..0000000 Binary files a/solutions/Figures/agents-as-programs-5-4.png and /dev/null differ diff --git a/solutions/Figures/agents-as-programs-5-5.png b/solutions/Figures/agents-as-programs-5-5.png deleted file mode 100644 index 4514524..0000000 Binary files 
a/solutions/Figures/agents-as-programs-5-5.png and /dev/null differ diff --git a/solutions/Figures/agents-as-programs-6.png b/solutions/Figures/agents-as-programs-6.png deleted file mode 100644 index da21d07..0000000 Binary files a/solutions/Figures/agents-as-programs-6.png and /dev/null differ diff --git a/solutions/Figures/agents-as-programs-7-1.png b/solutions/Figures/agents-as-programs-7-1.png deleted file mode 100644 index 63a506a..0000000 Binary files a/solutions/Figures/agents-as-programs-7-1.png and /dev/null differ diff --git a/solutions/Figures/agents-as-programs-7-2.png b/solutions/Figures/agents-as-programs-7-2.png deleted file mode 100644 index 03a99f1..0000000 Binary files a/solutions/Figures/agents-as-programs-7-2.png and /dev/null differ diff --git a/solutions/Figures/agents-as-programs-7-3.png b/solutions/Figures/agents-as-programs-7-3.png deleted file mode 100644 index fc68854..0000000 Binary files a/solutions/Figures/agents-as-programs-7-3.png and /dev/null differ diff --git a/solutions/Figures/agents-as-programs-7-4.png b/solutions/Figures/agents-as-programs-7-4.png deleted file mode 100644 index 3b623d6..0000000 Binary files a/solutions/Figures/agents-as-programs-7-4.png and /dev/null differ diff --git a/solutions/Figures/agents-as-programs-8.png b/solutions/Figures/agents-as-programs-8.png deleted file mode 100644 index 433996a..0000000 Binary files a/solutions/Figures/agents-as-programs-8.png and /dev/null differ diff --git a/solutions/Figures/agents-as-programs-9.png b/solutions/Figures/agents-as-programs-9.png deleted file mode 100644 index 27e101a..0000000 Binary files a/solutions/Figures/agents-as-programs-9.png and /dev/null differ diff --git a/solutions/Figures/inference-about-inference-1a.PNG b/solutions/Figures/inference-about-inference-1a.PNG deleted file mode 100755 index 9c01b0e..0000000 Binary files a/solutions/Figures/inference-about-inference-1a.PNG and /dev/null differ diff --git 
a/solutions/Figures/inference-about-inference-1b.PNG b/solutions/Figures/inference-about-inference-1b.PNG deleted file mode 100755 index a423242..0000000 Binary files a/solutions/Figures/inference-about-inference-1b.PNG and /dev/null differ diff --git a/solutions/Figures/inference-about-inference-1c.PNG b/solutions/Figures/inference-about-inference-1c.PNG deleted file mode 100755 index 59a1865..0000000 Binary files a/solutions/Figures/inference-about-inference-1c.PNG and /dev/null differ diff --git a/solutions/Figures/inference-about-inference-1d.PNG b/solutions/Figures/inference-about-inference-1d.PNG deleted file mode 100755 index a490468..0000000 Binary files a/solutions/Figures/inference-about-inference-1d.PNG and /dev/null differ diff --git a/solutions/Figures/inference-about-inference-PartA_1.PNG b/solutions/Figures/inference-about-inference-PartA_1.PNG deleted file mode 100755 index 27e7c5e..0000000 Binary files a/solutions/Figures/inference-about-inference-PartA_1.PNG and /dev/null differ diff --git a/solutions/Figures/inference-about-inference-PartB.PNG b/solutions/Figures/inference-about-inference-PartB.PNG deleted file mode 100755 index 3bf5957..0000000 Binary files a/solutions/Figures/inference-about-inference-PartB.PNG and /dev/null differ diff --git a/solutions/Figures/inference-about-inference-PartC.PNG b/solutions/Figures/inference-about-inference-PartC.PNG deleted file mode 100755 index 21ad17e..0000000 Binary files a/solutions/Figures/inference-about-inference-PartC.PNG and /dev/null differ diff --git a/solutions/Figures/inference-about-inference-PartD.PNG b/solutions/Figures/inference-about-inference-PartD.PNG deleted file mode 100755 index c721b3d..0000000 Binary files a/solutions/Figures/inference-about-inference-PartD.PNG and /dev/null differ diff --git a/solutions/Figures/inference-about-inference-PartE.PNG b/solutions/Figures/inference-about-inference-PartE.PNG deleted file mode 100644 index 7c71027..0000000 Binary files 
a/solutions/Figures/inference-about-inference-PartE.PNG and /dev/null differ diff --git a/solutions/Figures/inference-process-1.png b/solutions/Figures/inference-process-1.png deleted file mode 100644 index 8cbc379..0000000 Binary files a/solutions/Figures/inference-process-1.png and /dev/null differ diff --git a/solutions/Figures/inference-process-2.png b/solutions/Figures/inference-process-2.png deleted file mode 100644 index 9f60b71..0000000 Binary files a/solutions/Figures/inference-process-2.png and /dev/null differ diff --git a/solutions/Figures/inference-process-3.png b/solutions/Figures/inference-process-3.png deleted file mode 100644 index 1c79b33..0000000 Binary files a/solutions/Figures/inference-process-3.png and /dev/null differ diff --git a/solutions/Figures/inference-process-4.png b/solutions/Figures/inference-process-4.png deleted file mode 100644 index 1a11680..0000000 Binary files a/solutions/Figures/inference-process-4.png and /dev/null differ diff --git a/solutions/Figures/inference-process-5.png b/solutions/Figures/inference-process-5.png deleted file mode 100644 index c1405ee..0000000 Binary files a/solutions/Figures/inference-process-5.png and /dev/null differ diff --git a/solutions/Figures/inference-process-6.png b/solutions/Figures/inference-process-6.png deleted file mode 100644 index 8e484f9..0000000 Binary files a/solutions/Figures/inference-process-6.png and /dev/null differ diff --git a/solutions/Figures/inference-process-7.png b/solutions/Figures/inference-process-7.png deleted file mode 100644 index 1e406be..0000000 Binary files a/solutions/Figures/inference-process-7.png and /dev/null differ diff --git a/solutions/Figures/inference-process-8.png b/solutions/Figures/inference-process-8.png deleted file mode 100644 index 316ce1b..0000000 Binary files a/solutions/Figures/inference-process-8.png and /dev/null differ diff --git a/solutions/Figures/learning-as-inference-1.png b/solutions/Figures/learning-as-inference-1.png deleted file 
mode 100644 index 3ddafc4..0000000 Binary files a/solutions/Figures/learning-as-inference-1.png and /dev/null differ diff --git a/solutions/Figures/learning-as-inference-2.png b/solutions/Figures/learning-as-inference-2.png deleted file mode 100644 index a24218f..0000000 Binary files a/solutions/Figures/learning-as-inference-2.png and /dev/null differ diff --git a/solutions/Figures/learning-as-inference-3.png b/solutions/Figures/learning-as-inference-3.png deleted file mode 100644 index 4b8adbc..0000000 Binary files a/solutions/Figures/learning-as-inference-3.png and /dev/null differ diff --git a/solutions/Figures/learning-as-inference-4.png b/solutions/Figures/learning-as-inference-4.png deleted file mode 100644 index 319ea3e..0000000 Binary files a/solutions/Figures/learning-as-inference-4.png and /dev/null differ diff --git a/solutions/Figures/learning-as-inference-5.png b/solutions/Figures/learning-as-inference-5.png deleted file mode 100644 index b0d4cb1..0000000 Binary files a/solutions/Figures/learning-as-inference-5.png and /dev/null differ diff --git a/solutions/Figures/learning-as-inference-6.png b/solutions/Figures/learning-as-inference-6.png deleted file mode 100644 index 43589d8..0000000 Binary files a/solutions/Figures/learning-as-inference-6.png and /dev/null differ diff --git a/solutions/Figures/sequences-of-observations-1.png b/solutions/Figures/sequences-of-observations-1.png deleted file mode 100644 index a3ce530..0000000 Binary files a/solutions/Figures/sequences-of-observations-1.png and /dev/null differ diff --git a/solutions/Figures/sequences-of-observations-2.png b/solutions/Figures/sequences-of-observations-2.png deleted file mode 100644 index dc56140..0000000 Binary files a/solutions/Figures/sequences-of-observations-2.png and /dev/null differ diff --git a/solutions/Figures/sequences-of-observations-3.png b/solutions/Figures/sequences-of-observations-3.png deleted file mode 100644 index 97628a0..0000000 Binary files 
a/solutions/Figures/sequences-of-observations-3.png and /dev/null differ diff --git a/solutions/Figures/sequences-of-observations-4.png b/solutions/Figures/sequences-of-observations-4.png deleted file mode 100644 index f5c83c6..0000000 Binary files a/solutions/Figures/sequences-of-observations-4.png and /dev/null differ diff --git a/solutions/Figures/sequences-of-observations-5.png b/solutions/Figures/sequences-of-observations-5.png deleted file mode 100644 index 3775434..0000000 Binary files a/solutions/Figures/sequences-of-observations-5.png and /dev/null differ diff --git a/solutions/Figures/sequential-decisions-1.png b/solutions/Figures/sequential-decisions-1.png deleted file mode 100644 index 4797aad..0000000 Binary files a/solutions/Figures/sequential-decisions-1.png and /dev/null differ diff --git a/solutions/Figures/sequential-decisions-2.png b/solutions/Figures/sequential-decisions-2.png deleted file mode 100644 index 494e1be..0000000 Binary files a/solutions/Figures/sequential-decisions-2.png and /dev/null differ diff --git a/solutions/Figures/sequential-decisions-3.png b/solutions/Figures/sequential-decisions-3.png deleted file mode 100644 index 1e6881c..0000000 Binary files a/solutions/Figures/sequential-decisions-3.png and /dev/null differ diff --git a/solutions/Figures/sequential-decisions-4.png b/solutions/Figures/sequential-decisions-4.png deleted file mode 100644 index b5ff654..0000000 Binary files a/solutions/Figures/sequential-decisions-4.png and /dev/null differ diff --git a/solutions/Figures/sequential-decisions-5.png b/solutions/Figures/sequential-decisions-5.png deleted file mode 100644 index 372185c..0000000 Binary files a/solutions/Figures/sequential-decisions-5.png and /dev/null differ diff --git a/solutions/Figures/sequential-decisions-6.png b/solutions/Figures/sequential-decisions-6.png deleted file mode 100644 index de1fbfe..0000000 Binary files a/solutions/Figures/sequential-decisions-6.png and /dev/null differ diff --git 
a/solutions/Figures/sequential-decisions-7.png b/solutions/Figures/sequential-decisions-7.png deleted file mode 100644 index 55b8a81..0000000 Binary files a/solutions/Figures/sequential-decisions-7.png and /dev/null differ diff --git a/solutions/bayesian-data-analysis.md b/solutions/bayesian-data-analysis.md deleted file mode 100644 index f4d7f42..0000000 --- a/solutions/bayesian-data-analysis.md +++ /dev/null @@ -1,546 +0,0 @@ ---- -layout: exercise -title: Bayesian Data Analysis - solutions -custom_js: -- assets/js/towData.js -- assets/js/towConfigurations.js ---- - -## Exercise 1: Experimenting with priors and predictives - -In [our simple binomial model]({{site.baseurl}}/chapters/bayesian-data-analysis.html#a-simple-illustration), -we compared the parameter priors and posteriors to the corresponding **predictives** -which tell us what data we should expect given our prior and posterior beliefs. -For convenience, we've reproduced that model here. - -### Exercise 1.1 - -> Notice that we used a uniform distribution over the interval [0, 1] as our prior, reflecting our assumption that a probability must lie between 0 and 1 but otherwise remaining agnostic to which values are most likely to be the case. -While this is convenient, we may want to represent other assumptions. -> -> The [Beta distribution](https://en.wikipedia.org/wiki/Beta_distribution), expressed in WebPPL as `Beta({a:..., b:...})`' is a more general way of expressing beliefs over the interval [0,1]. -> The beta distribution is what's called the conjugate prior probability distribution for the binomial distribution due -> to its relationship between the prior and the posterior, and it also has a really neat interpretation that we will -> explore in this problem. -> -> You may want to visualize the beta distribution a few times with different parameters to get a sense of its shape. -> 1. Beta(1, 1) -> 2. Beta(3, 3) -> 3. Beta(50, 50) -> 4. Beta(1, 10) -> 5. Beta(10, 1) -> 6. 
Beta(.2, .2) - -~~~~ -viz(repeat(10000, function() { sample(Beta({a:1, b: 1})) })); -~~~~ - -> Here, we have the binomial distribution example from the chapter. - -~~~~ -// observed data -var k = 1; // number of successes -var n = 20; // number of attempts -var priorDist = Uniform({a: 0, b: 1}); - -var model = function() { - var p = sample(priorDist); - - // Observed k number of successes, assuming a binomial - observe(Binomial({p : p, n: n}), k); - - // sample from binomial with updated p - var posteriorPredictive = binomial(p, n); - - // sample fresh p (for visualization) - var prior_p = sample(priorDist); - // sample from binomial with fresh p (for visualization) - var priorPredictive = binomial(prior_p, n); - - return { - prior: prior_p, priorPredictive : priorPredictive, - posterior : p, posteriorPredictive : posteriorPredictive - }; -} - -var opts = {method: "MCMC", samples: 2500, lag: 50}; -var posterior = Infer(opts, model); - -viz.marginals(posterior); -~~~~ - -> Using the code above, answer the following questions. -> 1. Run the code as is. How does the posterior compare to beta(2, 20)? - -They look similar. - -> 2. Set the prior to beta(1, 1). What do you notice about the posterior distribution? - -The posterior looks similar to before. - -> 3. Set n = 10 and the prior to beta(1, 11). What do you notice about the posterior distribution? - -The posterior looks similar to before. - -> 4. Set k = 5, n = 15, and the prior to beta(1, 1). Compare the posterior to beta(6, 11). - -The posterior looks similar to beta(6, 11). - -> 5. Set k = 4, n = 10, and the prior to beta(1, 1). - What values of `a` and `b` would of beta(a, b) would the posterior look like? - -beta(5, 7) - -> 6. Set k = 10 and n = 20. - What values of `a` and `b` would a prior of beta(a, b) make the posterior look like beta(12, 10)? - -beta(2, 0). Since 0 isn't a valid parameter for beta, we could do something like beta(2, 0.01). - -> 7. 
Based on these observations (and any others you may have tried), - what is the relationship between the beta distribution and the binomial distribution? - -`a` can intuitively be thought of as the number of successes/trues/heads/etc. we've seen before, -and `b` as the number of failures/falses/tails, etc. we've seen before. -Note that if `a` and `b` are less than `1`, we have strong intuitions against values towards the center. - -### Exercise 1.2 - -> Predictive distributions are not restricted to exactly the same experiment as the observed data, -and can be used in the context of any experiment where the inferred model parameters make predictions. -> In the current simple binomial setting, for example, predictive distributions could be found by an experiment -that is different because it has `n' != n` observations. -> Change the model to implement an example of this. - -~~~~ -// observed data -var k = 1 // number of successes -var n = 20 // number of attempts -var new_n = 5 // number of attempts in the followup experiment -var priorDist = Beta({a: 1, b: 1}); - -var model = function() { - var p = sample(priorDist); - - // Observed k number of successes, assuming a binomial - observe(Binomial({p : p, n: n}), k); - - // sample from binomial with updated p - var posteriorPredictive = binomial(p, new_n); - - // sample fresh p (for visualization) - var prior_p = sample(priorDist); - // sample from binomial with fresh p (for visualization) - var priorPredictive = binomial(prior_p, n); - - return { - prior: prior_p, priorPredictive : priorPredictive, - posterior : p, posteriorPredictive : posteriorPredictive - }; -} - -var opts = {method: "MCMC", samples: 2500, lag: 50}; -var posterior = Infer(opts, model); - -viz.marginals(posterior) -~~~~ - -## Exercise 2: Parameter fitting vs. parameter integration - -> One of the strongest motivations for using Bayesian techniques for model-data evaluation is in how "nuisance" parameters are treated. 
-"Nuisance" parameters are parameters of no theoretical interest; their only purpose is to fill in a necessary slot in the model. -Classically, the most prominant technique (from the frequentist tradition) for dealing with these parameters is to -fit them to the data, i.e., to set their value equal to whatever value maximizes the model-data fit -(or, equivalently, minimizes some cost function). - -> The Bayesian approach is different. -Since we have *a priori* uncertainty about the value of our parameter, we will also have *a posteriori* -uncertainty about the value (though hopefully the uncertainty will be reduced). -What the Bayesian does is *integrate over* her posterior distribution of parameter values to make predictions. -Intuitively, rather than taking the value corresponding to the peak of the distribution (i.e., the maximum), -she's considering all values with their respective probabilites. - -> Why might this be important for model assessment? -Imagine the following situation. -You are piloting a task and want to use Bayesian Data Analysis because you hear it is useful when you have few data points. -You think that the task you've designed is a little too difficult for subjects. -(Let's imagine that you're a psychophysicist, and your task pertains to contrast discriminiation in the peripheral visual field.) -You think the current task design is too difficult, but you're not sure. -It may well be that it's fine for subjects. - -> Here is your prior. - -~~~~ -// Prior on task difficulty is uniform on [0, ..., 0.9], with a spike on 0.9 -// i.e., you think it's likely that the task is too difficult -var sampleTaskDifficulty = function() { - return flip() ? .9 : randomInteger(10) / 10; -} - -var model = function() { - return sampleTaskDifficulty(); -} - -viz.hist(Infer({method: 'enumerate'}, model), {numBins: 9}); -~~~~ - -> You have a model of how subjects perform on your task. -You could have a structured, probabilistic model here. 
-For simplicity, let's assume you have the simplest model of task performance. -It is a direct function of task-difficulty: subjects perform well if the task isn't too difficult. - -~~~~norun -var subjectPerformWell = !flip(taskDifficulty); -~~~~ - -> There's a lot of training involved in your task and that it's very time consuming for you to collect data. -You run one subject through your training regime and have them do the task. -The subject performs well! -Soon after, your adviser drops by and wants you to make a decision to collect more data or tweak your experimental paradigm. -You thought beforehand that your task was too difficult. - -> Since you wrote down your prior beliefs, we can examine how much the data update those beliefs about the `taskDifficulty` parameter. -How does your degree of belief in task difficult change as a result of your one pilot subject performing well? - -~~~~ -// Prior on task difficulty is uniform on [0, ..., 0.9], with a spike on 0.9 -var sampleTaskDifficulty = function() { - return flip() ? .9 : randomInteger(10) / 10; -}; - -// Compute posterior after seeing one subject perform well on the task -var taskDifficultyPosterior = Infer({method: 'enumerate'}, function(){ - var taskDifficulty = sampleTaskDifficulty(); - - // subject will perform well if the task is not too difficult - var subjectPerformsWell = !flip(taskDifficulty); - - // observe that they perform well (i.e. this value is true) - condition(subjectPerformsWell); - return taskDifficulty; -}); - -// Most likely task-difficulty is still .9 -print("MAP: " + taskDifficultyPosterior.MAP().val); - -// But a lot of probability mass is on lower values -viz.hist(taskDifficultyPosterior, {numBins: 9}); - -// Indeed, the expected subject ability is around .4 -print("Expectation: " + expectation(taskDifficultyPosterior)); -~~~~ - -### Exercise 2.1 - -> Would you proceed with more data collection or would you change your experimental paradigm? 
-In other words, do you still think your task is too hard? - -The posterior distribution shows that the task may not be as difficult as originally thought. -If this participant did well, other participants may also do well, so the paradigm may not need to be changed. - - -### Exercise 2.2 - -> In Exercise 2.1, you probably used either one value of the task-difficulty or the full distribution of values to decide about whether to continue data collection or tweak the paradigm. -We find ourselves in a similar situation when we have models of psychological phenomena and want to decide whether the model fits the data (or, equivalently, whether our psychological theory is capturing the phenomenon). -The traditional approach is the value (or "point-wise estimate") approach: take the value that corresponds to the best fit -(e.g., by using least-squares or maximum-likelihood estimation; here, -you would have taken the Maximum A Posteriori (or, MAP) estimate, which would be 0.9). -Why might this not be a good idea? Comment on the reliability of the MAP estimate and how MAP estimate compares to other values of the posterior distribution. - -The MAP is only 0.9 because of our strong prior beliefs. -The second most likely posterior value is the complete opposite (p = 0). - - -## Exercise 3 - -> Let's continue to explore the inferences you (as a scientist) can draw from the posterior over parameter values. -This posterior can give you an idea of whether your model is well-behaved. -In other words, do the predictions of your model depend heavily on the exact parameter value? - -> To help us understand how to examine posteriors over parameter settings, we're going to revisit the example of the blicket detector from the chapter on `Conditional Dependence`. - -> Here is the model, with slightly different names than the original example, and written in a parameter-friendly way. -It is set up to display the "backwards blocking" phenomenon. 
- - -~~~~ -var blicketBaseRate = 0.4; -var blicketPower = 0.9; -var nonBlicketPower = 0.05; -var machineSpontaneouslyGoesOff = 0.05; - -var blicketPosterior = function(evidence) { - return Infer({method: 'enumerate'}, function() { - var blicket = mem(function(block) { flip(blicketBaseRate) }); - var power = function(block) { blicket(block) ? blicketPower : nonBlicketPower }; - var machine = function(blocks) { - return (blocks.length == 0 ? - flip(machineSpontaneouslyGoesOff) : - flip(power(first(blocks))) || machine(rest(blocks))); - }; - // Condition on each of the pieces of evidence making the machine go off - map(function(blocks) { condition(machine(blocks)) }, evidence); - return blicket('A'); - }); -}; - -// A & B make the blicket-detector go off -viz(blicketPosterior([['A', 'B']])); - -// A & B make the blicket-detector go off, and then B makes the blicket detector go off -viz(blicketPosterior([['A', 'B'], ['B']])); -~~~~ - -### Exercise 3.1 - -> What are the parameters of the above model? -> Explain what they represent in plain English. - -`blicketBaseRate` | 0.4 | The probability that a block is a blicket. -`blicketPower` | 0.9 | The probability that a blicket will set off the detector and power the machine. -`nonBlicketPower` | 0.05 | The probability that a non-blicket will set off the detector and power the machine. -`machineSpontaneouslyGoesOff` | 0.05 | The probability that the machine goes off on its own. - - -### Exercise 3.2 - -> Let's analyze this model with respect to some data. -> First, we'll put priors on these parameters, and then we'll do inference, -> conditioning on some data we might have collected in an experiment on 4 year olds, a la Sobel, Tenenbaum, & Gopnik (2004). -> [The data used in this exercise is schematic data]. - -> Before running the program below, answer the following question: -> 1. What does the `Infer` statement in `dataAnalysis` return? -> 2. What does the `Infer` statement in `detectingBlickets` return? - -1. 
The posterior probabilities of params and predictions, fitted to the data. -2. An inference of what people will say given the evidence they see. - -~~~~ -///fold: - -// alternative proposal distribution for metropolis-hastings algorithm -var uniformKernel = function(prevVal) { - return Uniform({a: prevVal - 0.2, b: prevVal + 0.2}); -} - -var toProbs = function(predictions) { - var labels = map(function(i) { "predictive: cond" + i + " P(true)" }, - _.range(1, predictions.length + 1)) - var probs = map(function(model) {return Math.exp(model.score(true))}, predictions); - return _.zipObject(labels, probs); -} - -var dataSummary = function(data) { _.map(data, _.mean) }; - -var predictiveSummary = function(model) { - var labels = map(function(i) {return "predictive: cond" + i + " P(true)"}, - _.range(1, 6)); - return map(function(label) { - return expectation(model, function(s) { - return s[label] - }); - }, labels); -}; -/// - -// 5 experiment conditions / stimuli -var possibleEvidenceStream = [ - [['A']], - [['A', 'B']], - [['A', 'B'], ['B']], - [['A', 'B'], ['A', 'B']], - [[]] -]; - -// for each condition. -// note: the question is always "is A a blicket?" -var data = [ - repeat(10, function() { true }).concat(false), - repeat(6 , function() { true }).concat(repeat(5, function() { false })), - repeat(4, function() { true }).concat(repeat(7, function() { false })), - repeat(8, function() { true }).concat(repeat(3, function() { false })), - repeat(2, function() { true }).concat(repeat(9, function() { false })) -]; - -// Same model as above, but parameterized -var detectingBlickets = mem(function(evidence, params) { - return Infer({method: 'enumerate'}, function() { - var blicket = mem(function(block) { flip(params.blicketBaseRate) }); - var power = function(block) { blicket(block) ? params.blicketPower : params.nonBlicketPower }; - var machine = function(blocks) { - return (blocks.length == 0 ? 
- flip(params.machineSpontaneouslyGoesOff) : - flip(power(first(blocks))) || machine(rest(blocks))); - }; - map(function(blocks){condition(machine(blocks))}, evidence); - return blicket('A'); - }) -}) - -var dataAnalysis = Infer({method: 'MCMC', - samples: 5000, - callbacks: [editor.MCMCProgress()]}, - function() { - var params = { - blicketBaseRate: sample(Uniform({a: 0, b: 1}), {driftKernel: uniformKernel}), - blicketPower: sample(Uniform({a: 0, b: 1}), {driftKernel: uniformKernel}), - nonBlicketPower: sample(Uniform({a: 0, b: 1}), {driftKernel: uniformKernel}), - machineSpontaneouslyGoesOff: sample(Uniform({a: 0, b: 1}), {driftKernel: uniformKernel}) - } - - var cognitiveModelPredictions = map(function(evidence) { - return detectingBlickets(evidence, params) - }, possibleEvidenceStream); - - // observe each data point under the model's predictions - map2(function(dataForStim, modelPosterior) { - map(function(dataPoint) { - observe(modelPosterior, dataPoint); - }, dataForStim) - }, data, cognitiveModelPredictions); - - var predictives = toProbs(cognitiveModelPredictions); - return _.extend(params, predictives); -}) - -viz.marginals(dataAnalysis); -viz.scatter(predictiveSummary(dataAnalysis), dataSummary(data), - {xLabel: 'model', yLabel: 'data'}); -~~~~ - -### Exercise 3.3 - -> Now, run the program. -> [Note: This will take between 15-30 seconds to run.] -> Interpret each of the resulting plots. - -`blicketBaseRate` | blickets are common, but not *that* common -`blicketPower` | blickets rarely fail to set off the detector -`nonBlicketPower` | non-blickets might *rarely* set off the detector -`machineSpontaneouslyGoesOff` | the detector might *rarely* just go off for no reason -`predictive: cond1 P(true)` | `A` is probably a blicket... 
-`predictive: cond2 P(true)` | `A` is slightly more likely to be a blicket -`predictive: cond3 P(true)` | no idea if `A` is a blicket -`predictive: cond4 P(true)` | `A` is slightly more likely to be a blicket -`predictive: cond5 P(true)` | `A` is probably not a blicket -model (`x`) vs. data (`y`) | We can accurately guess people's response from model, but they're not exactly 1-1 - -### Exercise 3.4 - -> How do the posterior parameter values relate to the parameter values that were set in the original program? - -The original program's parameter values were approximately the expected value of the posterior parameter values. - -### Exercise 3.5 - -> Look carefully at the priors (in the code) and the posteriors (in the plots) over `blicketPower` and `nonBlicketPower`. -> Were there any a priori assumptions about the relationship between these parameters in the experimental setup? -> Do you think we would be justified in imposing any assumptions to the model? -> Consider the posterior distributions. -> How was the data analysis model able to find the relationship between these parameters? - -The experiment assumes that blickets make the machine go off (it's what the kids were told), but the model makes -no such a priori assumptions, i.e. `blicketPower` > `nonBlicketPower`, etc. -However, since kids show they know this from the responses they gave -(when `A` makes the machine go off most of the time, they call it a blicket, not a non-blicket), -the inference model can learn this asymmetry from the data. -We can test this by switching the `true` and `false` responses that kids give. 
- -~~~~ -var data = [ - repeat(10, function(){return false}).concat(true), - repeat(6 , function(){return false}).concat(repeat(5, function(){return true})), - repeat(4, function(){return false}).concat(repeat(7, function(){return true})), - repeat(8, function(){return false}).concat(repeat(3, function(){return true})), - repeat(2, function(){return false}).concat(repeat(9, function(){return true})) -]; -~~~~ - -When we do that, we see that `nonBlicketPower` is greater than `blicketPower` in the posteriors. - -Leaving this relationship for the model to infer is a nice sanity check. -It's cool that we can learn the appropriate relationship (`blicketPower > nonBlicketPower`) from the data, -but it would be OK to code it in. -It wasn't a key part of our theory, and we're pretty confident that kids understood how blickets worked. - -### Exercise 3.6 - -> Do you notice anything about the scatter plot? -> How would you interpret this? -> Is there something we could add to the data analysis model to account for this? - -There seems to be a linear relationship between the model predictions and the data, but the values are not always equal. -If we add some scaling factor, we could translate the model outputs to get to accurate predictions of people's responses. - -### Exercise 3.7 - -> Now, we're going to examine the predictions of the model if we had done a more traditional analysis of point-estimates of parameters (i.e. fitting parameters). -> Examine your histograms and determine the "maximum a posteriori" (MAP) value for each parameter. -> Plug those into the code below and run it. 
- -~~~~ -///fold: - -var toProbs = function(predictions) { - var labels = map(function(i) { "predictive: cond" + i + " P(true)" }, - _.range(1, predictions.length + 1)) - var probs = map(function(model) {return Math.exp(model.score(true))}, predictions); - return _.zipObject(labels, probs); -} - -var dataSummary = function(data) { _.map(data, _.mean) }; - -// 5 experiment conditions / stimuli -var possibleEvidenceStream = [ - [['A']], - [['A', 'B']], - [['A', 'B'], ['B']], - [['A', 'B'], ['A', 'B']], - [[]] -]; - -// for each condition. -// note: the question is always "is A a blicket?" -var data = [ - repeat(10, function() { true }).concat(false), - repeat(6 , function() { true }).concat(repeat(5, function() { false })), - repeat(4, function() { true }).concat(repeat(7, function() { false })), - repeat(8, function() { true }).concat(repeat(3, function() { false })), - repeat(2, function() { true }).concat(repeat(9, function() { false })) -]; - -var detectingBlickets = mem(function(evidence, params) { - return Infer({method: 'enumerate'}, function() { - var blicket = mem(function(block) { flip(params.blicketBaseRate) }); - var power = function(block) { blicket(block) ? params.blicketPower : params.nonBlicketPower }; - var machine = function(blocks) { - return (blocks.length == 0 ? - flip(params.machineSpontaneouslyGoesOff) : - flip(power(first(blocks))) || machine(rest(blocks))); - }; - map(function(blocks){condition(machine(blocks))}, evidence); - return blicket('A'); - }) -}) -/// - -var params = { // some possible MAP values - blicketBaseRate : 0.43, - blicketPower: .97, - nonBlicketPower: .04, - machineSpontaneouslyGoesOff: .05 -}; - -var bestFitModelPredictions = map(function(evidence) { - return Math.exp(detectingBlickets(evidence, params).score(true)); -}, possibleEvidenceStream); - -viz.scatter(bestFitModelPredictions, dataSummary(data)); -~~~~ - -### Exercise 3.8 - -> What can you conclude about the two ways of looking at parameters in this model's case? 
- -The model predictions fits the data better using the full posterior distributions than just the MAP point estimates. diff --git a/solutions/conditional-dependence.md b/solutions/conditional-dependence.md deleted file mode 100644 index 1b4c4a5..0000000 --- a/solutions/conditional-dependence.md +++ /dev/null @@ -1,174 +0,0 @@ ---- -layout: exercise -title: conditional dependence - solutions ---- - -## Exercise 1: Epidemiology - -> Imagine that you are an epidemiologist and you are determining people's cause of death. -> In this simplified world, there are two main diseases, cancer and the common cold. -> People rarely have cancer, $$p( \text{cancer}) = 0.00001$$, but when they do have cancer, it is often fatal, $$p( \text{death} \mid \text{cancer} ) = 0.9$$. -> People are much more likely to have a common cold, $$p( \text{cold} ) = 0.2$$, but it is rarely fatal, $$p( \text{death} \mid \text{cold} ) = 0.00006$$. -> Very rarely, people also die of other causes $$p(\text{death} \mid \text{other}) = 0.000000001$$. -> -> Write this model in WebPPL and use `Infer` to answer these questions (Be sure to include your code in your answer): - -~~~~ -viz.table(Infer({method: 'enumerate'}, function() { - var cancer = flip(0.00001); - var cold = flip(0.2); - var death_by_cancer = cancer && flip(0.9); - var death_by_cold = cold && flip(0.00006); - var other_death = flip(0.000000001); - var death = death_by_cancer || death_by_cold || other_death; - return {cancer: cancer, cold: cold, death: death}; -})); -~~~~ - -### a) - -> Compute $$p( \text{cancer} \mid \text{death} , \text{cold} )$$ and $$p( \text{cancer} \mid \text{death} , \text{no cold} )$$. -> How do these probabilities compare to $$p( \text{cancer} \mid \text{death} )$$ and $$p( \text{cancer} )$$? -> Using these probabilities, give an example of explaining away. 
- -| Event | Prob | -| ------------------------- | ---- | -| Prior | 0.00001| -| Given death | 0.42855| -| Given death and cold | 0.13043| -| Given death and no cold | 0.99989| - -Having a cold explains away the death. -Given only the information that a person died, cancer is relatively likely. -When we learn the person also had a cold, this probability of cancer goes down, not down to prior levels, but pretty unlikely. -If we instead learn that the person died and did not have a cold, we become almost certain that the person died of cancer. - - -~~~~ -display("prior") -viz.table(Infer({method: 'enumerate'}, function() { - var cancer = flip(0.00001); - var cold = flip(0.2); - var death_by_cancer = cancer && flip(0.9); - var death_by_cold = cold && flip(0.00006); - var other_death = flip(0.000000001); - var death = death_by_cancer || death_by_cold || other_death; - return cancer; -})); -~~~~ - -~~~~ -display("death") -viz.table(Infer({method: 'enumerate'}, function() { - var cancer = flip(0.00001); - var cold = flip(0.2); - var death_by_cancer = cancer && flip(0.9); - var death_by_cold = cold && flip(0.00006); - var other_death = flip(0.000000001); - var death = death_by_cancer || death_by_cold || other_death; - condition(death); - return cancer; -})); -~~~~ - -~~~~ -display("death and cold") -viz.table(Infer({method: 'enumerate'}, function() { - var cancer = flip(0.00001); - var cold = flip(0.2); - var death_by_cancer = cancer && flip(0.9); - var death_by_cold = cold && flip(0.00006); - var other_death = flip(0.000000001); - var death = death_by_cancer || death_by_cold || other_death; - condition(death && cold) - return cancer; -})); -~~~~ - -~~~~ -display("death and no cold") -viz.table(Infer({method: 'enumerate'}, function() { - var cancer = flip(0.00001); - var cold = flip(0.2); - var death_by_cancer = cancer && flip(0.9); - var death_by_cold = cold && flip(0.00006); - var other_death = flip(0.000000001); - var death = death_by_cancer || death_by_cold || 
other_death; - condition(death && !cold) - return cancer; -})); -~~~~ - -### b) - -> Compute $$p( \text{cold} \mid \text{death} , \text{cancer} )$$ and $$p( \text{cold} \mid \text{death} , \text{no cancer} )$$. -> How do these probabilities compare to $$p( \text{cold} \mid \text{death} )$$ and $$p( \text{cold} )$$? -> Using these probabilities, give an example of explaining away. - -| Event | Prob | -| ------------------------- | ---- | -| Prior | 0.20 | -| Given death | 0.66 | -| Given death and cancer | 0.20 | -| Given death and no cancer | 0.99 | - -Having cancer *really* explains away the death. -Given only the information that a person died, a cold is very likely. -When we learn the person also had cancer, this probability goes back down to almost exactly the prior. -If we instead learn that the person *didn't* have cancer, we become almost certain they died of a cold. - - -~~~~ -display("prior") -viz.table(Infer({method: 'enumerate'}, function() { - var cancer = flip(0.00001); - var cold = flip(0.2); - var death_by_cancer = cancer && flip(0.9); - var death_by_cold = cold && flip(0.00006); - var other_death = flip(0.000000001); - var death = death_by_cancer || death_by_cold || other_death; - return cold; -})); -~~~~ - -~~~~ -display("death") -viz.table(Infer({method: 'enumerate'}, function() { - var cancer = flip(0.00001); - var cold = flip(0.2); - var death_by_cancer = cancer && flip(0.9); - var death_by_cold = cold && flip(0.00006); - var other_death = flip(0.000000001); - var death = death_by_cancer || death_by_cold || other_death; - condition(death); - return cold; -})); -~~~~ - -~~~~ -display("death and cancer") -viz.table(Infer({method: 'enumerate'}, function() { - var cancer = flip(0.00001); - var cold = flip(0.2); - var death_by_cancer = cancer && flip(0.9); - var death_by_cold = cold && flip(0.00006); - var other_death = flip(0.000000001); - var death = death_by_cancer || death_by_cold || other_death; - condition(death && cancer) - return cold; -})); 
-~~~~ - -~~~~ -display("death and no cancer") -viz.table(Infer({method: 'enumerate'}, function() { - var cancer = flip(0.00001); - var cold = flip(0.2); - var death_by_cancer = cancer && flip(0.9); - var death_by_cold = cold && flip(0.00006); - var other_death = flip(0.000000001); - var death = death_by_cancer || death_by_cold || other_death; - condition(death && !cancer) - return cold; -})); -~~~~ diff --git a/solutions/conditioning.md b/solutions/conditioning.md deleted file mode 100644 index c3dff8c..0000000 --- a/solutions/conditioning.md +++ /dev/null @@ -1,555 +0,0 @@ ---- -layout: exercise -title: Conditioning - solutions -custom_js: -- assets/js/box2d.js -- assets/js/physics.js ---- - - -## Exercise 1: Fair coins and biased coins - -### a) - -> I flip a fair coin. What is the probability that it lands heads? - -0.5 - -~~~~ -var model = function() { - return flip() ? "H" : "T" -} - -var logProb = Infer({method:'enumerate'}, model).score('H'); -Math.exp(logProb); -~~~~ - - -### b) - -> I also have a biased coin, with $$P(\text{heads})=0.9$$. -> I hand you one of the coins (either biased or fair) without telling you which. -> You flip it three times. -> -> Given that first two coin flips landed on heads, what is the posterior distribution for the next flip? - -P(Heads) = 0.8056603773584906 - -~~~~ -var flipCoin = function(coinType) { - return coinType == "fair" ? flip() : flip(0.9); -} - -var model = function() { - var coinType = flip() ? "fair" : "biased"; - - var flip1 = flipCoin(coinType); - var flip2 = flipCoin(coinType); - var flip3 = flipCoin(coinType); - - // first 2 flips are `true` - condition(flip1 && flip2); - - // what is the next flip going to be? - return flip3; -} - -viz.table(Infer({method:'enumerate'}, model)); -~~~~ - - -### c) - -> Given that all three flips landed on heads, what is the probability that the coin was biased? - -P(biased) = 0.8536299765807963 - -~~~~ -var flipCoin = function(coinType) { - return coinType == "fair" ? 
flip() : flip(0.9); -} - -var model = function() { - var coinType = flip() ? "fair" : "biased"; - - var flip1 = flipCoin(coinType); - var flip2 = flipCoin(coinType); - var flip3 = flipCoin(coinType); - - // first 2 flips are `true` - condition(flip1 && flip2 && flip3); - - // what is the next flip going to be? - return coinType; -} - -viz.table(Infer({method:'enumerate'}, model)); -~~~~ - - -### d) - -> Given that the first two flips were different, what is the probability that the third flip will be heads? - -P(Heads) = 0.6058823529411763 - -~~~~ -var flipCoin = function(coinType) { - return coinType == "fair" ? flip() : flip(0.9); -} - -var model = function() { - var coinType = flip() ? "fair" : "biased"; - - var flip1 = flipCoin(coinType); - var flip2 = flipCoin(coinType); - var flip3 = flipCoin(coinType); - - // first 2 flips are `true` - condition(flip1 != flip2); - - // what is the next flip going to be? - return flip3; -} - -viz.table(Infer({method:'enumerate'}, model)); -~~~~ - - -## Exercise 2: Conditioning and Intervention - -> In the earlier [Medical Diagnosis]({{site.baseurl}}/chapters/02-generative-models.html#example-causal-models-in-medical-diagnosis) section we suggested understanding the patterns of symptoms for a particular disease by changing the prior probability of the disease such that it is always true (also called the *do* operator). - - -### a) - -> Show that *intervening* (setting) on `lungCancer` has the same effect as *conditioning* on `lungCancer` has the same effect on `cough` in this example. -> Create a table showing the marginal probabilities. -> What must be true about the causal structure for this to be the case? - -`lungCancer` is a cause of `cough` and it's not causally dependent on any other variable in the program. 
- -~~~~ -// original -display("original"); -viz.table(Infer({method: "enumerate"}, function() { - var lungCancer = flip(0.01); - var cold = flip(0.2); - var cough = ( - (cold && flip(0.5)) || - (lungCancer && flip(0.3)) - ); - return cough; -})) - -// intervention -display("intervention"); -viz.table(Infer({method: "enumerate"}, function() { - var lungCancer = true; - var cold = flip(0.2); - var cough = ( - (cold && flip(0.5)) || - (lungCancer && flip(0.3)) - ); - return cough; -})) - - -// conditioning -display("conditioning"); -viz.table(Infer({method: "enumerate"}, function() { - var lungCancer = flip(0.01); - condition(lungCancer); - var cold = flip(0.2); - var cough = ( - (cold && flip(0.5)) || - (lungCancer && flip(0.3)) - ); - return cough; -})) -~~~~ - - -### b) - -> This time, modify the program so that intervening and conditioning produce different results. Under what circumstances does intervening produce different results from conditioning? -> Create a table showing the marginal probabilities. -> -> *Hint:* you do not need to introduce any new variables. Think about what other questions you can ask in this example. - -Conditioning on a causally downstream variable can inform us about what the value of the causal parent *might have been*, but intervention breaks that conditional dependence. 
- -~~~~ -// original -display("original"); -viz.table(Infer({method: "enumerate"}, function() { - var lungCancer = flip(0.1); - var cold = flip(0.2); - var cough = ( - (cold && flip(0.5)) || - (lungCancer && flip(0.3)) - ); - return lungCancer; -})) - -// intervention -display("intervention"); -viz.table(Infer({method: "enumerate"}, function() { - var lungCancer = flip(0.1); - var cold = flip(0.2); - var cough = true; - return lungCancer; -})) - -// conditioning -display("conditioning"); -viz.table(Infer({method: "enumerate"}, function() { - var lungCancer = flip(0.1); - var cold = flip(0.2); - var cough = ( - (cold && flip(0.5)) || - (lungCancer && flip(0.3)) - ); - condition(cough); - return lungCancer; -})) -~~~~ - - -## Exercise 3: Computing marginals - -> Find the marginal distribution of the return values from these programs mathematically (by hand). - - -### a) - - - - -$$ P(a \mid a \lor b) = \frac{ P(a \land (a \lor b)) } { P(a \lor b) } = \frac{P(a)} {P(a \lor b)} = \frac{0.5} {1 - P(!a \land !b)} = \frac{0.5} {1 - (0.5)\cdot(0.5)} = 2/3 $$ - - -~~~~ -viz.table(Infer({method: "enumerate"}, function() { - var a = flip(); - var b = flip(); - condition(a || b); - return a; -})); -~~~~ - - -### b) - -~~~~ -var smilesModel = function() { - var nice = mem(function(person) { flip(.7) }); - - var smiles = function(person) { - return nice(person) ? 
flip(.8) : flip(.5); - } - - condition(smiles('alice') && smiles('bob') && smiles('alice')); - - return nice('alice'); -} - -viz.table(Infer({method: "enumerate"}, smilesModel)); -~~~~ - -Using Bayes rule: - -$$ P(N_A \mid S_A, S_B, S_A) \propto P(S_A, S_B, S_A \mid N_A) P(N_A) $$ - -Alice is nice: - -$$ P(S_A | N_A)^2 P(S_B | N_A) P(N_A) = P(S_A | N_A)^2 \left(P(S_B | N_B)P(N_B) + P(S_B | !N_B)P(!N_B)\right) P(N_A) = 0.31808 $$ - -Alice isn't nice: - -$$ P(S_A | !N_A)^2 P(S_B | !N_A) P(!N_A) = P(S_A | !N_A)^2 \left(P(S_B | N_B)P(N_B) + P(S_B | !N_B)P(!N_B)\right) P(!N_A) = 0.05325 $$ - -Normalize: - -$$ P(N_A \mid S_A, S_B, S_A) = 0.31808 / (0.31808 + 0.05325) = 0.85659655831 $$ - - -## Exercise 4: Extending the smiles model - -### a) - -> Describe (using ordinary English) the `smilesModel` program in Exercise 3b. - -Most people are nice. Nice people smile a lot, other people smile less. Alice smiled twice (and Bob smiled once). Is Alice nice? - - -### b) - -> Extend `smilesModel` to create a version of the model considers two additional factors: - -> 1. People will smile 80% of the time if they want something from you and 50% if they do not. -> 2. *Nice* people will only want something from you 20% of the time; non-nice people 50% of the time. - -> Don't forget that nice people also smile more often! - -> *Hint:* Which variables change at different times for the same person? Which values *depend* on other values? - -~~~~ -var extendedSmilesModel = function() { - var nice = mem(function(person) { flip(.7) }); - - var wantsSomething = function(person) { - return flip(nice(person) ? .2 : .5); - } - - var smiles = function(person, wants) { - return (wants ? flip(.8) : flip(.5)) - || (nice(person) ? flip(.8) : flip(.5)); - } - - var wants = wantsSomething('alice'); - return smiles('alice', wants); -} - -Infer({method: "enumerate"}, extendedSmilesModel); -~~~~ - -Note that smiles now has two possible causes. 
-Being nice makes you more likely to smile and, separately, wanting something makes you more likely to smile. -Using the OR operator here captures the intuition that either one is sufficient to make someone more likely to smile. -Critically, being nice is a persistant property of a person and is therefore held constant within an execution using `mem` while wanting something is circumstantial: the same person may want something on one occasion and not another. -Finally, by making smiles a function of a person and *whether they want something* at a given time (as opposed to calling `wantsSomething` inside smiles), we can query a particular instance of wanting something without flipping separate coins outside and inside. - - -### c) - -> Suppose you've seen Bob five times this week and each time, he was not smiling. But today, you see Bob and he *is* smiling. Use this `extendedSmilesModel` model to compute the posterior belief that Bob wants something from you today. - -> *Hint:* How will you represent the same person (Bob) smiling *multiple times*? What features of Bob will stay the same each time he smiles (or doesn't) and what features will change? - -~~~~ -var extendedSmilesModel = function() { - var nice = mem(function(person) { flip(.7) }); - - var wantsSomething = function(person) { - return flip(nice(person) ? .2 : .5); - } - - var smiles = function(person, wants) { - return (wants ? flip(.8) : flip(.5)) - || (nice(person) ? flip(.8) : flip(.5)); - } - - var wantsToday = wantsSomething('bob'); - - condition(!smiles('bob', wantsSomething('bob'))); // no smile on day 1 - condition(!smiles('bob', wantsSomething('bob'))); // no smile on day 2 - condition(!smiles('bob', wantsSomething('bob'))); // no smile on day 3 - condition(!smiles('bob', wantsSomething('bob'))); // no smile on day 4 - condition(!smiles('bob', wantsSomething('bob'))); // no smile on day 5 - condition(smiles('bob', wantsToday)); // smiles today! 
- - return wantsToday; -} - -viz.table(Infer({method: "enumerate"}, extendedSmilesModel)); -~~~~ - -We condition on all the data that we have; bob did not smile the previous 5 times, but then smiled today. -Again, because wantsSomething is not memoized, each of these observations is independent. -We have uncertainty over whether bob wanted something on *every* day, but we're only interested in whether he wanted something on the day that he smiled, thus why we store that value and return it at the end. - - -## Exercise 5: Sprinklers and Rain - -### a) - -> I have a particularly bad model of the sprinkler in my garden. -> It is supposed to water my grass every morning, but it turns on only half the time (at random, as far as I can tell). -> Fortunately, I live in a city where it also rains 30% of days. -> -> One day I check my lawn and see that it is wet, meaning that either it rained that morning or my sprinkler turned on (or both). -> -> Answer the following questions, either using the Rules of Probability or by writing your own sprinkler model in webppl. -> -> * What is the probability that it rained? -> * What is the probability that my sprinkler turned on? - -~~~~ -display("rain") -viz.table(Infer({method: "enumerate"}, function() { - var sprinkler = flip(); - var rain = flip(0.3); - var wetLawn = sprinkler || rain; - condition(wetLawn); - return rain; -})) - -display("sprinkler") -viz.table(Infer({method: "enumerate"}, function() { - var sprinkler = flip(); - var rain = flip(0.3); - var wetLawn = sprinkler || rain; - condition(wetLawn); - return sprinkler; -})) -~~~~ - - -### c) - -> My neighbour Kelsey, who has the same kind of sprinkler, tells me that her lawn was also wet that same morning. -> What is the new posterior probability that it rained? 
- -~~~~ -viz.table(Infer({method: "enumerate"}, function() { - var rain = flip(0.3); - var mySprinkler = flip(); - var herSprinkler = flip(); - var myLawnIsWet = mySprinkler || rain; - var herLawnIsWet = herSprinkler || rain; - condition(myLawnIsWet && herLawnIsWet); - return rain; -})) -~~~~ - - -### d) - -> To investigate further we poll a selection of our friends who live nearby, and ask if their grass was wet this morning. -> Kevin and Manu and Josh, each with the same sprinkler, all agree that their lawns were wet too. -> Write a model to reason about all 5 people (including me and Kelsey), and then use it to find the probability that it rained. - -~~~~ -viz.table(Infer({method: "enumerate"}, function() { - var rain = flip(0.3); - - var sprinkler = mem(function(person) { return flip() }); - var wetLawn = function(person) { return rain || sprinkler(person) }; - - condition(wetLawn("me")); - condition(wetLawn("Kelsey")); - condition(wetLawn("Kevin")); - condition(wetLawn("Manu")); - condition(wetLawn("Josh")); - return rain; -})) -~~~~ - - -## Exercise 6: Casino game - -> Consider the following game. -A machine randomly gives Bob a letter of the word "game" with and Bob has a different probability of winning depending on which letter he got: -> -> | $$h$$ | $$p(h)$$ | $$p(\text{win}\mid h)$$ | $$p(h \mid \text{win})$$ | -| ----- | -------- | ----------------------- |------------------------- | -| g | 0.05 | 1 | | -| a | 0.45 | 1/4 | | -| m | 0.05 | 1/9 | | -| e | 0.45 | 1/16 | | -> -> Suppose that we observe Bob winning, but we don't know what letter he got. -How can we use the observation that he won to update our beliefs about which letter he got? -Let's express this formally. -Before we begin, a bit of terminology: the set of letters that Bob could have gotten, $$\{g, a, m, e\}$$, is called the *hypothesis space* -- it's our set of hypotheses about the letter. - - -### a) - -> In English, what does the posterior probability $$p(h \mid \text{win})$$ represent? 
- -Given that Bob wins, which letter did he probably draw? - -> What does it mean for a letter to have the highest posterior? - -If we had to guess a letter, the letter with the highest posterior would be the best one. It's both likely to be drawn a priori (because it's a vowel) and likely to result in a win if Bob drew it. - - -### b) - -> Manually compute $$p(h \mid \text{win})$$ for each hypothesis. -> Remember to normalize --- make sure that summing all your $$p(h \mid \text{win})$$ values gives you 1. - -Using Bayes rule, - -$$ P(h \mid \text{win}) \propto P(h) \cdot P(\text{win} \mid h) $$ - -Let $$Z$$ be the sum of $$ P(h) \cdot P(\text{win} \mid h) $$ across all values of $$h$$. - -| $$h$$ | $$p(h)$$ | $$p(\text{win}\mid h)$$ | $$p(h \mid \text{win})$$ | -| ----- | -------- | ------------------------ |------------------------- | -| g | 0.05 | 1 | 0.05 / Z = 0.255 | -| a | 0.45 | 1/4 | 0.45/4 / Z = 0.573 | -| m | 0.05 | 1/9 | 0.05/9 / Z = 0.028 | -| e | 0.45 | 1/16 | 0.45/16 / Z = 0.143 | - - -### c) - -> Now, let's write this model in WebPPL using `Infer`. -Fill in the `...`'s in the code below to compute $$p(h \mid \text{win})$$. -Include a screenshot of the resulting graph. -> -> It might be helpful to comment out the `condition` statement so you can compare visually the prior (no `condition` statement) to the posterior (with `condition`). -> -> Make sure that your WebPPL answers and hand-computed answers agree -- note that this demonstrates the equivalence between the program view of conditional probability and the distributional view. - -~~~~ -// define some variables and utility functions -var checkVowel = function(letter) { _.includes(['a', 'e', 'i', 'o', 'u'], letter) }; -var letterVals = ['g', 'a', 'm', 'e']; -var letterProbs = map(function(letter) { checkVowel(letter) ? 
0.45 : 0.05 }, letterVals); -var letters = Categorical({vs: letterVals, ps: letterProbs}); - -// Compute p(h | win) -var distribution = Infer({method: 'enumerate'}, function() { - var letter = sample(letters); - var position = letterVals.indexOf(letter) + 1; - var winProb = 1 / Math.pow(position, 2); - condition(flip(winProb)); - return letter; -}); - -viz.auto(distribution); -viz.table(distribution); -~~~~ - - -### d) - -Which is higher, $$p(\text{vowel} \mid \text{win})$$ or $$p(\text{consonant} \mid \text{win})$$? -Answer this using the WebPPL code you wrote *Hint:* use the `checkVowel` function. - -~~~~ -// define some variables and utility functions -var checkVowel = function(letter) { _.includes(['a', 'e', 'i', 'o', 'u'], letter) }; -var letterVals = ['g', 'a', 'm', 'e']; -var letterProbs = map(function(letter) { checkVowel(letter) ? 0.45 : 0.05 }, letterVals); -var letters = Categorical({vs: letterVals, ps: letterProbs}); - -// Compute p(h | win) -var distribution = Infer({method: 'enumerate'}, function() { - var letter = sample(letters); - var position = letterVals.indexOf(letter) + 1; - var winProb = 1 / Math.pow(position, 2); - condition(flip(winProb)); - return checkVowel(letter); -}); - -viz.table(distribution); -~~~~ - -A vowel is more likely ($$P(vowel) = 0.7168141592920354$$) than a consonant ($$P(vowel) = 0.28318584070796465 $$) - - -### e) - -> What difference do you see between your code and the mathematical notation? -> What are the advantages and disadvantages of each? -> Which do you prefer? - -The mathematical notation is more precise in some cases (we might get some rounding errors on the computer), but code is less error prone, easier to think about, and much easier to extend. It would be tedious to do this with all the letters of the alphabet instead by hand compared to using code. 
diff --git a/solutions/dependence.md b/solutions/dependence.md deleted file mode 100644 index 8b75dfe..0000000 --- a/solutions/dependence.md +++ /dev/null @@ -1,86 +0,0 @@ ---- -layout: exercise -title: Patterns of inference - solutions ---- - -## Exercise 1: Causal and statistical dependency. - -> For each of the following programs: -> -> * Draw the dependency diagram (Bayes net). If you don't have software on your computer for doing this, Google Docs has a decent interface for creating drawings. -> -> * Use informal evaluation order reasoning and the intervention method to determine causal dependency between A and B. -> -> * Use conditioning to determine whether A and B are statistically dependent. - -### a) - -~~~~ -var a = flip(); -var b = flip(); -var c = flip(a && b ? .8 : .5); -~~~~ - -neither causally dependent nor statistically dependent - -![](../assets/img/04_01_a.png) - -### b) - -~~~~ -var a = flip(); -var b = flip(a ? .9 : .2); -var c = flip(b ? .7 : .1); -~~~~ - -both causally dependent *and* statistically dependent - -![](../assets/img/04_01_b.png) - -### c) - -~~~~ -var a = flip(); -var b = flip(a ? .9 : .2); -var c = flip(a ? .7 : .1); -~~~~ - -both causally dependent *and* statistically dependent - -![](../assets/img/04_01_c.png) - -### d) - -~~~~ -var a = flip(.6); -var c = flip(.1); -var z = flip() ? a : c; -var b = z ? 'foo' : 'bar'; -~~~~ - -both causally dependent *and* statistically dependent - -![](../assets/img/04_01_d.png) - -### e) - -statistically dependent but *not* causally dependent - -~~~~ -var examFairPrior = Bernoulli({p: .8}); -var doesHomeworkPrior = Bernoulli({p: .8}); -var examFair = mem(function(exam) { return sample(examFairPrior) }); -var doesHomework = mem(function(student) { return sample(doesHomeworkPrior) }); - -var pass = function(student, exam) { - return flip(examFair(exam) ? - (doesHomework(student) ? .9 : .5) : - (doesHomework(student) ? 
.2 : .1)); -} -var a = pass('alice', 'historyExam'); -var b = pass('bob', 'historyExam'); -~~~~ - -![](../assets/img/04_01_e.png) - - diff --git a/solutions/generative-models.md b/solutions/generative-models.md deleted file mode 100644 index e107bb4..0000000 --- a/solutions/generative-models.md +++ /dev/null @@ -1,578 +0,0 @@ ---- -layout: exercise -title: Generative models - solutions -description: Generative models -custom_js: -- assets/js/box2d.js -- assets/js/physics.js ---- - -## Exercise 1 - -### a) - -> Show mathematically that the marginal distribution on return values for these three programs is the same by directly computing the probability using the rules of probability -> (hint: write down each possible history of random choices for each program). - -~~~~ -flip() ? flip(.7) : flip(.1) -~~~~ - -* flip a coin: - * 0.5 heads -> flip a coin: - * 0.7 heads - * 0.3 tails - * 0.5 tails -> flip a coin: - * 0.1 heads - * 0.9 tails - -* ---> heads: $$(0.5)(0.7) + (0.5)(0.1) = 0.4$$ -* ---> tails: $$(0.5)(0.3) + (0.5)( 0.9 )= 0.6$$ - -~~~~ -flip(flip() ? .7 : .1) -~~~~ - -* flip a coin: - * 0.5 heads -> 0.7 input to... -> flip a coin: - * 0.7 heads - * 0.3 tails - * 0.5 tails -> 0.1 input to... -> flip a coin: - * 0.1 heads - * 0.9 tails - -* ---> heads: $$(0.5)(0.7) + (0.5)(0.1) = 0.4$$ -* ---> tails: $$(0.5)(0.3) + (0.5)( 0.9 )= 0.6$$ - -~~~~ -flip(.4) -~~~~ - -* flip a coin: - * 0.4 heads - * 0.6 tails - -### b) - -> Check your answers by sampling from the programs, 1000 times each, and plotting the results. - -~~~~ -viz(repeat(1000, function() { - return flip() ? flip(.7) : flip(.1) -})) -~~~~ - -~~~~ -viz(repeat(1000, function() { - return flip(flip() ? .7 : .1) -})) -~~~~ - -~~~~ -viz(repeat(1000, function() { return flip(.4) })) -~~~~ - -### c) - -> Write another new program with the same marginal distribution on return values that looks different from all of the above programs. - -~~~~ -flip() ? 
false : flip(.8) -~~~~ - -## Exercise 2 - -### a) - -> Explain why (in terms of the evaluation process) these two programs give different answers (i.e. have different distributions on return values). - -~~~~ -var foo = flip() -display([foo, foo, foo]) -~~~~ - -~~~~ -var foo = function() { return flip() } -display([foo(), foo(), foo()]) -~~~~ - -In the first program, the variable `foo` is assigned to a value once, the literal boolean value that `flip()` happens to return the one time it's run. -So the only possible outputs are `[true, true, true]` and `[false, false, false]`. - -In the second program, the variable `foo` is the name of a function, which is run three separate times. Each time, the literal boolean value that's returned can be different. -So any combination of three `true` and `false` values is possible as output. - -### b) - -> Modify the second program using `mem` so that it has the same distribution as the first program. - -~~~~ -var foo = mem(function() { return flip() }) -display([foo(), foo(), foo()]) -~~~~ - -### c) - -> Change the program in Part B so that the first two elements in the list are always the same as each other, but the third element can be different. -> *Optional challenge:* try to do this by adding only these 4 characters: `x`, `0`, `0`, and `1`. - -~~~~ -var foo = mem(function(x) { return flip() }) -display([foo(0), foo(0), foo(1)]) -~~~~ - -## Exercise 3 - -### a) - -> Which of these programs would be more likely to generate the following proportions for 100 values of C? -> Justify your response. - -
- -~~~~ -// Program "A" -var A = flip() -var B = flip(0.9) -var C = flip() ? A && B : A || B -display([A, B, C]) -~~~~ - -~~~~ -// Program "B" -var A = flip(0.9); -var B = A && flip(0.9) -var C = B && flip(0.9) -display([A, B, C]) -~~~~ - -Program "A", because $$(0.5)(0.5)(0.9) + (0.5)(1 - (0.5)(0.1)) = 0.7$$, whereas $$(0.9)(0.9)(0.9) = 0.729$$. - -### b) - -> Could the program you did *not* choose in Part A have *also* generated those return values? Explain. - -Yes. Execution of the program is *random*, so different numbers of `true` and `false` answers are possible for 100 samples. - -## Exercise 4 - -> In the simple medical diagnosis example, we imagined a generative process for diseases and symptoms of a single patient. -> In this exercise, we'll write a version of that model that represents the diseases and symptoms of many patients. - -### a) - -> Let's look at just two common conditions (a cold and allergies) and just two symptoms (sneeze and fever), and let's assume that symptoms are deterministic. - -~~~~ -var allergies = flip(0.3) -var cold = flip(0.2) - -var sneeze = cold || allergies -var fever = cold - -display([sneeze, fever]) -~~~~ - -> Under this model, what is the probability that the patient is sneezing? What is the probability that the patient is sneezing *and* has a fever? - -$$ P(\text{sneeze}) = P(\text{sneeze} \mid \text{cold})P(\text{cold}) + P(\text{sneeze} \mid \text{allergies})P(\text{allergies})$$ - -$$\ \ \ \ - P(\text{sneeze} \mid \text{allergies AND cold})P(\text{allergies AND cold})$$ - -$$ (1)(0.2) + (1)(0.3) - (1)(0.2)(0.3) = 0.44 $$ - -$$P(\text{sneeze AND fever}) = P(\text{sneeze} \mid \text{cold})P(\text{fever} \mid \text{cold})P(\text{cold}) + P(\text{sneeze} \mid \text{cold})P(\text{fever} \mid \text{cold})P(\text{cold})$$ - -$$= (1)(1)(0.2) + (1)(0)(0.3) = 0.2$$ - -### b) - -> Inspect the joint probability distributions of `sneeze` and `fever` using `Infer`. 
- -~~~~ -Infer({method: "forward", samples: 1000}, function() { - var allergies = flip(0.3) - var cold = flip(0.2) - - var sneeze = cold || allergies - var fever = cold - - // a list would also be fine here - return {"sneeze": sneeze, "fever": fever} -}) -~~~~ - -### c) - -> If we wanted to represent the diseases of many patients we might have tried to make each disease and symptom into a function from a person to whether they have that disease, like this: - -~~~~ -Infer({method: "forward", samples: 1000}, function() { - var allergies = mem(function(person) { return flip(.3) }) - var cold = mem(function(person) { return flip(.2) }) - - var sneeze = function(person) { return cold(person) || allergies(person) } - var fever = function(person) { return cold(person) } - - return {"sneeze": sneeze('bob'), "fever": fever('bob')} -}) -~~~~ - -> Add `fever` to the program above, and use `Infer` to inspect the probability distribution over Bob's symptoms. -> Is this the same probability distribution that you computed for the single patient in Part A? -> If not, what can you do to fix this program to capture our intuitions correctly? - -We need `mem` on the diseases. Otherwise the two symptoms will be calculated using different values for the diseases, and that wouldn't make sense. - -## Exercise 5 - -> Work through the evaluation process for the `bend` higher-order function in this example: - -~~~~ -var makeCoin = function(weight) { - return function() { - return flip(weight) ? 'h' : 't' - } -} -var bend = function(coin) { - return function() { - return coin() == 'h' ? makeCoin(.7)() : makeCoin(.1)() - } -} - -var fairCoin = makeCoin(.5) -var bentCoin = bend(fairCoin) -~~~~ - -### a) - -> Directly compute the probability distribution of the bent coin in the example. - -$$(0.5)(0.7) + (0.5)(0.1) = 0.4$$ - -### b) - -> Check your answer by using `Infer`. - -~~~~ -var makeCoin = function(weight) { - return function() { - return flip(weight) ? 
'h' : 't' - } -} -var bend = function(coin) { - return function() { - return coin() == 'h' ? makeCoin(.7)() : makeCoin(.1)() - } -} - -var fairCoin = makeCoin(.5) -var bentCoin = bend(fairCoin) - -Infer({method: 'forward', samples: 10000}, bentCoin) -~~~~ - -## Exercise 6 - -### a) - -> Directly compute the probability that the geometric distribution defined by the following stochastic recursion returns the number 5. -> *Hint:* What is the default parameter for `flip()`? - -~~~~ -var geometric = function() { - return flip() ? 0 : 1 + geometric() -} -~~~~ - -$$(0.5^5)(0.5) = 0.015625$$ - -### b) - -> Check your answer by using `Infer`. - -~~~~ -var geometric = function() { - return flip() ? 0 : 1 + geometric() -} -Infer({method: "forward", samples:10000}, geometric) -~~~~ - -## Exercise 7 - -### a) - -> Convert the following probability table to a compact WebPPL program: -> -> | A | B | P(A,B) | -> | --- | --- | --- | -> | F | F | 0.14 | -> | F | T | 0.06 | -> | T | F | 0.4 | -> | T | T | 0.4 | -> -> **Requirement:** fix the probability of A first and then define the probability of B to *depend* on whether A is true or not. - -~~~~ -var a = flip(0.8) -var b = flip(a ? 0.5 : 0.3) -display([a, b]) -~~~~ - -### b) - -> Run your WebPPL program and use `Infer` to check that you get the correct distribution. - -~~~~ -var fn = function() { - var a = flip(0.8) - var b = flip(a ? 0.5 : 0.3) - return [a, b] -} -Infer({method: "forward", samples: 10000}, fn) -~~~~ - -## Exercise 8 - -> Below we've defined a higher-order function `flipSequence` that takes a coin flipping function (e.g. `trickCoin`, below) and flips that coin until it gets a *sequence* of two heads in a row (in which case it returns heads `'h'`) or two tails in a row (in which case it returns tails `'t'`). -> Try out different weights for the `trickCoin`. - -~~~~ -var makeCoin = function(weight) { - return function() { - return flip(weight) ? 
'h' : 't' - } -} -var flipSequence = function(coin) { - return function() { - var flip1 = coin() - var flip2 = coin() - if (flip1 == flip2) { - return flip1 - } else { - return flipSequence(coin)() - } - } -} - -var trickCoin = makeCoin(.6) - -var n_samples = 10000; -viz(Infer({method: "forward", samples: n_samples}, trickCoin)) -viz(Infer({method: "forward", samples: n_samples}, flipSequence(trickCoin))) -~~~~ - -### a) - -> How does `flipSequence` change the distribution over return values (qualitatively)? -> Explain why requiring two flips in a row to be the same has this effect. - -The distribution becomes peakier. The higher probability outcome becomes even more probable and the lower probability outcome becomes even less probable. - -### b) - -> What would happen if a fair coin (with weight 0.5) were input to `flipSequence`? Explain. - -Nothing would change, it would be the same as `flip(0.5)`, because there's no asymmetry in the distribution to emphasize. - -## Exercise 9 - -> Box2D is a two-dimensional simulation engine for simulating rigid bodies (those with constant shape). -> It allows for the construction of arbitrary worlds and models important physical concepts including collisions, friction, gravity, momentum, and more. -> -> We have provided a wrapper around Box2D that allows for the easy construction of worlds. -> A world consists of list of shapes. -> Shapes are created by JavaScript objects with the following properties: -> -> |`shape` |"circle" or "rect" | -> |`dims` |[width, height] for rect or [radius] for circle | -> |`x` |x_position_as_number (distance from left) | -> |`y` |y_position_as_number (distance from top) | -> |`static` |boolean (does the object move or stay still?) | -> |`velocity` |[x_velocity, y_velocity] | -> -> The variables `worldWidth` and `worldHeight` are constants representing the visible size of the simulation window. -> -> Here's an example with a ground and a single rectangle. 
Add another object to `bowlingWorld` and give it an initial velocity so that it knocks the original rectangle down. - -~~~~ -var ground = { - shape: 'rect', - static: true, - dims: [worldWidth, 10], - x: worldWidth/2, - y: worldHeight -} - -var rect = { - shape: 'rect', - static: false, - dims: [10, 100], - x: worldWidth/2, - y: 390 -} - -var circle = { - shape: 'circle', - static: false, - dims: [30], - x: worldWidth/4, - y: worldHeight - 40, - velocity: [300, 0] -} - -var bowlingWorld = [ground, rect, circle] -physics.animate(1000, bowlingWorld) -~~~~ - -## Exercise 10 - -> In **Example: Intuitive physics** we modeled instability of a tower as the probability that the tower falls when perturbed, and we modeled "falling" as getting shorter. -> It would be reasonable to instead measure *how much shorter* the tower gets. - -### a) - -> Below, modify the stability/instability model by writing a continuous measure, `towerFallDegree`. -> Let this measure take different values between 0 and 1. -> That way, your continuous measure will be numerically comparable to the discrete measure, `doesTowerFall` (defined here as either 0 or 1). -> Explain what your continuous measure represents and why it might be a good continuous measure of instability. 
- -~~~~ -///fold: -var listMin = function(xs) { - if (xs.length == 1) { - return xs[0] - } else { - return Math.min(xs[0], listMin(rest(xs))) - } -} - -var highestY = function (w) { listMin(map(function(obj) { return obj.y }, w)) } -var ground = { - shape: 'rect', - static: true, - dims: [worldWidth, 10], - x: worldWidth / 2, - y: worldHeight + 6 -} - -var almostUnstableWorld = [ - ground, - {shape: 'rect', static: false, dims: [24, 22], x: 175, y: 473}, - {shape: 'rect', static: false, dims: [15, 38], x: 159.97995044874122, y: 413}, - {shape: 'rect', static: false, dims: [11, 35], x: 166.91912737427202, y: 340}, - {shape: 'rect', static: false, dims: [11, 29], x: 177.26195677111082, y: 276}, - {shape: 'rect', static: false, dims: [11, 17], x: 168.51354470809122, y: 230} -] - -var noisify = function (world) { - var perturbX = function (obj) { - var noiseWidth = 10 - if (obj.static) { - return obj - } else { - return _.extend({}, obj, {x: uniform(obj.x - noiseWidth, obj.x + noiseWidth)}) - } - } - map(perturbX, world) -} - -/// - -// Returns height of tower -var getTowerHeight = function(world) { - return worldHeight - highestY(world) -} - -var doesTowerFall = function (initialW, finalW) { - var approxEqual = function (a, b) { Math.abs(a - b) < 1.0 } - return 1 - approxEqual(highestY(initialW), highestY(finalW)) -} - -var towerFallDegree = function(initialW, finalW) { - var initialHeight = getTowerHeight(initialW) - var finalHeight = getTowerHeight(finalW) - return (initialHeight - finalHeight) / initialHeight -} - -var visualizeInstabilityMeasure = function(measureFunction) { - var initialWorld = noisify(almostUnstableWorld) - var finalWorld = physics.run(1000, initialWorld) - var measureValue = measureFunction(initialWorld, finalWorld) - print("Instability measure: " + measureValue) - print("Initial height: " + getTowerHeight(initialWorld)) - print("Final height: " + getTowerHeight(finalWorld)) - physics.animate(1000, initialWorld) -} - -// Test binary 
doesTowerFall measure -// visualizeInstabilityMeasure(doesTowerFall) - -// Test custom towerFallDegree measure -visualizeInstabilityMeasure(towerFallDegree) -~~~~ - -This is the percent difference in height from before and after we introduce gravity. -The higher the final tower, the more stable the original tower was. -This is good in that it reflects the idea that even if one block falls off, if the rest of the tower stays, then the structure as a whole was probably stable. - -### b) - -> Describe a tower with a very different doesTowerFall and towerFallDegree measures look like. -> Which measure captures the meaning of "unstable" better? - -~~~~ -///fold: -var listMin = function(xs) { - if (xs.length == 1) { - return xs[0] - } else { - return Math.min(xs[0], listMin(rest(xs))) - } -} - -var highestY = function (w) { listMin(map(function(obj) { return obj.y }, w)) } -var ground = { - shape: 'rect', - static: true, - dims: [worldWidth, 10], - x: worldWidth/2, - y: worldHeight+6 -} - -var getTowerHeight = function(world) { - return worldHeight - highestY(world) -} - -var doesTowerFall = function (initialW, finalW) { - var approxEqual = function (a, b) { Math.abs(a - b) < 1.0 } - return 1 - approxEqual(highestY(initialW), highestY(finalW)); -} - -var towerFallDegree = function(initialW, finalW) { - var initialHeight = getTowerHeight(initialW) - var finalHeight = getTowerHeight(finalW) - return (initialHeight - finalHeight) / initialHeight -} -/// - -var initialWorld = [ - ground, - {shape: 'rect', static: false, dims: [100, 22], x: 175, y: 473}, - {shape: 'rect', static: false, dims: [80, 38], x: 159.97995044874122, y: 413}, - {shape: 'rect', static: false, dims: [60, 35], x: 166.91912737427202, y: 340}, - {shape: 'rect', static: false, dims: [40, 29], x: 177.26195677111082, y: 276}, - {shape: 'rect', static: false, dims: [20, 17], x: 120, y: 230} -] - -var finalWorld = physics.run(1000, initialWorld); - -physics.animate(1000, initialWorld); -print("doesTowerFall: " 
+ doesTowerFall(initialWorld, finalWorld)) -print("towerFallDegree: " + towerFallDegree(initialWorld, finalWorld)) -~~~~ - -A tower where only one block falls off only changes a little. -The doesTowerFall makes no distinction between a tower that partially falls and completely falls whereas the towerFallsDegree does. -In the example world provided, the doesTowerFall is 1 whereas the towerFallsDegree is only 0.172, so the new metric matches intuition about this tower better. - diff --git a/solutions/hierarchical-models.md b/solutions/hierarchical-models.md deleted file mode 100644 index 09684c9..0000000 --- a/solutions/hierarchical-models.md +++ /dev/null @@ -1,457 +0,0 @@ ---- -layout: exercise -title: Hierarchical models - solutions -description: The power of abstraction. ---- - -## Exercise 1: Pseudocounts and the Dirichlet distribution - -> In the Bayesian Data Analysis exercises, we explored the Beta distribution by varying its parameters. -> The Dirichlet is a generalization of the Beta distribution to more than two categories -> (see [Appendix](http://probmods.org/chapters/appendix-useful-distributions.html)) -> Instead of Beta parameters $$(a, b)$$ governing the probabilities of two categories $$(false/true)$$, -> the Dirichlet parameter $$\alpha = [\alpha_1, \alpha_2, ..., \alpha_n]$$ controls the probabilities over categories $$[A_1, A_2, ..., A_n]$$. -> In other words, different choices of $$\alpha$$ correspond to different ways of distributing the prior probability mass over the $$N-1$$ simplex. -> -> In this exercise, we will explore a particularly intuitive way of understanding the $$\alpha$$ parameter as pseudocounts, or virtual observations. -> That is, if $$\alpha = [2, 2, 1]$$, that is the equivalent of having already observed the first category and second category twice each, and the third category one time only. 
-> -> Complete the code below to show that setting $$\alpha = [2, 3, 1, 1, 1]$$ is equivalent to setting $$\alpha = [1, 1, 1, 1, 1]$$, then observing the first category once and the second category twice. - - -~~~~ -var colors = ['black', 'blue', 'green', 'orange', 'red']; - -var observedData = [{bag: 'bag1', draw: 'blue'}, - {bag: 'bag1', draw: 'blue'}, - {bag: 'bag1', draw: 'black'}]; - -// first model: set alpha = [1, 1, 1, 1, 1] and observe `observedData` -var observed = Infer({method: 'MCMC', samples: 20000}, function() { - var makeBag = mem(function(bag) { - var colorProbs = dirichlet(ones([colors.length, 1])); - return Categorical({vs: colors, ps: colorProbs}); - }) - - var obsFn = function(datum) { - observe(makeBag(datum.bag), datum.draw); - } - - mapData({data: observedData}, obsFn); - - return {bag1: sample(makeBag('bag1'))}; -}) - - -// second model. Set alpha = [2, 3, 1, 1, 1] -var usealpha = Infer({method: 'MCMC', samples: 20000}, function () { - var makeBag = mem(function(bag) { - var colorProbs = dirichlet(Vector([2, 3, 1, 1, 1])); - return Categorical({vs: colors, ps: colorProbs}); - }) - - return {bag1: sample(makeBag('bag1'))}; -}) - -viz.marginals(observed); -viz.marginals(usealpha); -~~~~ - - -## Exercise 2: Rotten apples - -> On any given day, a given grocery store has some number of apples for sale. -> Some of these apples may be mushy or even rotten. -> The probability that each apple is rotten is not independent: a ripening fruit emits chemicals that cause other fruit to ripen as well. -> As they say, [one rotten apple spoils the whole barrel](https://idiomation.wordpress.com/2013/03/27/one-bad-apple-spoils-the-whole-barrel/). -> -> For each apple in a barrel, assume the probability that the apple is rotten is `flip(p)` where `p` is drawn from some prior. -> One appropriate prior distribution is Beta. -> Recall that the Beta distribution is just a Dirichlet that returns a vector of length one. 
-> Like the Dirichlet distribution, the Beta distribution is defined based on pseudocounts `[a, b]`. -> `Beta({a: 10, b: 2})` returns the equivalent of a Beta distribution conditioned on having previously seen 10 heads and 2 tails, while `[a,b]` values less than 1 concentrate mass at the endpoints. -> -> `Beta({a: .1, b: .2})` nicely captures our prior expectations about rotten apples: most of the time, the probability of a rotten apple is quite low. -The rest of the time, the probability is very high. -Middling probabilities are rare. - - -### Exercise 2.1 - -> Complete the function `makeBarrel` that returns a function `barrel`. -> `barrel` takes a single argument `n` and returns an array of `n` boolean values representing whether or not each of the `n` apples in the barrel is rotten. -> -> Use beta(.1, .2) as the prior for rottenness. - -~~~~ -var makeBarrel = mem(function(barrelName) { - var pRotten = beta({a: .1, b: .2}); - var barrel = function(n) { - return repeat(n, function() { flip(pRotten) }); - } - return barrel; -}); - -var post = Infer({method: 'forward'}, function() { - var barrel = makeBarrel('barrel'); - return Math.sum(barrel(10)); -}) -viz(post); -~~~~ - -### Exercise 2.1 - -> Some grocery stores have fresher produce than others. -> Complete the function `makeStore` that returns the `makeBarrel` function, which works as it did in part Exercise 2.1. -> Importantly, each store has its own Beta parameters `a` and `b` drawn from some prior. -> -> HINT: In order to maintain the likelihood either most of the apples in a barrel are rotten or few are, you need to ensure that `a < 1` and `b < 1`. -> However, if `a` is much larger than `b` (or vice versa), you will get extreme results with *every* apple being rotten or *every* apple being good. -> -> NOTE: No need to be overly fancy with this prior. Pick something simple that you know will give you what you want, e.g. 
-stores either have good barrels with few rotten apples or bad barrels with many rotten apples. - -~~~~ -var makeStore = mem(function(storeName) { - var storePrior = flip() ? {a: .1, b: .3} : {a: .3, b: .1}; - - var makeBarrel = mem(function(barrelName) { - var pRotten = beta(storePrior); - var barrel = function(n) { - return repeat(n, function() { flip(pRotten) }); - } - return barrel; - }) - - return makeBarrel; -}) - -display("Differences in number of rotten apples between 2 barrels from the same store."); -viz(Infer({method: 'forward', samples:10000}, function() { - var S = makeStore('S'); - var B1 = S('B1'); - var B2 = S('B2'); - return Math.abs(Math.sum(B1(10)) - Math.sum(B2(10))); -})) - -display("Differences in number of rotten apples between 2 barrels from different stores."); -viz(Infer({method: 'forward', samples:10000}, function() { - var S1 = makeStore('S1'); - var S2 = makeStore('S2'); - var B1 = S1('B1'); - var B2 = S2('B2'); - return Math.abs(Math.sum(B1(10)) - Math.sum(B2(10))); -})) -~~~~ - - -### Exercise 2.3 - -> Some cities are located in apple country and thus have more access to fresh apples. -> Most stores in those cities are going to mostly have good barrels with good apples. -> Other cities have less access to fresh apples, and so more of their stores will have bad barrels with rotten apples. -> -> In the code block below, create a `makeCity` function, which returns a `makeStore` function, which works as in (b). -> In (b), each store had a prior on `[a, b]`. -> Put a prior on *that* prior, such that cities either tend to have good stores or tend to have bad stores. -> -> HINT: Again, it is not necessary to have an overly fancy prior here. - - -~~~~ -var makeCity = mem(function(cityName){ - var cityPrior = beta({a: .25, b: .25}); - - var makeStore = mem(function(storeName) { - var storePrior = flip(cityPrior) ? 
{a: .1, b: .3} : {a: .3, b: .1}; - - var makeBarrel = mem(function(barrelName) { - var pRotten = beta(storePrior); - var barrel = function(n) { - return repeat(n, function() { flip(pRotten) }); - } - return barrel; - }) - - return makeBarrel; - }) - - return makeStore; -}) - -var C1 = makeCity("C1"); -var S1 = C1("S1"); -var B1 = S1("B1"); - -// repeat to see different kinds of cities -viz(Infer({method: 'forward'}, function(){ - return Math.sum(B1(20)) -})); -~~~~ - -### Exercise 2.4 - -> Suppose you go to a store in a city. -> The store has a barrel of 10 apples, 7 of which are rotten. -> You leave and go to another store in the same city. -> It also has a barrel with 10 apples. -> How many of these apples are likely to be rotten? - -~~~~ -var makeCity = mem(function(cityName){ - var cityPrior = beta({a: .25, b: .25}); - - var makeStore = mem(function(storeName) { - var storePrior = flip(cityPrior) ? {a: .1, b: .3} : {a: .3, b: .1}; - - var makeBarrel = mem(function(barrelName) { - var pRotten = beta(storePrior); - var barrel = function(n) { - return repeat(n, function() { flip(pRotten) }); - } - return barrel; - }) - - return makeBarrel; - }) - - return makeStore; -}) - -viz(Infer({method: 'MCMC', samples:5000, lag: 100}, function(){ - var C = makeCity("C"); - var S1 = C("S1"); - var B1 = S1("B1"); - var S2 = C("S2"); - var B2 = S2("B2"); - - condition(Math.sum(B1(10)) == 7); - - return Math.sum(B2(10)); -})); -~~~~ - -## Exercise 3: Hierarchical models for BDA - -> Imagine that you have conducted an experiment on word reading times to test the hypothesis that words starting with vowels take longer to read. -> Each data point includes whether the word starts with a vowel or a consonant, the word itself, the participant id, and the response time you measured ("rt"). -> A simple data analysis model attempts to infer the mean reading time for each word group, and returns the difference between the groups (a sort of Bayesian version of a t-test). 
-> -> Note that there is no cognitive model inside this BDA; it is directly modeling the data. - -~~~~ -var data = [{group: "vowel", word: "abacus", id: 1, rt: 210}, - {group: "vowel", word: "abacus", id: 2, rt: 212}, - {group: "vowel", word: "abacus", id: 3, rt: 209}, - {group: "vowel", word: "aardvark", id: 1, rt: 200}, - {group: "vowel", word: "aardvark", id: 2, rt: 201}, - {group: "vowel", word: "aardvark", id: 3, rt: 198}, - {group: "vowel", word: "ellipse", id: 1, rt: 220}, - {group: "vowel", word: "ellipse", id: 2, rt: 222}, - {group: "vowel", word: "ellipse", id: 3, rt: 219}, - - {group: "consonant", word: "proton", id: 1, rt: 190}, - {group: "consonant", word: "proton", id: 2, rt: 191}, - {group: "consonant", word: "proton", id: 3, rt: 189}, - {group: "consonant", word: "folder", id: 1, rt: 180}, - {group: "consonant", word: "folder", id: 2, rt: 182}, - {group: "consonant", word: "folder", id: 3, rt: 178}, - {group: "consonant", word: "fedora", id: 1, rt: 230}, - {group: "consonant", word: "fedora", id: 2, rt: 231}, - {group: "consonant", word: "fedora", id: 3, rt: 228}, - {group: "consonant", word: "fedora", id: 1, rt: 231}, - {group: "consonant", word: "fedora", id: 2, rt: 233}, - {group: "consonant", word: "fedora", id: 3, rt: 230}, - {group: "consonant", word: "fedora", id: 1, rt: 230}, - {group: "consonant", word: "fedora", id: 2, rt: 232}, - {group: "consonant", word: "fedora", id: 3, rt: 228}] - -var opts = {method: "MCMC", - burn: 10000, - lag: 5, - samples: 5000}; - -var post = Infer(opts, function() { - var groupMeans = {vowel: gaussian(200, 100), - consonant: gaussian(200, 100)}; - - var obsFn = function(d) { - //assume response times (rt) depend on group means with a small fixed noise: - observe(Gaussian({mu: groupMeans[d.group], sigma: 10}), d.rt); - } - - mapData({data: data}, obsFn); - - return groupMeans['vowel'] - groupMeans['consonant']; -}) - -print("vowel - consonant reading time:"); -viz(post); -print(expectation(post)); -~~~~ - -> 
This model concludes that consonants actually take significantly longer to read. -> However, looking at the data more closely, you may not trust this conclusion. -> It seems to be driven by a single outlier, the word "fedora"! - - -### Exercise 3.1 - -> Adjust the model to allow each word to have its own mean reading time that depends on the `groupMean`. -> This is called a hierarchical data analysis model. -> What do you conclude about vowel words vs. consonant words now? -> -> *Hints* -> 1. Memoize the word mean RT when sampling from the `groupMean`. -> 2. Consider how the model is sensitive to the different assumed variances (e.g. the fixed noise in the observe function we assume sigma=10). -> In particular, think about how this should affect how you choose a sigma for your word-level effects. -> -> Note: The individual word means are called *random effects* -- in a BDA, they are random variables -> (usually at the individual item or person level) that are not of interest by themselves. 
- -~~~~ -var data = [{group: "vowel", word: "abacus", id: 1, rt: 210}, - {group: "vowel", word: "abacus", id: 2, rt: 212}, - {group: "vowel", word: "abacus", id: 3, rt: 209}, - {group: "vowel", word: "aardvark", id: 1, rt: 200}, - {group: "vowel", word: "aardvark", id: 2, rt: 201}, - {group: "vowel", word: "aardvark", id: 3, rt: 198}, - {group: "vowel", word: "ellipse", id: 1, rt: 220}, - {group: "vowel", word: "ellipse", id: 2, rt: 222}, - {group: "vowel", word: "ellipse", id: 3, rt: 219}, - - {group: "consonant", word: "proton", id: 1, rt: 190}, - {group: "consonant", word: "proton", id: 2, rt: 191}, - {group: "consonant", word: "proton", id: 3, rt: 189}, - {group: "consonant", word: "folder", id: 1, rt: 180}, - {group: "consonant", word: "folder", id: 2, rt: 182}, - {group: "consonant", word: "folder", id: 3, rt: 178}, - {group: "consonant", word: "fedora", id: 1, rt: 230}, - {group: "consonant", word: "fedora", id: 2, rt: 231}, - {group: "consonant", word: "fedora", id: 3, rt: 228}, - {group: "consonant", word: "fedora", id: 1, rt: 231}, - {group: "consonant", word: "fedora", id: 2, rt: 233}, - {group: "consonant", word: "fedora", id: 3, rt: 230}, - {group: "consonant", word: "fedora", id: 1, rt: 230}, - {group: "consonant", word: "fedora", id: 2, rt: 232}, - {group: "consonant", word: "fedora", id: 3, rt: 228}] - -var opts = {method: "MCMC", - burn: 10000, - lag: 5, - samples: 5000}; - -var post = Infer(opts, function() { - var groupMeans = {vowel: gaussian(200, 100), - consonant: gaussian(200, 100)} - - var wordMean = mem(function(word, group) { - return gaussian(groupMeans[group], 20); - }) - - var obsFn = function(d) { - //assume response times (rt) depend on group means with a small fixed noise: - observe(Gaussian({mu: wordMean(d.word, d.group), - sigma: 10}), d.rt); - } - - mapData({data: data}, obsFn); - - return groupMeans['vowel'] - groupMeans['consonant']; -}) - -print("vowel - consonant reading time:"); -viz(post); -print(expectation(post)); -~~~~ - 
- -### Exercise 3.2 - -> Looking at the data further, you might notice that some participants in your experiment read slightly faster than others. -> Extend your model to also include an additional random effect of participant id, that is, an unknown (and not of interest) influence on reading time of the particular person. -> -> How does this affect your conclusion? -> Is your conclusion any stronger or weaker? - - -~~~~ -var data = [{group: "vowel", word: "abacus", id: 1, rt: 210}, - {group: "vowel", word: "abacus", id: 2, rt: 212}, - {group: "vowel", word: "abacus", id: 3, rt: 209}, - {group: "vowel", word: "aardvark", id: 1, rt: 200}, - {group: "vowel", word: "aardvark", id: 2, rt: 201}, - {group: "vowel", word: "aardvark", id: 3, rt: 198}, - {group: "vowel", word: "ellipse", id: 1, rt: 220}, - {group: "vowel", word: "ellipse", id: 2, rt: 222}, - {group: "vowel", word: "ellipse", id: 3, rt: 219}, - - {group: "consonant", word: "proton", id: 1, rt: 190}, - {group: "consonant", word: "proton", id: 2, rt: 191}, - {group: "consonant", word: "proton", id: 3, rt: 189}, - {group: "consonant", word: "folder", id: 1, rt: 180}, - {group: "consonant", word: "folder", id: 2, rt: 182}, - {group: "consonant", word: "folder", id: 3, rt: 178}, - {group: "consonant", word: "fedora", id: 1, rt: 230}, - {group: "consonant", word: "fedora", id: 2, rt: 231}, - {group: "consonant", word: "fedora", id: 3, rt: 228}, - {group: "consonant", word: "fedora", id: 1, rt: 231}, - {group: "consonant", word: "fedora", id: 2, rt: 233}, - {group: "consonant", word: "fedora", id: 3, rt: 230}, - {group: "consonant", word: "fedora", id: 1, rt: 230}, - {group: "consonant", word: "fedora", id: 2, rt: 232}, - {group: "consonant", word: "fedora", id: 3, rt: 228}] - -var opts = {method: "MCMC", - burn: 10000, - lag: 5, - samples: 5000}; - -var post = Infer(opts, function() { - var groupMeans = {vowel: gaussian(200, 100), - consonant: gaussian(200, 100)} - - var participantMean = mem(function(pid) { - 
return gaussian(0, 2); - }) - - var wordMean = mem(function(word, group) { - return gaussian(groupMeans[group], 20); - }) - - var obsFn = function(d) { - //assume response times (rt) depend on group means with a small fixed noise: - observe(Gaussian({mu: wordMean(d.word, d.group) + participantMean(d.id), - sigma: 10}), d.rt); - } - - mapData({data: data}, obsFn); - - return {diff: groupMeans['vowel'] - groupMeans['consonant'], - p1: participantMean(1), - p2: participantMean(2), - p3: participantMean(3)} -}) - -print("vowel - consonant reading time:"); -var diff = marginalize(post, function(x) { x.diff }); -viz(diff); -print(expectation(diff)); - -print("Participant 1"); -var p1 = marginalize(post, function(x) { x.p1 }); -viz(p1); -print(expectation(p1)); - -print("Participant 2"); -var p2 = marginalize(post, function(x) { x.p2 }); -viz(p2); -print(expectation(p2)); - -print("Participant 3"); -var p3 = marginalize(post, function(x) { x.p3 }); -viz(p3); -print(expectation(p3)); -~~~~ - -This should make the conclusion stronger since more of the variance is accounted for by the model. \ No newline at end of file diff --git a/solutions/inference-algorithms.md b/solutions/inference-algorithms.md deleted file mode 100644 index 2ab8765..0000000 --- a/solutions/inference-algorithms.md +++ /dev/null @@ -1,661 +0,0 @@ ---- -layout: exercise -title: Algorithms for Inference - solutions -description: MCMC, etc. -custom_js: -- assets/js/custom.js ---- - -## Exercise 1. Sampling Implicit Curves - -In the code box below, the `curve` function defines a vaguely heart-shaped curve. Below, we use rejection sampling to sample points along the boundary of the curve. 
- -~~~~ -// takes z = 0 cross section of heart surface to some tolerance -// see http://mathworld.wolfram.com/HeartSurface.html -var onCurve = function(x, y) { - var x2 = x*x; - var term1 = y - Math.pow(x2, 1/3); - var crossSection = x2 + term1*term1 - 1; - return Math.abs(crossSection) < 0.01; -}; -var xbounds = [-1, 1]; -var ybounds = [-1, 1.6]; - -var xmu = 0.5 * (xbounds[0] + xbounds[1]); -var ymu = 0.5 * (ybounds[0] + ybounds[1]); -var xsigma = 0.5 * (xbounds[1] - xbounds[0]); -var ysigma = 0.5 * (ybounds[1] - ybounds[0]); - -var model = function() { - var x = gaussian(xmu, xsigma); - var y = gaussian(ymu, ysigma); - condition(onCurve(x, y)); - return {x: x, y: y}; -}; - -var post = Infer({method: 'rejection', samples: 1000}, model); -viz.auto(post); -~~~~ - -### a) - -> Try using MCMC with Metropolis-Hastings instead of rejection sampling. -> You'll notice that it does not fare as well as rejection sampling. Why not? - -~~~~ -///fold: -var onCurve = function(x, y) { - var x2 = x*x; - var term1 = y - Math.pow(x2, 1/3); - var crossSection = x2 + term1*term1 - 1; - return Math.abs(crossSection) < 0.01; -}; -var xbounds = [-1, 1]; -var ybounds = [-1, 1.6]; - -var xmu = 0.5 * (xbounds[0] + xbounds[1]); -var ymu = 0.5 * (ybounds[0] + ybounds[1]); -var xsigma = 0.5 * (xbounds[1] - xbounds[0]); -var ysigma = 0.5 * (ybounds[1] - ybounds[0]); - -var model = function() { - var x = gaussian(xmu, xsigma); - var y = gaussian(ymu, ysigma); - condition(onCurve(x, y)); - return {x: x, y: y}; -}; -/// - -var post = Infer({method: 'MCMC', - samples: 10000, - lag: 10}, model); -viz.auto(post); -~~~~ - -Once the MH algorithm finds a state with reasonable probability, its proposals will fix one variable and try to change the other. -Since any proposals along straight vertical or horizontal lines are going to be states with much lower probability (almost every state is very low probability in this model), it is going to get stuck in a local optimum and rarely sample new states. 
-In contrast, every accepted sample in rejection sampling is likely to be unique. - - -### b) - -> Change the *model* to make MH successfully trace the curves. -> Your solution should result in a graph that clearly traces a heart-shaped figure -- though it need not do quite as well as rejection sampling. -> Why does this work better? - -> You may find the following piece of code useful. - -~~~~ -var a = diagCovGaussian({mu: Vector([0, 100]), - sigma: Vector([1, 10])}); -display(T.get(a, 0)); -display(T.get(a, 1)); -~~~~ - -~~~~ -///fold: -var onCurve = function(x, y) { - var x2 = x*x; - var term1 = y - Math.pow(x2, 1/3); - var crossSection = x2 + term1*term1 - 1; - return Math.abs(crossSection) < 0.01; -}; -var xbounds = [-1, 1]; -var ybounds = [-1, 1.6]; - -var xmu = 0.5 * (xbounds[0] + xbounds[1]); -var ymu = 0.5 * (ybounds[0] + ybounds[1]); -var xsigma = 0.5 * (xbounds[1] - xbounds[0]); -var ysigma = 0.5 * (ybounds[1] - ybounds[0]); -/// - -var model = function() { - var xy = diagCovGaussian({mu: Vector([xmu, xsigma]), - sigma: Vector([ymu, ysigma])}); - var x = T.get(xy, 0); - var y = T.get(xy, 1); - condition(onCurve(x, y)); - return {x: x, y: y}; -}; - -var post = Infer({method: 'MCMC', - samples: 1000, - lag: 100}, model); -viz.auto(post); -~~~~ - -This model *jointly* samples x and y which allows us to better model their dependence. -Note that this still requires many, many more samples than does rejection sampling, and provides less accurate results. - -### Exercise 1.3 - -> Using the original model (not the modified one in 1.2), change the inference *algorithm* to HMC to successfully trace the curves. -> What parameters work best? -> *Why* does this inference algorithm work better than MH? - -> HINT: start with the default parameters specified in the HMC [docs](https://webppl.readthedocs.io/en/master/inference/methods.html#mcmc) and play with different values. 
- -~~~~ -///fold: -var onCurve = function(x, y) { - var x2 = x*x; - var term1 = y - Math.pow(x2, 1/3); - var crossSection = x2 + term1*term1 - 1; - return Math.abs(crossSection) < 0.01; -}; -var xbounds = [-1, 1]; -var ybounds = [-1, 1.6]; - -var xmu = 0.5 * (xbounds[0] + xbounds[1]); -var ymu = 0.5 * (ybounds[0] + ybounds[1]); -var xsigma = 0.5 * (xbounds[1] - xbounds[0]); -var ysigma = 0.5 * (ybounds[1] - ybounds[0]); - -var model = function() { - var x = gaussian(xmu, xsigma); - var y = gaussian(ymu, ysigma); - condition(onCurve(x, y)); - return {x: x, y: y}; -}; -/// - -var opts = {method: 'MCMC', - samples: 10000, - callbacks: [MCMC_Callbacks.finalAccept], - kernel: {HMC : { steps: 10, stepSize: .5 }} } -var post = Infer(opts, model); -viz.auto(post); -~~~~ - -Steps 10 and stepSize 0.5 gave good results. -HMC works better in this case than MH because HMC makes proposals to all the variables at once using gradients to go "in a good direction". -The single-site MH in WebPPL makes individual proposals to each random variable, so when the proposals are strongly correlated (a posteriori), they mostly get rejected. -Even making MH proposals to all the variables at once, without gradients, it is very unlikely that the correlated variables will jointly move in the "right directions". - - -## Exercise 2. Properties and pitfalls of Metropolis-Hastings - -> Consider a very simple function that interpolates between two endpoints. - -> Suppose one endpoint is fixed at `-10`, but we have uncertainty over the value of the other endpoint and the interpolation weight between them. -> By conditioning on the resulting value being close to 0, we can infer what the free variables must have been. 
- -~~~~ -var interpolate = function(point1, point2, interpolationWeight) { - return (point1 * interpolationWeight + - point2 * (1 - interpolationWeight)); -} - -var model = function(){ - var point1 = -10; - var point2 = uniform(-100, 100); - var interpolationWeight = uniform(0, 1); - var pointInMiddle = interpolate(point1, point2, interpolationWeight); - observe(Gaussian({mu: 0, sigma:0.1}), pointInMiddle); - return {point2, interpolationWeight, pointInMiddle}; -} - -var posterior = Infer({method: 'MCMC', samples: 5000, lag: 100}, model); -var samples = posterior.samples; -viz(marginalize(posterior, function(x) { x.pointInMiddle })); - -// Store these for future use -editor.put("posterior", posterior); -editor.put("samples", samples); -~~~~ - -> By looking at the marginal distribution of `pointInMiddle`, we can see that `Infer()` successfully finds values of `point2` and `interpolationWeight` that satisfy our condition. - -### Exercise 2.1 - -> Visualize the separate marginal distributions of `point2` and `interpolationWeight`. -> How would you describe their shapes, compared to the marginal distribution of `pointInMiddle`? - -> HINT: use the [marginalize](http://docs.webppl.org/en/master/functions/other.html#marginalize) helper to elegantly construct these marginal distributions - -~~~~ -var posterior = editor.get("posterior"); -viz(marginalize(posterior, function(x) {return x.point2})); -viz(marginalize(posterior, function(x) {return x.interpolationWeight})); -~~~~ - -Whereas `pointInMiddle` is peaked around 0, -`point2` and `interpolationWeight` appear to be multimodal. - - -### Exercise 2.2 - -Visualize the *joint* marginal distribution of point2 and interpolationWeight. -What does this tell you about their dependence? - -~~~~ -var posterior = editor.get("posterior"); -viz(marginalize(posterior, function(x) { - return {'point2': x.point2, 'inter': x.interpolationWeight}; -})); -~~~~ - -Both variables have a close dependence. 
-If `point2` is large, `interpolation` weight needs to also be large to in order to bring the -interpolation point to 0. - -### Exercise 2.3 - -WebPPL also exposes the list of MCMC samples that the density plots above are built from. -This is saved in `posterior.samples`. -Set `samples = 100` and `lag = 0`, then plot `pointInMiddle` as a function of the sample number. -Run this several times to get a feel for the shape of this curve. -What do you notice about the samples generated by MCMC? - -HINT: this will require some 'data munging' on the array of samples. -Some useful functions will be [`map`](http://docs.webppl.org/en/master/functions/arrays.html#map), `_.range()`, and `viz.line` which takes arrays `x` and `y`. - -~~~~ -///fold: -var interpolate = function(point1, point2, interpolationWeight) { - return (point1 * interpolationWeight + - point2 * (1 - interpolationWeight)); -} - -var model = function(){ - var point1 = -10; - var point2 = uniform(-100, 100); - var interpolationWeight = uniform(0, 1); - var pointInMiddle = interpolate(point1, point2, interpolationWeight); - observe(Gaussian({mu: 0, sigma:0.1}), pointInMiddle); - return {point2, interpolationWeight, pointInMiddle}; -} -/// - -var posterior = Infer({method: 'MCMC', samples: 100, lag: 0}, model); -var samples = map(function(d) { d["value"]["pointInMiddle"] }, posterior.samples); -viz.line(_.range(samples.length), samples); -~~~~ - -The starting point of our chain is highly variable in for the first few samples before it converges to around 0, our observed `pointInMiddle`. -This is because our MCMC chain is initialized randomly and needs iterations reach a steady state. -One way to fix this is to add a burn-in parameter, telling the MCMC sampler to throw away these early samples. - -### Exercise 2.4 - -Rewrite the code to use rejection sampling. -Note that you will need to find a way to turn the `observe` statement into a `condition` statement (Hint: See Exercise #1). 
-Is using rejection sampling here a good idea? -Why or why not? - -~~~~ -///fold: -var interpolate = function(point1, point2, interpolationWeight) { - return (point1 * interpolationWeight + - point2 * (1 - interpolationWeight)); -} -/// - -var model = function(){ - var point1 = -10; - var point2 = uniform(-100, 100); - var interpolationWeight = uniform(0, 1); - var pointInMiddle = interpolate(point1, point2, interpolationWeight); - condition(Math.abs(pointInMiddle) < 0.01) - return {point2, interpolationWeight, pointInMiddle}; -} - -viz.marginals(Infer({method: 'rejection', samples: 1000}, model)); -~~~~ - -Rejection sampling doesn't work well here because the range of `point2` is very wide [-100, 100], -so the proposed samples are almost always rejected. - - -### Exercise 2.5 - -> Using `verbose: true` in our `MH` algorithm, we can observe the proportion of proposals actually accepted. -> What is the acceptance rate over time and what about the model puts it at this level? - -> Consider the list of built-in drift kernels [here](https://webppl.readthedocs.io/en/master/driftkernels.html?highlight=drift%20kernel#helpers). -> Which of these would be appropriate to use in your model in place of the current uniform prior from which `point2` is sampled? -> Replace `uniform(-100, 100)` with a drift kernel and adjust the `width` parameter to raise the acceptance rate. -> Why does using this drift kernel influence the acceptance rate? -> What is a drawback of this approach? 
- -~~~~ -///fold: -var interpolate = function(point1, point2, interpolationWeight) { - return (point1 * interpolationWeight + - point2 * (1 - interpolationWeight)); -} -/// - -var model = function(){ - var point1 = -10; - var point2 = uniformDrift({a: -100, b: 100, width: .1}); - var interpolationWeight = uniform(0, 1); - var pointInMiddle = interpolate(point1, point2, interpolationWeight); - observe(Gaussian({mu: 0, sigma:0.1}), pointInMiddle); - return {point2, interpolationWeight, pointInMiddle}; -} - -var posterior = Infer({method: 'MCMC', - samples: 500, - verbose: true}, model); -~~~~ - -Using a drift kernel like uniformDrift means that we will sample proposals from distributions centered at the previous value of our random choice. -This produces a random walk that allows MH to more efficiently explore areas of high probability. -We notice that the acceptance on average is about an order of magnitude larger when using a width of 0.1! -One drawback of this approach is that we will get a much narrower set of samples. - - diff --git a/solutions/learning-as-conditional-inference.md b/solutions/learning-as-conditional-inference.md deleted file mode 100644 index 83270c5..0000000 --- a/solutions/learning-as-conditional-inference.md +++ /dev/null @@ -1,275 +0,0 @@ ---- -layout: exercise -title: Learning as conditional inference - solutions ---- - -## Exercise 1 - -### Exercise 1.1 - -> Recall our final coin weight model, "fair-vs-uniform", in which the coin weight was either 0.5 with high probability or drawn from a uniform distribution otherwise. -> This implies that a two-faced coin (always heads) is equally likely as a 70% heads coin. -> Intuitively you might be inclined to think that a two-faced coin is easier to make, and thus more likely. -> Adjust the model to express a prior where 90% of biased coins are always heads. 
- -~~~~ -var weightPosterior = function(observedData) { - return Infer({method: 'MCMC', burn:1000, samples: 10000}, function() { - var isFair = flip(0.9); - var isTwoFaced = flip(0.7); - var realWeight = isFair ? 0.5 : (isTwoFaced ? 1 : uniform({a:0, b:1})); - var coin = Bernoulli({p: realWeight}); - var obsFn = function(datum) { observe(coin, datum=='h') }; - mapData({data: observedData}, obsFn); - return realWeight; - }) -} - -var fullDataSet = repeat(50, function() { 'h' }); -var observedDataSizes = [0,1,2,4,6,8,10,12,15,20,25,30,40,50]; -var estimates = map(function(N) { expectation(weightPosterior(fullDataSet.slice(0, N))) }, observedDataSizes); -viz.line(observedDataSizes, estimates); -~~~~ - -### Exercise 1.2 - -How does your solution behave differently than the fair-vs-uniform model from the chapter? -Find a data set such that the learning curves are qualitatively different. - -~~~~ -var fairVsUniform = function(observedData){ - return Infer({method: 'MCMC', burn: 10000, samples: 10000}, function() { - var isFair = flip(0.9); - var realWeight = isFair ? 0.5 : uniform({a:0, b:1}); - var coin = Bernoulli({p: realWeight}); - var obsFn = function(datum){ observe(coin, datum=='h') }; - mapData({data: observedData}, obsFn); - return realWeight; - }) -} - -var fairVsTfVsUniform = function(observedData) { - return Infer({method: 'MCMC', burn: 10000, samples: 10000}, function() { - var isFair = flip(0.9); - var isTwoFaced = flip(0.9); - var realWeight = isFair ? 0.5 : (isTwoFaced ? 
1 : uniform({a:0, b:1})); - var coin = Bernoulli({p: realWeight}); - var obsFn = function(datum) { observe(coin, datum=='h') }; - mapData({data: observedData}, obsFn); - return realWeight; - }) -} - -var fullDataSet = ['h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 't', 't', - 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 't', 't', - 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 't', 't', - 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 't', 't', - 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 't', 't']; -var observedDataSizes = [0,1,2,4,6,8,10,12,15,20,25,30,40,50]; -var fvuEstimates = map(function(N) { expectation(fairVsUniform(fullDataSet.slice(0, N))) }, - observedDataSizes); -var fvtfvuEstimates = map(function(N) { expectation(fairVsTfVsUniform(fullDataSet.slice(0, N))) }, - observedDataSizes); -viz.line(observedDataSizes, fvuEstimates); -viz.line(observedDataSizes, fvtfvuEstimates); -~~~~ - -Here, we see that the fair-vs-twofaced-vs-uniform model quickly abandons the 50% hypothesis and latches onto the 100%, whereas the fair-vs-uniform model more gradually increases to 80%. -As soon as a single `t` is encountered, the fair-vs-twofaced-vs-uniform model immediately abandons the 100% hypothesis and drops back to 50% before climbing to 80%. - -## Exercise 2: The strength of beliefs - -> In the chapter, we observed how the model's best guess about the weight of the coin changed across a sequence of successive heads. -> See what happens if instead we see heads and tails in alternation. 
- -~~~~ -var pseudoCounts = {a: 10, b: 10}; - -var weightPosterior = function(observedData){ - return Infer({method: 'MCMC', burn:1000, samples: 1000}, function() { - var coinWeight = sample(Beta(pseudoCounts)); - var coinDist = Bernoulli({p: coinWeight}); - var obsFn = function(datum){ observe(coinDist, datum=='h') }; - mapData({data: observedData}, obsFn); - return coinWeight; - }) -} - -var fullDataSet = repeat(50, function() { ['h', 't'] }).flat(); -var observedDataSizes = [0,2,4,6,8,10,20,30,40,50,70,100]; -var estimates = map(function(N) { expectation(weightPosterior(fullDataSet.slice(0,N))) }, observedDataSizes); -viz.line(observedDataSizes, estimates); -~~~~ - -> It looks like we haven't learned anything! -> Since our best estimate for the coin's weight was 0.5 *prior* to observing anything, our best estimate, the maximum a posteriori (MAP), is hardly going to change when we get data consistent with that prior. - -### Exercise 2.1 - -> Modify the code below to see whether our posterior *distribution* is at all changed by observing this data set. -> Compare the prior and the posterior after all 100 observations. -> What are some similarities and differences? -> Why does this occur? 
- -~~~~ -var pseudoCounts = {a: 10, b: 10}; - -var weightPosterior = function(observedData){ - return Infer({method: 'MCMC', burn:1000, samples: 1000}, function() { - var coinWeight = sample(Beta(pseudoCounts)); - var coinDist = Bernoulli({p: coinWeight}); - var obsFn = function(datum){ observe(coinDist, datum=='h') }; - mapData({data: observedData}, obsFn); - return coinWeight; - }) -} - -var fullDataSet = repeat(50, function() { ['h', 't'] }).flat(); - -var prior = Beta(pseudoCounts); -var post = weightPosterior(fullDataSet); - -display("Prior distribution"); -viz(prior); -display("Posterior distribution"); -viz(post); -~~~~ - -The general shape of the prior and posterior are similar, but the posterior distribution is much narrower and taller as indicated by the smaller x-axis and larger y-axis. -This happens because as we observe more data, we become increasingly confident that the true mean is close to 50%. - - -### Exercise 2.2 - -> This time, let's see how our belief distribution changes as more data are observed in. -> Although entropy would be a good measure here, calculating entropy for a Beta distribution is [somewhat involved](https://en.wikipedia.org/wiki/Beta_distribution#Quantities_of_information_(entropy)). - -> An alternative we can use is variance: the expected squared difference between a sample from the distribution and the distribution mean. -> This doesn't take into account the shape of the distribution, and so it won't give us quite what we want if the distribution is non-symmetric; but it is a reasonable first try. - -> Modify the code to see how the variance changes as more data are observed. - -> HINT: `expectation` can take an optional function parameter. 
For example: -~~~~norun -expectation(Categorical({ps: [.2, .8], vs: [0, 1]}), function(x) { 2*x }); -~~~~ - -~~~~ -var pseudoCounts = {a: 10, b: 10}; - -var weightPosterior = function(observedData){ - return Infer({method: 'MCMC', burn:1000, samples: 1000}, function() { - var coinWeight = sample(Beta(pseudoCounts)); - var coinDist = Bernoulli({p: coinWeight}); - var obsFn = function(datum){ observe(coinDist, datum=='h') }; - mapData({data: observedData}, obsFn); - return coinWeight; - }) -} - -var fullDataSet = repeat(256, function(){['h', 't']}).flat() -var observedDataSizes = [0,2,4,8,16,32,64,128,256,512]; -var variances = map(function(N) { - var posterior = weightPosterior(fullDataSet.slice(0,N)); - var mean = expectation(posterior); - var variance = expectation(posterior, function(x) { Math.pow(x - mean, 2) }); - return variance -}, observedDataSizes) - -viz.line(observedDataSizes, variances); -~~~~ - -The variance decreases as we observe more data. - -## 3. Causal Power - -> Consider our model of causal power from the chapter. - -~~~~ -var causalPowerModel = function(observedData) { - // Causal power of C to cause E - var cp = uniform(0, 1); - - // Background probability of E - var b = uniform(0, 1); - - mapData({data: observedData}, function(datum) { - // The noisy causal relation to get E given C - var E = (datum.C && flip(cp)) || flip(b); - condition(E == datum.E); - }) - - return {causal_power: cp, background: b}; -} - -var observedData = [{C: true, E: false}]; -var posterior = Infer({method: 'MCMC', samples: 10000, lag:2}, - function() { causalPowerModel(observedData) }) -viz.marginals(posterior); -~~~~ - -> For each list item, find a set of `observedData` that produce the following properties. -Then explain intuitively why the data produce these results. -> -> 1. High causal power for C and low background probability of E. -> 2. Low causal power for C and high background probability of E. -> 3. 
High causal power for C and high background probability of E. -> 4. C is present at least 5 times, E is present each time C is present, and C does not have high causal power. - -~~~~ -///fold: -var causalPowerModel = function(observedData) { - // Causal power of C to cause E - var cp = uniform(0, 1); - - // Background probability of E - var b = uniform(0, 1); - - mapData({data: observedData}, function(datum) { - // The noisy causal relation to get E given C - var E = (datum.C && flip(cp)) || flip(b); - condition(E == datum.E); - }) - - return {causal_power: cp, background: b}; -} -/// - -var observedDataA = [{C: true, E: true}, - {C: true, E: true}, - {C: true, E: true}, - {C: false, E: false}, - {C: false, E: false}, - {C: false, E: false}]; -var observedDataB = [{C: true, E: true}, - {C: true, E: true}, - {C: true, E: false}, - {C: false, E: true}, - {C: false, E: true}, - {C: false, E: true}]; -var observedDataC = [{C: true, E: true}, - {C: true, E: true}, - {C: true, E: true}, - {C: false, E: true}, - {C: false, E: true}, - {C: false, E: false}]; -var observedDataD = [{C: true, E: true}, - {C: true, E: true}, - {C: true, E: true}, - {C: true, E: true}, - {C: true, E: true}].concat(repeat(20, function() { [{C: false, E: true}] })); - -var posterior = Infer({method: 'MCMC', samples: 10000, lag:2}, - function() { causalPowerModel(observedDataA) }); -viz.marginals(posterior); -~~~~ - -1. Since we never observe E outside the context of C, we infer that E has a low base rate. - Since E always occurs when C does, we infer that C has a high causal power. -2. Since we observe E regardless of whether or not C occurs, we infer that E has a high base rate and C has low causal power. - The single observation that E does not occur when C occurs drastically diminishes C's causal power. -3. Since we often observe E even when C does not occur, we infer that E has a high base rate. 
- However, since E always occurs when C occurs but only sometimes when C does not occur, we infer that C has high causal power. -4. Since we observe E many times even without C, we infer that E has a high base rate. - This alternative cause *explains away* C as the cause, thus giving us no further information about C. - As a result, we see that C's posterior is roughly the same as its prior. \ No newline at end of file diff --git a/solutions/lot-learning.md b/solutions/lot-learning.md deleted file mode 100644 index 6aabd13..0000000 --- a/solutions/lot-learning.md +++ /dev/null @@ -1,116 +0,0 @@ ---- -layout: exercise -title: Learning as conditional inference - solutions ---- - -## Exercise 1: Inferring Functions - -> Consider our model of function inference from the chapter. -> We can reconceptualize our program as a sequence-generator by making the input arguments 1,2,3,…. -> Suppose that the first number in the sequence `f(1)` is 1 and the second number `f(2)` is 4. -> What number would come next? 
- -~~~~ -///fold: -var plus = {fn: function(a,b) {return a + b}, expr: '+'} -var multiply = {fn: function(a,b) {return Math.round(a * b,0)}, expr: '*'} -var divide = {fn: function(a,b) {return Math.round(a/b,0)}, expr: '/'} -var minus = {fn: function(a,b) {return a - b}, expr: '-'} -var power = {fn: function(a,b) {return Math.pow(a,b)}, expr: '**'} -var binaryOps = [plus, multiply, divide, minus, power] - -var identity = {fn: function(x) {return x}, expr: 'x'} - -var randomConstantFunction = function() { - var c = uniformDraw(_.range(10)) - return {fn: function(x){return c}, expr: c} -} - -var randomCombination = function(f,g) { - var op = uniformDraw(binaryOps); - var opfn = op.fn - var ffn = f.fn - var gfn = g.fn - return {fn: function(x){return opfn(ffn(x),gfn(x))}, - expr: f.expr+op.expr+g.expr} -} - -// sample an arithmetic expression -var randomArithmeticExpression = function() { - if (flip()) { - return randomCombination(randomArithmeticExpression(), - randomArithmeticExpression()) - } else { - return flip() ? identity : randomConstantFunction() - } -} -/// - -viz.table(Infer({method: 'enumerate', maxExecutions: 1000}, function() { - var e = randomArithmeticExpression(); - var f = e.fn; - - condition(f(1) == 1); - condition(f(2) == 4); - - return f(3); // use this for Exercise 1.1 -// return e.expr; // use this for Exercise 1.2 -})) -~~~~ - - -### Exercise 1.1 - -> Not surprisingly, the model predicts `9` as the most likely result for `f(3)`. -> However, it also puts significant probability on `27`. -> Explain why these two numbers have the highest posterior probabilities. - -These results are largely due to the high probability of the functions `x * x` and `x ** x`, which return `9` and `27` for `f(3)`, respectively. - - -### Exercise 1.2 - -> Why is the probability of `x ** 2` is so much lower than `x * x`? - -The two expressions differ in the final draw from the recursive function `randomArithmeticExpression`. 
-On each step through the function, there is a 0.3 * 0.5 = 0.15 chance of returning `x`, but only a 0.3 * 0.5 * 0.1 = 0.015 chance of drawing `2`. -In general, drawing an `x` is much likely than drawing any particular number. - - -### Exercise 1.3 - -> Many people find the high probability assigned to `27` to be unintuitive (i.e. if we ran this as an experiment, 27 would be a very infrequent response). -> This suggests our model is an imperfect model of human intuitions. How could we decrease the probability of inferring `27`? -> -> HINT: Consider the priors. - -Currently, each function (`*`, `^`, `+`) is equally likely (they are drawn from a uniform distribution). -We could decrease the probability of the latter function by decreasing the probability of drawing `^`, e.g. - -~~~~norun -var randomCombination = function(f,g) { - var op = categorical({vs: binaryOps, ps: [.24, .24, .24, .24, .04]}); - var opfn = op.fn - var ffn = f.fn - var gfn = g.fn - return {fn: function(x){return opfn(ffn(x),gfn(x))}, - expr: f.expr+op.expr+g.expr} -} -~~~~ - -It seems reasonable that people are less likely to consider sequences made from powers, though this would need to be confirmed empirically. - - - -## Exercise 2: Role-governed concepts (optional) - -In the Rational Rules model we saw in the chapter, concepts were defined in terms of the features of single objects (e.g. "it's a raven if it has black wings"). -Psychologists have suggested that many concepts are not defined by the features of a single objects, but instead by the relations the object has to other objects. -For instance, "a key is something that opens a lock". -These are called *role-governed* concepts. - -Extend the Rational Rules model to capture role-governed concepts. - -Hint: You will need primitive relations in your language of thought. - -Hint: Consider adding quantifiers (e.g. *there exists*) to your language of thought. 
diff --git a/solutions/mixture-models.md b/solutions/mixture-models.md deleted file mode 100644 index 7dbbd13..0000000 --- a/solutions/mixture-models.md +++ /dev/null @@ -1,144 +0,0 @@ ---- -layout: exercise -title: Mixture models - exercises ---- - -## Exercise 1. Social group reasoning - -Our knowledge about the social world is *structured*: we do not just know a collection of facts about particular people but believe that there are *kinds* of people with shared properties. Even infants make strong assumptions about people based on what language they speak, what foods they eat, and what actions they take (check out Katherine Kinzler's work!) How do we learn this structure at the same time as we are learning about individual people? In this exercise you will explore mixture models as a formal model of how we reason about social groups. - -### a) - -Imagine you go to an alien planet and see 10 aliens: you notice three clear properties, some have antennae, some are green, and some make a distinctive 'blargh' noise. -Implement a simple model assuming that there are two kinds of aliens with different distributions over these properties, but you have a priori uncertainty over what the distributions are, and whether any particular alien belongs to group A or group B. - -HINT: each data point we observed in the chapter only had one property (from k different values). Here each alien has three properties. This means we need a way of observing all three properties under their respective prototype priors. 
- -~~~~ -///fold: -var expectationOver = function(results, group) { - return function(property) { - return expectation(results, function(v) {return v[group][property]}) - } -} -/// -var properties = ['antennae', 'green', 'blarghNoise'] -var data = [ - {antennae : false, green: false, blarghNoise: false}, - {antennae : true, green: true, blarghNoise: true}, - {antennae : true, green: true, blarghNoise: true}, - {antennae : true, green: true, blarghNoise: true}, - {antennae : false, green: false, blarghNoise: false}, - {antennae : true, green: true, blarghNoise: true}, - {antennae : false, green: false, blarghNoise: false}, - {antennae : true, green: true, blarghNoise: true}, - {antennae : false, green: false, blarghNoise: false}, - {antennae : false, green: false, blarghNoise: false} -] - -var sampleGroupPrototype = mem(function(groupName) { - var probs = repeat(3, function(){ beta(.5, .5)}) - return _.zipObject(properties, probs) -}) - -var results = Infer({method: 'MCMC', kernel: {HMC: {steps: 10, stepSize: .01}}, - samples: 3000}, function(){ - mapData({data: data}, function(datum) { - var group = flip() ? 'group1' : 'group2'; - var prototype = sampleGroupPrototype(group) - mapData({data: properties}, function(property) { - observe(Bernoulli({p: prototype[property]}), datum[property]) - }) - }) - return {group1: sampleGroupPrototype('group1'), - group2: sampleGroupPrototype('group2')} -}) -viz.bar(properties, map(expectationOver(results, 'group1'), properties)) -viz.bar(properties, map(expectationOver(results, 'group2'), properties)) -~~~~ - -### b) - -Now imagine you hear a noise from inside a crater but you cannot see the alien that emitted it; this is a noisy observation. How can you use the model you learned above to make an educated guess about their other features? - -~~~~ -var results = Infer({method: 'MCMC', kernel: {HMC: {steps: 10, stepSize: .01}}, - samples: 3000}, function(){ - mapData({data: data}, function(datum) { - var group = flip() ? 
'group1' : 'group2'; - var prototype = sampleGroupPrototype(group) - mapData({data: properties}, function(property) { - observe(Bernoulli({p: prototype[property]}), datum[property]) - }) - }) - var mysteryGroup = flip() ? 'group1' : 'group2' - var mysteryPrototype = sampleGroupPrototype(mysteryGroup) - observe(Bernoulli({p: mysteryPrototype['blarghNoise']}), true) - - return {group1: sampleGroupPrototype('group1'), - group2: sampleGroupPrototype('group2'), - mysteryGroup: mysteryGroup } -}) -viz.bar(properties, map(expectationOver(results, 'group1'), properties)) -viz.bar(properties, map(expectationOver(results, 'group2'), properties)) -marginalize(results, function(x) { - x.mysteryGroup -}) -~~~~ - - -## Exercise 2: Detecting cheating - -This problem is adapted from Section 6.5 of Lee \& Wagenmakers (2013). - -Consider the practical challenge of detecting if people cheat on a test. For example, people who have been in a car accident may seek financial compensation from insurance companies by feigning cognitive impairment such as pronounced memory loss. When these people are confronted with a memory test that is intended to measure the extent of their impairment, they may deliberately under-perform. This behavior is called malingering, and it may be accompanied by performance much worse than that displayed by real amnesiacs. Sometimes, for example, malingerers may perform substantially below chance. - -Malingering is not always easy to detect, but is naturally addressed by a mixture model. Using this approach, it is possible to infer which of two categories -- those who malinger, and those who are truthful or bona fide -- each person belongs to, and quantify the confidence in each of these classifications. -We consider an experimental study on malingering, in which each of p = 22 participants completed a memory test (Ortega, Wagenmakers, Lee, Markowitsch, & Piefke, 2012). One group of participants was told to do their best. These are the bona fide participants. 
The other group of participants was told to under-perform by deliberately simulating amnesia. These are the malingerers. Out of a total of n = 45 test items, the participants get 45, 45, 44, 45, 44, 45, 45, 45, 45, 45, 30, 20, 6, 44, 44, 27, 25, 17, 14, 27, 35, and 30 correct. Because this was an experimental study, we know that the first 10 participants were bona fide and the next 12 were instructed to malinger. - -### a) - -Implement a simple mixture model inferring which group each participant belongs to. Examine the posteriors over group-level parameters. - -HINT: the group-level variables you are trying to infer are the error rates; it probably makes sense to assume that the malingerers are worse than bonafide participants, but have uncertainty over the values of each. - -~~~~ -var scores = [45, 45, 44, 45, 44, 45, 45, 45, 45, 45, 30, 20, 6, 44, 44, 27, 25, 17, 14, 27, 35, 30] -var subjIDs = _.range(scores.length) -var data = map(function(datum) {return _.zipObject(['subjID', 'score'], datum)}, _.zip(subjIDs, scores)); - -var inferOpts = {method: 'MCMC', //kernel: {HMC: {steps: 10, stepSize: .01}}, - samples: 10000} -var results = Infer(inferOpts, function() { - var group_1_p = uniform(0.5, 1) - var group_2_p = uniform(0, group_1_p) - var participant2Group = mem(function(participantID) { - return flip() ? 'group1' : 'group2' - }) - var group2Prob = mem(function(group) { - return group == 'group1' ? 
group_1_p : group_2_p - }) - - var obsFn = function(datum){ - var p = group2Prob(participant2Group(datum.subjID)) - observe(Binomial({p: p, n: 45}), datum.score) - } - mapData({data: data}, obsFn) - - // Get participant group membership posteriors - var participantResults_ = map(function(datum) {return participant2Group(datum.subjID)}, data) - var participantResults = _.zipObject(_.range(participantResults_.length), participantResults_) - // Merge overall group success probs - return _.merge(participantResults, {group_1_p: group_1_p, group_2_p: group_2_p}) -}) - -viz.marginals(results) -~~~~ - - -### b) - -Examine the posteriors over group membership for each participant. Did all of the participants follow the instructions? (i.e. are the first 10 inferred to be in one group and the next 12 in the other?) - -*According to our run we infer that all the bonafides (10/10) followed the instructions, however 3/12 malingerers we're included with the bonafides! (Take a look at participants scores for 13 , 14, 21.)* - diff --git a/solutions/occams-razor.md b/solutions/occams-razor.md deleted file mode 100644 index b195ab4..0000000 --- a/solutions/occams-razor.md +++ /dev/null @@ -1,481 +0,0 @@ ---- -layout: exercise -title: Occam's razor - solutions ---- - -## Exercise 1. The Number Game - -> In a task called the [*number game*](https://web.mit.edu/cocosci/Papers/nips99preprint.ps), participants were presented with *sets* of numbers and asked how well different numbers completed them. -> A rule-based generative model accurately captured responses for some stimuli (e.g. for $$16, 8, 2, 64$$ or $$60, 80, 10, 30$$, participants assigned high fit to powers of two and multiples of ten, respectively). -> However, it failed to capture others such as the set $$16, 23, 19, 20$$. -> How good is 18, relative to 13, relative to 99? - -### Exercise 1.1 - -> Using the rule-based model of this task below, examine the posteriors for the following inputs: -> `[3]`, `[3, 9]`, `[3, 5, 9]`. 
-> Describe how the posterior probabilities of the rules change based on the observed sets. -> Why are they so different despite having the same priors? -> Do these results match your intuition? - -~~~~ -var maxNumber = 100; - -///fold: -var filterByInRange = function(set) { - var inRange = function(v) {v <= maxNumber && v >= 0}; - return _.uniq(filter(inRange, set)); -} - -var genEvens = function() { - return filter(function(v) {return v % 2 == 0}, _.range(1, maxNumber)); -} - -var genOdds = function() { - return filter(function(v) {return (v + 1) % 2 == 0}, _.range(1, maxNumber)); -} - -var genMultiples = function(base) { - var multiples = map(function(v) {return base * v}, _.range(maxNumber)); - return filterByInRange(multiples); -} - -var genPowers = function(base) { - var powers = map(function(v) {return Math.pow(base, v)}, _.range(maxNumber)); - return filterByInRange(powers); -} - -var inSet = function(val, set) { - return _.includes(set, val); -} - -var getSetFromHypothesis = function(rule) { - var parts = rule.split('_'); - return (parts[0] == 'multiples' ? genMultiples(_.parseInt(parts[2])) : - parts[0] == 'powers' ? genPowers(_.parseInt(parts[2])) : - parts[0] == 'evens' ? genEvens() : - parts[0] == 'odds' ? genOdds() : - console.error('unknown rule' + rule)) -}; -/// - -// Considers 4 kinds of rules: evens, odds, and multiples and powers of small numbers < 12 -var makeRuleHypothesisSpace = function() { - var multipleRules = map(function(base) {return 'multiples_of_' + base}, _.range(1, 12)); - var powerRules = map(function(base) {return 'powers_of_' + base}, _.range(1, 12)); - return multipleRules.concat(powerRules).concat(['evens', 'odds']); -} - -// Takes an unordered array of examples of a concept in the number game -// and a test query (i.e. 
a new number that the experimenter is asking about) -var learnConcept = function(examples, testQuery) { - Infer({method: 'enumerate'}, function() { - var rules = makeRuleHypothesisSpace() - var hypothesis = uniformDraw(rules) - var set = getSetFromHypothesis(hypothesis) - mapData({data: examples}, function(example) { - // note: this likelihood corresponds to size principle - observe(Categorical({vs: set}), example) - }) - return {hypothesis, testQueryResponse : inSet(testQuery, set)} - }); -} - -var examples = [3, 9]; -var testQuery = 12; -var posterior = learnConcept(examples, testQuery); -viz.marginals(posterior); -~~~~ - - -Although the prior over the hypotheses is uniform, the likelihood for each of the hypotheses is not. -The more "general" rules such as "odd-numbers" cover much larger spaces than "multiples of three" or "powers of three". -Therefore, in accordance with the size principle, observing a particular set of values under the hypotheses with smaller domains produces much higher posterior probabilities. - - -### Exercise 1.2 - -> Modify the model to include similarity-based hypotheses, represented as numbers generated by sampling from a common interval. -> Implement `genSetFromInterval` to generate all integers in `[a, b]`. -> Implement `makeIntervalHypothesisSpace` to build a list of all possible intervals in `[start, end]`. -> For example, `makeIntervalHypothesisSpace(1, 4)` should produce the following: - -~~~~norun -["interval_1_2", - "interval_1_3", - "interval_1_4", - "interval_2_3", - "interval_2_4", - "interval_3_4"] -~~~~ - -> Then modify `getSetFromHypothesis` to account for interval hypotheses. 
- -~~~~ -var maxNumber = 20; - -///fold: -var filterByInRange = function(set) { - var inRange = function(v) {v <= maxNumber && v >= 0}; - return _.uniq(filter(inRange, set)); -} - -var genEvens = function() { - return filter(function(v) {return v % 2 == 0}, _.range(1, maxNumber)); -} - -var genOdds = function() { - return filter(function(v) {return (v + 1) % 2 == 0}, _.range(1, maxNumber)); -} - -var genMultiples = function(base) { - var multiples = map(function(v) {return base * v}, _.range(maxNumber)); - return filterByInRange(multiples); -} - -var genPowers = function(base) { - var powers = map(function(v) {return Math.pow(base, v)}, _.range(maxNumber)); - return filterByInRange(powers); -} - -var inSet = function(val, set) { - return _.includes(set, val); -} - -var makeRuleHypothesisSpace = function() { - var multipleRules = map(function(base) {return 'multiples_of_' + base}, _.range(1, 12)); - var powerRules = map(function(base) {return 'powers_of_' + base}, _.range(1, 12)); - return multipleRules.concat(powerRules).concat(['evens', 'odds']); -} -/// - -var genSetFromInterval = function(a, b) { - return _.range(a, b+1); -} - -var makeIntervalHypothesisSpace = function(start, end) { - var allIntervals = _.flatten(map(function(s) { - return map(function(e) { [s, e] }, - genSetFromInterval(s+1, end)); - }, genSetFromInterval(start, end))); - - var createIntervalName = function(a, b) { 'interval_' + a + '_' + b }; - var intervalNames = map(function(x) { 'interval_' + x[0] + '_' + x[1] }, - allIntervals); - return intervalNames; -} - -var getSetFromHypothesis = function(rule) { - var parts = rule.split('_'); - return (parts[0] == 'multiples' ? genMultiples(_.parseInt(parts[2])) : - parts[0] == 'powers' ? genPowers(_.parseInt(parts[2])) : - parts[0] == 'evens' ? genEvens() : - parts[0] == 'odds' ? genOdds() : - parts[0] == 'interval' ? 
genSetFromInterval(_.parseInt(parts[1]), _.parseInt(parts[2])) : - console.error('unknown rule' + rule)); -}; - -var learnConcept = function(examples, testQuery) { - Infer({method: 'enumerate'}, function() { - var rules = makeRuleHypothesisSpace(); - var intervals = makeIntervalHypothesisSpace(1, maxNumber); - var hypothesis = flip(0.5) ? uniformDraw(rules) : uniformDraw(intervals); - var set = getSetFromHypothesis(hypothesis); - mapData({data: examples}, function(example) { - observe(Categorical({vs: set}), example) - }) - return {hypothesis: hypothesis, - testQueryResponse: inSet(testQuery, set)}; - }); -} - -var examples = [3, 10]; -var testQuery = 12; -var posterior = learnConcept(examples, testQuery); -viz.marginals(posterior); -~~~~ - - -### Exercise 1.3 - -> Now examine the sets `[3]`, `[3, 6, 9]`, and `[3, 5, 6, 7, 9]`. -> Sweep across all integers as testQueries to see the 'hotspots' of the model predictions. -> What do you observe? - -~~~~ -var maxNumber = 20; - -///fold: -var filterByInRange = function(set) { - var inRange = function(v) {v <= maxNumber && v >= 0}; - return _.uniq(filter(inRange, set)); -} - -var genEvens = function() { - return filter(function(v) {return v % 2 == 0}, _.range(1, maxNumber)); -} - -var genOdds = function() { - return filter(function(v) {return (v + 1) % 2 == 0}, _.range(1, maxNumber)); -} - -var genMultiples = function(base) { - var multiples = map(function(v) {return base * v}, _.range(maxNumber)); - return filterByInRange(multiples); -} - -var genPowers = function(base) { - var powers = map(function(v) {return Math.pow(base, v)}, _.range(maxNumber)); - return filterByInRange(powers); -} - -var inSet = function(val, set) { - return _.includes(set, val); -} - -var makeRuleHypothesisSpace = function() { - var multipleRules = map(function(base) {return 'multiples_of_' + base}, _.range(1, 12)); - var powerRules = map(function(base) {return 'powers_of_' + base}, _.range(1, 12)); - return 
multipleRules.concat(powerRules).concat(['evens', 'odds']); -} - -var genSetFromInterval = function(a, b) { - return _.range(a, b+1); -} - -var makeIntervalHypothesisSpace = function(start, end) { - var allIntervals = _.flatten(map(function(s) { - return map(function(e) { [s, e] }, - genSetFromInterval(s+1, end)); - }, genSetFromInterval(start, end))); - - var createIntervalName = function(a, b) { 'interval_' + a + '_' + b }; - var intervalNames = map(function(x) { 'interval_' + x[0] + '_' + x[1] }, - allIntervals); - return intervalNames; -} - -var getSetFromHypothesis = function(rule) { - var parts = rule.split('_'); - return (parts[0] == 'multiples' ? genMultiples(_.parseInt(parts[2])) : - parts[0] == 'powers' ? genPowers(_.parseInt(parts[2])) : - parts[0] == 'evens' ? genEvens() : - parts[0] == 'odds' ? genOdds() : - parts[0] == 'interval' ? genSetFromInterval(_.parseInt(parts[1]), _.parseInt(parts[2])) : - console.error('unknown rule' + rule)); -}; - -var learnConcept = function(examples, testQuery) { - Infer({method: 'enumerate'}, function() { - var rules = makeRuleHypothesisSpace(); - var intervals = makeIntervalHypothesisSpace(1, maxNumber); - var hypothesis = flip(0.5) ? uniformDraw(rules) : uniformDraw(intervals); - var set = getSetFromHypothesis(hypothesis); - mapData({data: examples}, function(example) { - observe(Categorical({vs: set}), example) - }) - return {hypothesis: hypothesis, - testQueryResponse: inSet(testQuery, set)}; - }); -} -/// - -var examples = [3, 6, 9]; -var queries = genSetFromInterval(1, maxNumber); -var pQueries = map(function(query) { - var post = learnConcept(examples, query); - return expectation(marginalize(post, function(x) { x.testQueryResponse })) -}, queries); -viz.line(queries, pQueries, {xLabel: 'query', yLabel: 'P(query | examples)'}); -~~~~ - -Rule-based hypotheses are more likely when the examples are `[3]` and `[3, 6, 9]`, but the hypotheses are more likely -once we have `[3, 5, 6, 7, 9]`. 
- -### Exercise 1.4 - -> Look at some of the data in the large-scale replication of the number game [here](https://openpsychologydata.metajnl.com/articles/10.5334/jopd.19/). ->Can you think of an additional concept people might be using that we did not include in our model? - -Answers may vary. The authors mention that a common hypothesis are rules involving "numbers [ending/starting] in 3". - -#### e) Challenge! - -Can you replicate the results from the paper (reproduced in figure below) by adding in the other hypotheses from the paper? - - - - -## Exercise 2: Causal induction revisited - -> In a [previous exercise](learning-as-conditional-inference.html) we explored the Causal Power (CP) model of causal learning. -> However, Griffiths and Tenenbaum [-@Griffiths2005], "Structure and strength in causal induction", hypothesized that when people do causal induction, they are not estimating a power parameter (as in CP) but instead they are deciding whether there is a causal relation at all -- they called this model Causal Support (CS). -> In other words, they are inferring whether C and E are related, and if so, then C must cause E. - -### Exercise 2.1 - -> Implement the Causal Support model by modifying the Causal Power model. - -~~~~ -var observedData = [{C:true, E:false}]; - -var causalPost = Infer({method: 'MCMC', samples: 10000, lag:2}, function() { - - // Is there a causal relation between C and E? 
- var relation = flip(); - - // Causal power of C to cause E - var cp = uniform(0, 1); - - // Background probability of E occurring regardless of C - var b = uniform(0, 1); - - mapData({data: observedData}, function(datum) { - var E = (relation && datum.C && flip(cp)) || flip(b); - condition(E == datum.E); - }) - - return {relation, cp, b}; -}) - -viz.marginals(causalPost); -~~~~ - - -### Exercise 2.2 - -> Inference with the MCMC method will not be very efficient for the above CS model because the MCMC algorithm is using the single-site Metropolis-Hastings procedure, changing only one random choice at a time. -> (To see why this is a problem, think about what happens when you try to change the choice about whether there is a causal relation.) -> -> To make this more efficient, construct the marginal probability of the effect directly and use it in an `observe` statement. -> -> *Hint:* You can do this either by figuring out the probability of the effect mathematically, or by using `Infer`. - -~~~~ -var observedData = [{C:true, E:false}]; - -var causalPost = Infer({method: 'MCMC', samples: 10000, lag:2}, function() { - - // Is there a causal relation between C and E? - var relation = flip(); - - // Causal power of C to cause E - var cp = uniform(0, 1); - - // Background probability of E occurring regardless of C - var b = uniform(0, 1); - - var noisyOrMarginal = function(C) { - return Infer({method: 'enumerate'}, function() { - return (relation && C && flip(cp)) || flip(b); - }) - } - - mapData({data: observedData}, function(datum) { - observe(noisyOrMarginal(datum.C), datum.E); - }) - - return {relation, cp, b}; -}) - -viz.marginals(causalPost); -~~~~ - - -### Exercise 2.3 - -> Fig. 1 of [-@Griffiths2005] (shown below) shows a critical difference in the predictions of CP and CS, -> specifically when the effect happens just as often with and without the cause. -> Show by running simulations the difference between CP and CS in these cases. 
- - - -~~~~ -var generateData = function(numEWithC, numEWithoutC) { - var eWithC = repeat(numEWithC, function() {return {C: true, E: true}}); - var noEWithC = repeat(8 - numEWithC, function() {return {C: true, E: false}}); - var eWithoutC = repeat(numEWithoutC, function() {return {C: false, E: true}}); - var noEWithoutC = repeat(8 - numEWithoutC, function() {return {C: false, E: false}}); - return _.flatten([eWithC, noEWithC, eWithoutC, noEWithoutC]); -} - -var dataParams = [[8, 8], [6, 6], [4, 4], [2, 2], [0, 0], [8, 6], - [6, 4], [4, 2], [2, 0], [8, 4], [6, 2], [4, 0], - [8, 2], [6, 0], [8, 0]]; - -var data = map(function(x) { generateData(x[0], x[1]) }, dataParams); - -var cpPost = function(observedData) { - return Infer({method: 'MCMC', burn: 2000, samples: 1000, lag:2}, function() { - var cp = uniform(0, 1); - var b = uniform(0, 1); - - var noisyOrMarginal = function(C) { - return Infer({method: 'enumerate'}, function() { - return (C && flip(cp)) || flip(b); - }) - } - - mapData({data: observedData}, function(datum) { - observe(noisyOrMarginal(datum.C), datum.E); - }) - - return cp; - }) -} - -var csPost = function(observedData) { - return Infer({method: 'MCMC', burn: 2000, samples: 1000, lag:2}, function() { - var relation = flip(); - var cp = uniform(0, 1); - var b = uniform(0, 1); - - var noisyOrMarginal = function(C) { - return Infer({method: 'enumerate'}, function() { - return (relation && C && flip(cp)) || flip(b); - }) - } - - mapData({data: observedData}, function(datum) { - observe(noisyOrMarginal(datum.C), datum.E); - }) - - return relation * cp; - }) -} - -var paramNames = map(function(x) { - var letter = (x + 10).toString(36).toUpperCase(); - var params = dataParams[x]; - return letter + '. 
(' + params[0] + ', ' + params[1] + ')' -}, _.range(dataParams.length)); - -var cpValues = map(function(d) { expectation(cpPost(d)) }, data); -var csValues = map(function(d) { expectation(csPost(d)) }, data); - -display("Causal power model"); -viz.bar(paramNames, cpValues); - -display("Causal support model"); -viz.bar(paramNames, csValues); -~~~~ - - -### Exercise 2.4 - -> Explain why the Causal Support model shows this effect using Bayesian Occam's razor. -> -> *Hint:* Recall that Causal Support selects between two models (one where there is a causal relation and one where there isn't). - -The 'model selection' in the Causal Support model applies Occam's razor to have a bias towards no relation. - - -## Exercise 3 (Challenge!) - -Try an informal behavioral experiment with several friends as experimental subjects to see whether the Bayesian approach to curve fitting given on the wiki page corresponds with how people actually find functional patterns in sparse noisy data. Your experiment should consist of showing each of 4-6 people 8-10 data sets (sets of x-y values, illustrated graphically as points on a plane with x and y axes), and asking them to draw a continuous function that interpolates between the data points and extrapolates at least a short distance beyond them (as far as people feel comfortable extrapolating). Explain to people that the data were produced by measuring y as some function of x, with the possibility of noise in the measurements. - -The challenge of this exercise comes in choosing the data sets you will show people, interpreting the results and thinking about how to modify or improve a probabilistic program for curve fitting to better explain what people do. Of the 8-10 data sets you use, devise several ("type A") for which you believe the WebPPL program for polynomial curve fitting will match the functions people draw, at least qualitatively. 
Come up with several other data sets ("type B") for which you expect people to draw qualitatively different functions than the WebPPL polynomial fitting program does. Does your experiment bear out your guesses about type A and type B? If yes, why do you think people found different functions to best explain the type B data sets? If not, why did you think they would? There are a number of factors to consider, but two important ones are the noise model you use, and the choice of basis functions: not all functions that people can learn or that describe natural processes in the world can be well described in terms of polynomials; other types of functions may need to be considered. - -Can you modify the WebPPL program to fit curves of qualitatively different forms besides polynomials, but of roughly equal complexity in terms of numbers of free parameters? Even if you can't get inference to work well for these cases, show some samples from the generative model that suggest how the program might capture classes of human-learnable functions other than polynomials. - -You should hand in the data sets you used for the informal experiment, discussion of the experimental results, and a modified WebPPL program for fitting qualitatively different forms from polynomials plus samples from running the program forward. diff --git a/solutions/process-models.md b/solutions/process-models.md deleted file mode 100644 index ebe8d41..0000000 --- a/solutions/process-models.md +++ /dev/null @@ -1,344 +0,0 @@ ---- -layout: exercise -title: Rational process models - solutions ---- - -> Consider once again the simple blicket detector experiment from the Conditional Dependence chapter and Bayesian Data Analysis exercises. -> Here, we have simplified the model such that the only free parameter is the base rate of being a blicket and the participant only sees one data point of evidence at a time (i.e. one set of blocks that makes the machine beep). 
-> -> In this exercise, you will extend the model from the Bayesian Data Analysis exercises to evaluate different process models on new data sets. -> -> Specifically, imagine we went to Mars to study the cognition of the aliens that live there, and in addition to collecting judgements about whether `A` was a blicket, we also collected response times (RTs) to get a better resolution into their cognitive processes. -> Response time is measured in behavioral experiments by calculating the time elapsed between presentation of the stimulus and the participant's response. -> Assume that the participants make inferences about the base rate by sampling a certain number of times. -> If they take many samples, their responses will be more accurate but at the cost of longer RTs. -> If they take few samples, their responses may be noisier but have shorter RTs. -> -> For simplicity, assume that the RT measures are in the same units as returned by `timeIt()` (milliseconds). - - -## Exercise 1 - -> Complete the code to infer the posterior distributions of the base rate and that the model is conditioned on both the participants' responses and response times. - -> HINT: The `observe()` function requires a distribution as its first parameter. - -~~~ -///fold: -var timeIt = function(func) { - var start = _.now(); - func(); - var end = _.now(); - return end - start; -} - -var detectingBlickets = function(evidence, baseRate, numSamples) { - return Infer({method: 'rejection', samples: numSamples}, function() { - var blicket = mem(function(block) { flip(baseRate) }); - var power = function(block) { blicket(block) ? .95 : .05 }; - var machineBeeps = function(blocks) { - blocks.length == 0 - ? 
flip(0.05) - : flip(power(first(blocks))) || machineBeeps(rest(blocks)) - }; - condition(machineBeeps(evidence)); - return blicket('A'); - }) -} - -var marsData = [ - {subjectID: 1, evidence: ['A'], response: true, RT: .9}, - {subjectID: 1, evidence: ['A', 'B', 'C', 'D', 'E', 'F'], response: true, RT: 1.1}, - {subjectID: 1, evidence: ['A', 'B', 'C'], response: true, RT: 1.2}, - {subjectID: 2, evidence: ['A'], response: true, RT: 3.5}, - {subjectID: 2, evidence: ['A', 'B', 'C', 'D', 'E', 'F'], response: false, RT: 4}, - {subjectID: 2, evidence: ['A', 'B', 'C'], response: true, RT: 3.4}, -]; -/// - -var getModelRT = function(func, numRepeats) { - var rt = repeat(numRepeats, function() { timeIt(func) }); - return Gaussian({mu: listMean(rt), sigma: Math.max(listVar(rt), 1)}); -} - -var dataAnalysis = function() { - var baseRate = uniform(0, 1); - var numSamples = randomInteger(100) + 1; - - map(function(datapoint) { - var blicketModel = function() { - return detectingBlickets(datapoint.evidence, baseRate, numSamples) - }; - - observe(blicketModel(), datapoint.response); - observe(getModelRT(blicketModel, 10), datapoint.RT); - }, marsData); - - return {baseRate, numSamples}; -} - -var opts = {method: 'MCMC', - callbacks: [editor.MCMCProgress()], - samples: 500, - burn: 100}; -viz.marginals(Infer(opts, dataAnalysis)); -~~~ - - -## Exercise 2 - -> How do your inferences about the base rates change with the following modifications? -> -> 1. Only `observe()` on `response`. -> 2. Only `observe()` on `RT`. -> -> What does this say about the information provided about the base rate from each source? - -Looking at just the responses, we see that the `base rate` is relatively high. -This is because 5 of the 6 responses were `true`. -Looking at just the `RT`, we now see that the `base rate` is much lower. -This is because slow `RT` means that more proposals were rejected which suggests a low `base rate`. - - -## Exercise 3 - -> Note that there is some subject variability in RT. 
-> Modify your model to allow the two subjects to have different base rates in mind. -> Visualize the base rates for each participant. -> -> What do you notice about the base rates? -> What makes their base rates so different? - - -~~~ -///fold: -var timeIt = function(func) { - var start = _.now(); - func(); - var end = _.now(); - return end - start; -} - -var detectingBlickets = function(evidence, baseRate, numSamples) { - return Infer({method: 'rejection', samples: numSamples}, function() { - var blicket = mem(function(block) { flip(baseRate) }); - var power = function(block) { blicket(block) ? .95 : .05 }; - var machineBeeps = function(blocks) { - blocks.length == 0 - ? flip(0.05) - : flip(power(first(blocks))) || machineBeeps(rest(blocks)) - }; - condition(machineBeeps(evidence)); - return blicket('A'); - }) -} - -var marsData = [ - {subjectID: 1, evidence: ['A'], response: true, RT: .9}, - {subjectID: 1, evidence: ['A', 'B', 'C', 'D', 'E', 'F'], response: true, RT: 1.1}, - {subjectID: 1, evidence: ['A', 'B', 'C'], response: true, RT: 1.2}, - {subjectID: 2, evidence: ['A'], response: true, RT: 3.5}, - {subjectID: 2, evidence: ['A', 'B', 'C', 'D', 'E', 'F'], response: false, RT: 4}, - {subjectID: 2, evidence: ['A', 'B', 'C'], response: true, RT: 3.4}, -]; - -var getModelRT = function(func, numRepeats) { - var rt = repeat(numRepeats, function() { timeIt(func) }); - return Gaussian({mu: listMean(rt), sigma: Math.max(listVar(rt), 1)}); -} -/// - -var dataAnalysis = function() { - var baseRate = mem(function(subjectID) { uniform(0, 1) }); - var numSamples = randomInteger(100) + 1; - - map(function(datapoint) { - var blicketModel = function() { - return detectingBlickets(datapoint.evidence, baseRate(datapoint.subjectID), numSamples) - }; - - observe(blicketModel(), datapoint.response); - observe(getModelRT(blicketModel, 10), datapoint.RT); - }, marsData); - - return {subject1: baseRate(1), - subject2: baseRate(2), - numSamples: numSamples}; -} - -var opts = {method: 
'MCMC', - callbacks: [editor.MCMCProgress()], - samples: 500, - burn: 100}; -viz.marginals(Infer(opts, dataAnalysis)); -~~~ - -Looking at the responses, we see that Subject 1 responds `true` to trial 2 whereas Subject 2 responds `false`. -This suggests that Subject 1 has a very high prior believing that a block is a Blicket since there are 6 blocks that could have set the machine off. -Looking at the response times (RT), we see that Subject 1 was very quick to respond while Subject 2 took much longer. -Since we assumed that they both used rejection sampling, Subject 2 most likely had far more rejections which also indicates a low prior. - - -## Exercise 4 - -> Suppose we went to survey another group of aliens on Venus and collected another data set. -> Run this same BDA on these subjects. -> How do the Venusians compare to the Martians? - -~~~ -///fold: -var timeIt = function(func) { - var start = _.now(); - func(); - var end = _.now(); - return end - start; -} - -var detectingBlickets = function(evidence, baseRate, numSamples) { - return Infer({method: 'rejection', samples: numSamples}, function() { - var blicket = mem(function(block) { flip(baseRate) }); - var power = function(block) { blicket(block) ? .95 : .05 }; - var machineBeeps = function(blocks) { - blocks.length == 0 - ? 
flip(0.05) - : flip(power(first(blocks))) || machineBeeps(rest(blocks)) - }; - condition(machineBeeps(evidence)); - return blicket('A'); - }) -} - -var venusData = [ - {subjectID: 1, evidence: ['A'], response: true, RT: .9}, - {subjectID: 1, evidence: ['A', 'B', 'C', 'D', 'E', 'F'], response: true, RT: 4}, - {subjectID: 1, evidence: ['A', 'B', 'C'], response: true, RT: 2}, - {subjectID: 2, evidence: ['A'], response: true, RT: 1.5}, - {subjectID: 2, evidence: ['A', 'B', 'C', 'D', 'E', 'F'], response: false, RT: 5}, - {subjectID: 2, evidence: ['A', 'B', 'C'], response: true, RT: 2.2}, -]; - -var getModelRT = function(func, numRepeats) { - var rt = repeat(numRepeats, function() { timeIt(func) }); - return Gaussian({mu: listMean(rt), sigma: Math.max(listVar(rt), 1)}); -} -/// - -var dataAnalysis = function() { - var baseRate = mem(function(subjectID) { uniform(0, 1) }); - var numSamples = randomInteger(100) + 1; - - map(function(datapoint) { - var blicketModel = function() { - return detectingBlickets(datapoint.evidence, baseRate(datapoint.subjectID), numSamples) - }; - - observe(blicketModel(), datapoint.response); - observe(getModelRT(blicketModel, 10), datapoint.RT); - }, venusData); - - return {subject1: baseRate(1), - subject2: baseRate(2), - numSamples: numSamples}; -} - -var opts = {method: 'MCMC', - callbacks: [editor.MCMCProgress()], - samples: 500, - burn: 100}; -viz.marginals(Infer(opts, dataAnalysis)); -~~~ - -The trends are fairly similar. - - -## Exercise 5 - -> Suppose you want to compare the hypotheses that the aliens use rejection sampling versus enumeration to estimate probabilities. -> Modify your code to infer the posterior probabilities of each method for each planet. -> Which algorithm is each kind of alien most likely to be using? - -> Hint: Make `method` a random variable. 
- - -~~~ -///fold: -var timeIt = function(func) { - var start = _.now(); - func(); - var end = _.now(); - return end - start; -} - -var detectingBlickets = function(evidence, baseRate, algorithm, numSamples) { - return Infer({method: algorithm, samples: numSamples}, function() { - var blicket = mem(function(block) { flip(baseRate) }); - var power = function(block) { blicket(block) ? .95 : .05 }; - var machineBeeps = function(blocks) { - blocks.length == 0 - ? flip(0.05) - : flip(power(first(blocks))) || machineBeeps(rest(blocks)) - }; - condition(machineBeeps(evidence)); - return blicket('A'); - }) -} - -var data = [ - {planet: 'Mars', subjectID: 1, evidence: ['A'], response: true, RT: .9}, - {planet: 'Mars', subjectID: 1, evidence: ['A', 'B', 'C', 'D', 'E', 'F'], response: true, RT: 1.1}, - {planet: 'Mars', subjectID: 1, evidence: ['A', 'B', 'C'], response: true, RT: 1.2}, - {planet: 'Mars', subjectID: 2, evidence: ['A'], response: true, RT: 3.5}, - {planet: 'Mars', subjectID: 2, evidence: ['A', 'B', 'C', 'D', 'E', 'F'], response: false, RT: 4}, - {planet: 'Mars', subjectID: 2, evidence: ['A', 'B', 'C'], response: true, RT: 3.4}, - {planet: 'Venus', subjectID: 3, evidence: ['A'], response: true, RT: .9}, - {planet: 'Venus', subjectID: 3, evidence: ['A', 'B', 'C', 'D', 'E', 'F'], response: true, RT: 4}, - {planet: 'Venus', subjectID: 3, evidence: ['A', 'B', 'C'], response: true, RT: 2}, - {planet: 'Venus', subjectID: 4, evidence: ['A'], response: true, RT: 1.5}, - {planet: 'Venus', subjectID: 4, evidence: ['A', 'B', 'C', 'D', 'E', 'F'], response: false, RT: 5}, - {planet: 'Venus', subjectID: 4, evidence: ['A', 'B', 'C'], response: true, RT: 2.2}, -]; - - -var getModelRT = function(func, numRepeats) { - var rt = repeat(numRepeats, function() { timeIt(func) }); - return Gaussian({mu: listMean(rt), sigma: Math.max(listVar(rt), 1)}); -} -/// - -var dataAnalysis = function() { - var baseRate = mem(function(subjectID) { uniform(0, 1) }); - var algorithm = 
mem(function(planet) { flip() ? 'rejection' : 'enumerate' }); - var numSamples = randomInteger(100) + 1; - - map(function(datapoint) { - var blicketModel = function() { - return detectingBlickets(datapoint.evidence, - baseRate(datapoint.subjectID), - algorithm(datapoint.planet), - numSamples) - }; - - observe(blicketModel(), datapoint.response); - observe(getModelRT(blicketModel, 10), datapoint.RT); - }, data); - - return {algVenus: algorithm('Venus'), - algMars: algorithm('Mars')}; -} - -var opts = {method: 'MCMC', - callbacks: [editor.MCMCProgress()], - samples: 500, - burn: 100}; -viz.marginals(Infer(opts, dataAnalysis)); -~~~ - - -## Exercise 6 - -> Do you think any of these algorithms are good descriptions of how people intuitively do the Blicket task? -> Explain what aspects of the inference may or may not be analogous to what people do. - -Answers may vary. Some possible observations are -1. Full enumeration seems unlikely when many blocks are involved since people would have to calculate probability estimates for an exponential number of quantities. -2. Rejection sampling would be difficult when most of the proposed samples are rejected. \ No newline at end of file diff --git a/solutions/social-cognition.md b/solutions/social-cognition.md deleted file mode 100644 index 0dcf015..0000000 --- a/solutions/social-cognition.md +++ /dev/null @@ -1,488 +0,0 @@ ---- -layout: exercise -title: Inference about inference - exercises ---- - -## Exercise 1: Tricky Agents - -> What would happen if Sally knew you were watching her and wanted to deceive you? - -### Exercise 1.1 - -> Complete the code below so that `chooseAction` chooses a misdirection if Sally is deceptive. -> Then describe and show what happens if you knew Sally was deceptive and chose action "b". 
- -~~~~ -var actionPrior = Categorical({vs: ['a', 'b', 'c'], - ps: [1/3, 1/3, 1/3]}); -var foodPrior = Categorical({vs: ['bagel', 'cookie', 'doughnut'], - ps: [1/3, 1/3, 1/3]}); - -var vendingMachine = function(state, action) { - return action == 'a' ? categorical({vs: ['bagel', 'cookie', 'doughnut'], - ps: [.8, .1, .1]}) : - action == 'b' ? categorical({vs: ['bagel', 'cookie', 'doughnut'], - ps: [.1, .8, .1]}) : - action == 'c' ? categorical({vs: ['bagel', 'cookie', 'doughnut'], - ps: [.1, .1, .8]}) : - 'nothing'; -} - -var chooseAction = function(goal, transition, state, deceive) { - return Infer({method: 'enumerate'}, function() { - var action = sample(actionPrior); - var outcome = transition(state, action); - condition(deceive ? !goal(outcome) : goal(outcome)); - return action; - }) -}; - -var goalPosterior = Infer({method: 'enumerate'}, function() { - var deceive = flip(); - var goalFood = sample(foodPrior); - var goal = function(outcome) {return outcome == goalFood}; - var sallyActionDist = chooseAction(goal, vendingMachine, 'state', deceive); - condition(deceive); - condition(sample(sallyActionDist) == 'b'); - return goalFood; -}); - -viz.auto(goalPosterior); -~~~~ - -Results: The probabilities that Sally wants a bagel or doughnut (p=0.45 for both) are much larger than -the probability she wants a cookie (p=0.1). - -### Exercise 1.2 - -> You observe that Sally chooses `a`, and then `b`. -How likely is it that she is deceptive? -What if you instead observed that she chose `b` and then `b` again? -Explain how deceptiveness and preferences interact to produce her actions. - -~~~~ -///fold: -var actionPrior = Categorical({vs: ['a', 'b', 'c'], - ps: [1/3, 1/3, 1/3]}); -var foodPrior = Categorical({vs: ['bagel', 'cookie', 'doughnut'], - ps: [1/3, 1/3, 1/3]}); - -var vendingMachine = function(state, action) { - return action == 'a' ? categorical({vs: ['bagel', 'cookie', 'doughnut'], - ps: [.8, .1, .1]}) : - action == 'b' ? 
categorical({vs: ['bagel', 'cookie', 'doughnut'], - ps: [.1, .8, .1]}) : - action == 'c' ? categorical({vs: ['bagel', 'cookie', 'doughnut'], - ps: [.1, .1, .8]}) : - 'nothing'; -} - -var chooseAction = function(goal, transition, state, deceive) { - return Infer({method: 'enumerate'}, function() { - var action = sample(actionPrior); - var outcome = transition(state, action); - condition(deceive ? !goal(outcome) : goal(outcome)); - return action; - }) -}; -/// - -var goalPosterior = Infer({method: 'enumerate'}, function() { - var deceive = flip(); - var goalFood = sample(foodPrior); - var goal = function(outcome) {return outcome == goalFood}; - var sallyActionDist = chooseAction(goal, vendingMachine, 'state', deceive); - - // condition(sample(sallyActionDist) == 'a'); // case 1 - condition(sample(sallyActionDist) == 'b'); // case 2 - condition(sample(sallyActionDist) == 'b'); - return goalFood; -}); - -viz.auto(goalPosterior); - -var possibleActions = Infer({method: 'enumerate'}, function() { - var deceive = flip(); - var goalFood = sample(foodPrior); - var goal = function(outcome) {return outcome == goalFood}; - var sallyActionDist = chooseAction(goal, vendingMachine, 'state', deceive); - - condition(deceive); - var outcome1 = sample(sallyActionDist); - var outcome2 = sample(sallyActionDist); - return {o1: outcome1, o2: outcome2}; -}); - -viz.auto(possibleActions); -~~~~ - -When Sally chooses `a` and `b`, it's unlikely that she wanted a bagel or a cookie since she would have then selected -`a` twice or `b` twice. -However, if she really wanted a doughnut and deceptive, it makes sense that she would avoid `c` both times. -When Sally chooses `b` twice, the scenario where she's honest is much more consistent with the outcome. -In the second visualization above, we can see that if Sally is deceptive, the probability of any two actions is -relatively uniform. However, if we set `condition(!deceive)` instead, we see much higher peaks for pairs of the same -actions. 
- - -## Exercise 2: Monty Hall. - -> Here, we will use the tools of Bayesian inference to explore a classic statistical puzzle -- the Monty Hall problem. -Here is one statement of the problem: -> ->> Alice is on a game show, and she's given the choice of three doors. ->> Behind one door is a car; behind the others, goats. ->> She picks door 1. The host, ->> Monty, knows what's behind the doors and opens another door, say No. 3, revealing a goat. ->> He then asks Alice if she wants to switch doors. ->> Should she switch? -> -> Intuitively, it may seem like switching doesn't matter. -> However, the canonical solution is that you *should* switch doors. -> We will explore why this is the case. -> -> For this problem, we will assume (condition) that we observe Monty opening the door that -> is neither Alice's door nor the prize door. - -### Exercise 2.1 - -> The decision to switch depends crucially on how you believe Monty chooses doors to pick. -First, write the model such that the host *randomly* picks doors (for this, fill in `montyRandom`). -In this setting, should Alice switch, or does it not matter? - -~~~~ -///fold: -var removeBadItems = function(l, badItems) { - return reduce(function(badItem, remainingL) { - return remove(badItem, remainingL) - }, l, badItems); -} - -var doors = [1, 2, 3]; -/// - -var montyRandom = function(aliceDoor, prizeDoor) { - return Infer({method: 'enumerate'}, function() { - return categorical({vs: doors}); - }) -}; - -var model = function(switches) { - var aliceDoor = categorical({vs: doors}); - var prizeDoor = categorical({vs: doors}); - - var montyDoorDist = montyRandom(aliceDoor, prizeDoor); - var montyDoor = sample(montyDoorDist); - condition(montyDoor != prizeDoor); - condition(montyDoor != aliceDoor); - - var aliceDoor = switches ? 
removeBadItems(doors, [aliceDoor, montyDoor])[0] : aliceDoor; - - return aliceDoor == prizeDoor; -} - -display("P(win) if Alice doesn't switch"); -viz.auto(Infer({method: 'enumerate'}, function() {model(false)})); -display("P(win) if Alice does switch"); -viz.auto(Infer({method: 'enumerate'}, function() {model(true)})); -~~~~ - -In this case, it doesn't matter whether Alice switches. -*A priori*, all doors are equally likely to be the prize door. -Monty has eliminated one of the non-prize doors, -but there's no reason to favor either of the other two. - -### Exercise 2.2 - -> This time, fill in the code so that Monty behaves according to the original Monty Hall problem, -i.e. picking the door that is neither the prize door nor Alice's door. -For both-avoiding Monty, you'll find that Alice *should* switch. - -~~~~ -///fold: -var removeBadItems = function(l, badItems) { - return reduce(function(badItem, remainingL) { - return remove(badItem, remainingL) - }, l, badItems); -} - -var doors = [1, 2, 3]; -/// - -var montyAvoidBoth = function(aliceDoor, prizeDoor) { - return Infer({method: 'enumerate'}, function() { - var montyDoor = categorical({vs: doors}); - condition(montyDoor != aliceDoor); - condition(montyDoor != prizeDoor); - return montyDoor; - }) -}; - -var model = function(switches) { - var aliceDoor = categorical({vs: doors}); - var prizeDoor = categorical({vs: doors}); - - var montyDoorDist = montyAvoidBoth(aliceDoor, prizeDoor); - var montyDoor = sample(montyDoorDist); - condition(montyDoor != prizeDoor); - condition(montyDoor != aliceDoor); - var aliceDoor = switches ? 
removeBadItems(doors, [aliceDoor, montyDoor])[0] : aliceDoor; - - return aliceDoor == prizeDoor; -} - -display("P(win) if Alice doesn't switch"); -viz.auto(Infer({method: 'enumerate'}, function() {model(false)})); -display("P(win) if Alice does switch"); -viz.auto(Infer({method: 'enumerate'}, function() {model(true)})); -~~~~ - -By running the model, we see that switching doors allows Alice to find the car 2/3 of the time. - -### Exercise 2.3 - -> This is unintuitive -- we know that Monty picked door 3, so why should the process he used to arrive at this choice matter? -By hand, complete the probability table for P(Alice, Prize, Monty) under both `montyRandom` and `montyAvoidBoth`. -Your tables should look like: - -> Alice's door| Prize door| Monty's Door| P(Alice, Prize, Monty) --------------| -----------| -------------| ----------------------- -1| 1| 1| ... -1| 1| 2| ... -...| ...| ...| ... - -> Using these tables, explain why Alice should switch for both-avoiding Monty but why switching doesn't matter for random Monty. -Hint: you will want to compare particular *rows* of these tables. 
- -| Alice's Door | Prize Door | Monty's Door | P(Alice, Prize, Monty) | -|--------------|------------|--------------|------------------------| -| 1 | 1 | 1 | 0.037 | -| 1 | 1 | 2 | 0.037 | -| 1 | 1 | 3 | 0.037 | -| 1 | 2 | 1 | 0.037 | -| 1 | 2 | 2 | 0.037 | -| 1 | 2 | 3 | 0.037 | -| 1 | 3 | 1 | 0.037 | -| 1 | 3 | 2 | 0.037 | -| 1 | 3 | 3 | 0.037 | -| 2 | 1 | 1 | 0.037 | -| 2 | 1 | 2 | 0.037 | -| 2 | 1 | 3 | 0.037 | -| 2 | 2 | 1 | 0.037 | -| 2 | 2 | 2 | 0.037 | -| 2 | 2 | 3 | 0.037 | -| 2 | 3 | 1 | 0.037 | -| 2 | 3 | 2 | 0.037 | -| 2 | 3 | 3 | 0.037 | -| 3 | 1 | 1 | 0.037 | -| 3 | 1 | 2 | 0.037 | -| 3 | 1 | 3 | 0.037 | -| 3 | 2 | 1 | 0.037 | -| 3 | 2 | 2 | 0.037 | -| 3 | 2 | 3 | 0.037 | -| 3 | 3 | 1 | 0.037 | -| 3 | 3 | 2 | 0.037 | -| 3 | 3 | 3 | 0.037 | - -If we condition on Alice choosing Door 1, Monty choosing Door 3, and Door 3 not being the prize, -there are only two remaining possibilities: - -| Alice's Door | Prize Door | Monty's Door | P(Alice, Prize, Monty) | -|--------------|------------|--------------|------------------------| -| 1 | 1 | 3 | 0.037 | -| 1 | 2 | 3 | 0.037 | - -These are equally likely in the prior and thus equally likely in the posterior. 
- -Under `montyAvoidBoth`: - -| Alice's Door | Prize Door | Monty's Door | P(Alice, Prize, Monty) | -|--------------|------------|--------------|------------------------| -| 1 | 1 | 1 | 0 | -| 1 | 1 | 2 | 0.06 | -| 1 | 1 | 3 | 0.06 | -| 1 | 2 | 1 | 0 | -| 1 | 2 | 2 | 0 | -| 1 | 2 | 3 | 0.11 | -| 1 | 3 | 1 | 0 | -| 1 | 3 | 2 | 0.11 | -| 1 | 3 | 3 | 0 | -| 2 | 1 | 1 | 0 | -| 2 | 1 | 2 | 0 | -| 2 | 1 | 3 | 0.11 | -| 2 | 2 | 1 | 0.06 | -| 2 | 2 | 2 | 0 | -| 2 | 2 | 3 | 0.06 | -| 2 | 3 | 1 | 0.11 | -| 2 | 3 | 2 | 0 | -| 2 | 3 | 3 | 0 | -| 3 | 1 | 1 | 0 | -| 3 | 1 | 2 | 0.11 | -| 3 | 1 | 3 | 0 | -| 3 | 2 | 1 | 0.11 | -| 3 | 2 | 2 | 0 | -| 3 | 2 | 3 | 0 | -| 3 | 3 | 1 | 0.06 | -| 3 | 3 | 2 | 0.06 | -| 3 | 3 | 3 | 0 | - -Again, conditioning leaves only the two possibilities: - -| Alice's Door | Prize Door | Monty's Door | P(Alice, Prize, Monty) | -|--------------|------------|--------------|------------------------| -| 1 | 1 | 3 | 0.06 | -| 1 | 2 | 3 | 0.11 | - -Thus, in the posterior, the possibility where Door 2 is the prize door is twice as likely as the possibility where Door 1 is the prize door. -Alice should switch. 
- -Via code: - -~~~ -///fold: -var removeBadItems = function(l, badItems) { - return reduce(function(badItem, remainingL) { - return remove(badItem, remainingL) - }, l, badItems); -} - -var doors = [1, 2, 3]; -/// - -var montyRandom = function(aliceDoor, prizeDoor) { - return Infer({method: 'enumerate'}, function() { - return categorical({vs: doors}); - }) -}; - -var montyAvoidBoth = function(aliceDoor, prizeDoor) { - return Infer({method: 'enumerate'}, function() { - return categorical({vs: removeBadItems(doors, [aliceDoor, prizeDoor])}); - }) -}; - -var model = function(montyFunction) { - var aliceDoor = categorical({vs: doors}); - var prizeDoor = categorical({vs: doors}); - - var montyDoorDist = montyFunction(aliceDoor, prizeDoor); - var montyDoor = sample(montyDoorDist); - condition(montyDoor != prizeDoor); - condition(montyDoor != aliceDoor); - return {alice: aliceDoor, prize: prizeDoor, monty: montyDoor}; -} - -display("Using montyRandom") -viz.table(Infer({method: 'enumerate'}, function() { model(montyRandom) })); - -display("Using montyAvoidBoth") -viz.table(Infer({method: 'enumerate'}, function() { model(montyAvoidBoth) })); -~~~ - - -### Exercise 2.4 - -> This time, fill in the code so that Monty randomly chooses between the two doors that aren't Alice's door. -> What should Alice do now? 
- -~~~ -///fold: -var removeBadItems = function(l, badItems) { - return reduce(function(badItem, remainingL) { - return remove(badItem, remainingL) - }, l, badItems); -} - -var doors = [1, 2, 3]; -/// - -var montyAvoidAlice = function(aliceDoor, prizeDoor) { - return Infer({method: 'enumerate'}, function() { - var montyDoor = categorical({vs: doors}); - condition(montyDoor != aliceDoor); - return montyDoor; - }) -}; - -var model = function(switches) { - var aliceDoor = categorical({vs: doors}); - var prizeDoor = categorical({vs: doors}); - - var montyDoorDist = montyAvoidAlice(aliceDoor, prizeDoor); - var montyDoor = sample(montyDoorDist); - condition(montyDoor != prizeDoor); - condition(montyDoor != aliceDoor); - var aliceDoor = switches ? removeBadItems(doors, [aliceDoor, montyDoor])[0] : aliceDoor; - - condition(montyDoor != prizeDoor); - return aliceDoor == prizeDoor; -} - -display("P(win) if Alice doesn't switch"); -viz.auto(Infer({method: 'enumerate'}, function() {model(false)})); -display("P(win) if Alice does switch"); -viz.auto(Infer({method: 'enumerate'}, function() {model(true)})); -~~~ - -If Monty's policy is to open a door that Alice didn't choose, but we observe -that his door isn't the prize door, it doesn't matter whether she switches or not. - - -### Exercise 2.5 - -> This time, fill in the code so that Monty randomly chooses between the two doors that aren't the prize door. -> What should Alice do now? 
- -~~~ -///fold: -var removeBadItems = function(l, badItems) { - return reduce(function(badItem, remainingL) { - return remove(badItem, remainingL) - }, l, badItems); -} - -var doors = [1, 2, 3]; -/// - -var montyAvoidPrize = function(aliceDoor, prizeDoor) { - return Infer({method: 'enumerate'}, function() { - var montyDoor = categorical({vs: doors}); - condition(montyDoor != prizeDoor); - return montyDoor; - }) -}; - -var model = function(switches) { - var aliceDoor = categorical({vs: doors}); - var prizeDoor = categorical({vs: doors}); - - var montyDoorDist = montyAvoidPrize(aliceDoor, prizeDoor); - var montyDoor = sample(montyDoorDist); - condition(montyDoor != prizeDoor); - condition(montyDoor != aliceDoor); - var aliceDoor = switches ? removeBadItems(doors, [aliceDoor, montyDoor])[0] : aliceDoor; - - return aliceDoor == prizeDoor; -} - -display("P(win) if Alice doesn't switch"); -viz.auto(Infer({method: 'enumerate'}, function() {model(false)})); -display("P(win) if Alice does switch"); -viz.auto(Infer({method: 'enumerate'}, function() {model(true)})); -~~~ - -If Monty's policy is to open a door that isn't the prize door, but we observe -that his door isn't Alice's door, it doesn't matter whether she switches or not. - - -### Exercise 2.6 - -> The psychological question is why do people have the initial intuition that switching shouldn’t matter? -> Given your explorations, propose a hypothesis. -> Can you think of an experiment that would test this hypothesis? - -[Note: There’s no right answer to this, so answers may vary.] - -One model might be that people believe that Monty is trying to avoid the prize door, -or believe that he actually acts randomly. -Either possibility would lead to the prediction that Alice should be indifferent to switching. 
diff --git a/teaching_extras/Algorithms_for_Inference_creativity.md b/teaching_extras/Algorithms_for_Inference_creativity.md new file mode 100644 index 0000000..4b16030 --- /dev/null +++ b/teaching_extras/Algorithms_for_Inference_creativity.md @@ -0,0 +1,67 @@ +--- +layout: exercise +title: Suchow creativity algorithm +custom_js: +- assets/js/draw.js +- assets/js/paper-full.js +--- + +# Suchow creativity algorithm + +Here is a fairly straightforward implementation of the example given by Suchow and colleagues of word search. Here, there are 3 cue words and 6 potential targets. The goal is to find a word from the target list that is closely related to all three cues. Similarity here is strictly ordinal: each word in each vector is successively less related to the start word. + + + +~~~~ +var words = function(cue){ + return(cue == "falling" ? ["air", "asleep", "star", "curtain", "dirt", "actress"] : + cue == "actor" ? ["actress", "star", "curtain", "asleep", "dirt"] : + cue == "dust" ? ["dirt", "air", "star", "curtain", "asleep", "actress"] : + cue == "asleep" ? ["star", "curtain", "air", "dirt", "actress"] : + cue == "air" ? ["star", "curtain", "dirt", "asleep", "actress"] : + cue == "star" ? ["air", "actress", "asleep", "curtain", "dirt"] : + cue == "curtain" ? ["air", "asleep", "actress", "star", "dirt"] : + cue == "actress" ? ["star", "curtain", "asleep", "dirt", "air"] : + cue == "dirt" ?
["curtain", "air", "asleep", "actress", "star"] : + 6) +} + +var propose = function(current){ + var dist = Math.min(sample(Poisson({mu: .6})), 4) + return words(current)[dist] +} + +var prob = function(w) { + var meandist = Math.round(1/3*(words("falling").indexOf(w) + words("actor").indexOf(w) + words("dust").indexOf(w))) + return Poisson({mu:.6}).score(meandist) + +} + +var accept_prob = function(word1, word2){ + return(Math.min(1, + (Math.exp(prob(word2))*Math.exp(Poisson({mu: 1}).score(words(word2).indexOf(word1)+1)))/ + (Math.exp(prob(word1))*Math.exp(Poisson({mu: 1}).score(words(word1).indexOf(word2)+1))) + )) +} + +var transition = function(word1){ + var proposal = propose(word1) + return flip(accept_prob(word1, proposal)) ? + proposal : + word1 +} + +var mcmc = function(state, iterations){ + return ((iterations == 1) ? [state] : mcmc(transition(state), iterations-1).concat(state)) +} + +var findword = function(iterations){ + var startword = sample(Categorical({vs: ["falling","actor","dust"], ps: [1/3, 1/3, 1/3]})) + return mcmc(startword, iterations) +} + +viz(findword(10000)) +findword(40) +~~~~ + +As can be seen, the chain will hop around a bit until it lands on 'air' and then generally stick there. (Making this happen took some playing with the Poisson distributions. You might choose different mus and see how it affects the probability of transitioning.) \ No newline at end of file