From d47dcbf9da9645adde08b72d9f54205e5cfe653b Mon Sep 17 00:00:00 2001 From: Jillian Rowe Date: Sun, 19 Jul 2020 12:48:43 +0300 Subject: [PATCH] breaking down the PCA user story --- .../000-Data-Structure.ipynb | 635 ++++++++ ...01-Exploratory-Statistics-Attributes.ipynb | 505 +++++++ ...ploratory-Statistics-Variant-Quality.ipynb | 588 ++++++++ .../003-Filter-Variants.ipynb | 329 +++++ .../004-Subset-Genotypes.ipynb | 709 +++++++++ ...005-Exploratory-Statistics-Sample-QC.ipynb | 920 ++++++++++++ .../006-PCA-Populations.ipynb | 1279 +++++++++++++++++ 7 files changed, 4965 insertions(+) create mode 100644 stories/pop-structure-pca/000-Data-Structure.ipynb create mode 100644 stories/pop-structure-pca/001-Exploratory-Statistics-Attributes.ipynb create mode 100644 stories/pop-structure-pca/002-Exploratory-Statistics-Variant-Quality.ipynb create mode 100644 stories/pop-structure-pca/003-Filter-Variants.ipynb create mode 100644 stories/pop-structure-pca/004-Subset-Genotypes.ipynb create mode 100644 stories/pop-structure-pca/005-Exploratory-Statistics-Sample-QC.ipynb create mode 100644 stories/pop-structure-pca/006-PCA-Populations.ipynb diff --git a/stories/pop-structure-pca/000-Data-Structure.ipynb b/stories/pop-structure-pca/000-Data-Structure.ipynb new file mode 100644 index 0000000..63bb66d --- /dev/null +++ b/stories/pop-structure-pca/000-Data-Structure.ipynb @@ -0,0 +1,635 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MalariaGen Directory and Data Structure\n", + "\n", + "This notebook describes the directory and the data structure of the MalariaGen Phase2 Variant data on Google Cloud Storage." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import zarr\n", + "import pandas as pd\n", + "import dask.array as da\n", + "import allel\n", + "from pprint import pprint\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "distributed.scheduler - INFO - Clear task state\n", + "distributed.scheduler - INFO - Scheduler at: tcp://10.35.63.92:38717\n", + "distributed.scheduler - INFO - dashboard at: :8787\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4158d342208b44d5b57da7c12f8fe3f2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HTML(value='

KubeCluster

'), HBox(children=(HTML(value='\\n
\\n \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ox_codesrc_codepopulationcountrylocationsitecontributorcontactyearm_ssexn_sequencesmean_coverageebi_sample_acclatitudelongitude
0AA0040-CTwifo_Praso__E2GHcolGhanaTwifo PrasoTwifo PrasoDavid WeetmanDavid Weetman2012MF9503336830.99ERS3118785.60858-1.54926
1AA0041-CTwifo_Praso__H3GHcolGhanaTwifo PrasoTwifo PrasoDavid WeetmanDavid Weetman2012MF9584380431.70ERS3118865.60858-1.54926
2AA0042-CTakoradi_C7GHcolGhanaTakoradiTakoradiDavid WeetmanDavid Weetman2012MF10742066635.65ERS3118944.91217-1.77397
3AA0043-CTakoradi_H8GHcolGhanaTakoradiTakoradiDavid WeetmanDavid Weetman2012MF9599375229.46ERS3119024.91217-1.77397
4AA0044-CTakoradi_D10GHcolGhanaTakoradiTakoradiDavid WeetmanDavid Weetman2012MF10304426233.67ERS3119104.91217-1.77397
\n", + "
" + ], + "text/plain": [ + " ox_code src_code population country location site \\\n", + "0 AA0040-C Twifo_Praso__E2 GHcol Ghana Twifo Praso Twifo Praso \n", + "1 AA0041-C Twifo_Praso__H3 GHcol Ghana Twifo Praso Twifo Praso \n", + "2 AA0042-C Takoradi_C7 GHcol Ghana Takoradi Takoradi \n", + "3 AA0043-C Takoradi_H8 GHcol Ghana Takoradi Takoradi \n", + "4 AA0044-C Takoradi_D10 GHcol Ghana Takoradi Takoradi \n", + "\n", + " contributor contact year m_s sex n_sequences mean_coverage \\\n", + "0 David Weetman David Weetman 2012 M F 95033368 30.99 \n", + "1 David Weetman David Weetman 2012 M F 95843804 31.70 \n", + "2 David Weetman David Weetman 2012 M F 107420666 35.65 \n", + "3 David Weetman David Weetman 2012 M F 95993752 29.46 \n", + "4 David Weetman David Weetman 2012 M F 103044262 33.67 \n", + "\n", + " ebi_sample_acc latitude longitude \n", + "0 ERS311878 5.60858 -1.54926 \n", + "1 ERS311886 5.60858 -1.54926 \n", + "2 ERS311894 4.91217 -1.77397 \n", + "3 ERS311902 4.91217 -1.77397 \n", + "4 ERS311910 4.91217 -1.77397 " + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_samples = pd.read_csv('samples.meta.txt', sep='\\t')\n", + "df_samples.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:root] *", + "language": "python", + "name": "conda-root-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/stories/pop-structure-pca/001-Exploratory-Statistics-Attributes.ipynb b/stories/pop-structure-pca/001-Exploratory-Statistics-Attributes.ipynb new file mode 100644 index 0000000..160662b --- /dev/null +++ b/stories/pop-structure-pca/001-Exploratory-Statistics-Attributes.ipynb @@ -0,0 +1,505 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exploratory Statistics on Variant Attributes\n", + "\n", + "This comes from the example notebook and the [tour of scikit-allel](http://alimanfoo.github.io/2016/06/10/scikit-allel-tour.html) blog post.\n", + "\n", + "The first stage of any analysis is mostly to look at distributions and make sure they aren't too weird looking." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "distributed.scheduler - INFO - Register tcp://10.35.35.2:38435\n", + "distributed.scheduler - INFO - Starting worker compute stream, tcp://10.35.35.2:38435\n", + "distributed.core - INFO - Starting established connection\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import zarr\n", + "import pandas as pd\n", + "import dask.array as da\n", + "import allel\n", + "from pprint import pprint\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "sns.set_style('white')\n", + "sns.set_style('ticks')\n", + "sns.set_context('notebook')\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cluster Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "distributed.scheduler - INFO - Clear task state\n", + "/opt/conda/lib/python3.7/site-packages/distributed/dashboard/core.py:72: UserWarning: \n", + "Port 8787 is already in use. \n", + "Perhaps you already have a cluster running?\n", + "Hosting the diagnostics dashboard on a random port instead.\n", + " warnings.warn(\"\\n\" + msg)\n", + "distributed.scheduler - INFO - Scheduler at: tcp://10.35.63.92:39333\n", + "distributed.scheduler - INFO - dashboard at: :41583\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bab24c1eb11b4fa399d8f7dd9608b297", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HTML(value='

KubeCluster

'), HBox(children=(HTML(value='\\n
\\n \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ox_codesrc_codepopulationcountrylocationsitecontributorcontactyearm_ssexn_sequencesmean_coverageebi_sample_acclatitudelongitude
0AA0040-CTwifo_Praso__E2GHcolGhanaTwifo PrasoTwifo PrasoDavid WeetmanDavid Weetman2012MF9503336830.99ERS3118785.60858-1.54926
1AA0041-CTwifo_Praso__H3GHcolGhanaTwifo PrasoTwifo PrasoDavid WeetmanDavid Weetman2012MF9584380431.70ERS3118865.60858-1.54926
2AA0042-CTakoradi_C7GHcolGhanaTakoradiTakoradiDavid WeetmanDavid Weetman2012MF10742066635.65ERS3118944.91217-1.77397
3AA0043-CTakoradi_H8GHcolGhanaTakoradiTakoradiDavid WeetmanDavid Weetman2012MF9599375229.46ERS3119024.91217-1.77397
4AA0044-CTakoradi_D10GHcolGhanaTakoradiTakoradiDavid WeetmanDavid Weetman2012MF10304426233.67ERS3119104.91217-1.77397
\n", + "
" + ], + "text/plain": [ + " ox_code src_code population country location site \\\n", + "0 AA0040-C Twifo_Praso__E2 GHcol Ghana Twifo Praso Twifo Praso \n", + "1 AA0041-C Twifo_Praso__H3 GHcol Ghana Twifo Praso Twifo Praso \n", + "2 AA0042-C Takoradi_C7 GHcol Ghana Takoradi Takoradi \n", + "3 AA0043-C Takoradi_H8 GHcol Ghana Takoradi Takoradi \n", + "4 AA0044-C Takoradi_D10 GHcol Ghana Takoradi Takoradi \n", + "\n", + " contributor contact year m_s sex n_sequences mean_coverage \\\n", + "0 David Weetman David Weetman 2012 M F 95033368 30.99 \n", + "1 David Weetman David Weetman 2012 M F 95843804 31.70 \n", + "2 David Weetman David Weetman 2012 M F 107420666 35.65 \n", + "3 David Weetman David Weetman 2012 M F 95993752 29.46 \n", + "4 David Weetman David Weetman 2012 M F 103044262 33.67 \n", + "\n", + " ebi_sample_acc latitude longitude \n", + "0 ERS311878 5.60858 -1.54926 \n", + "1 ERS311886 5.60858 -1.54926 \n", + "2 ERS311894 4.91217 -1.77397 \n", + "3 ERS311902 4.91217 -1.77397 \n", + "4 ERS311910 4.91217 -1.77397 " + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples_subset = samples[sample_selection]\n", + "samples_subset.reset_index(drop=True, inplace=True)\n", + "samples_subset.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let’s subset the genotype calls to keep only variants that pass our quality filters and only samples in our two populations of interest." + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1min 37s, sys: 31 s, total: 2min 8s\n", + "Wall time: 1min 1s\n" + ] + } + ], + "source": [ + "%%time\n", + "genotypes_subset = genotypes.subset(variant_selection, sample_selection)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
<GenotypeChunkedArray shape=(304050, 67, 2) dtype=int8 chunks=(4751, 67, 2)\n", + " nbytes=38.9M cbytes=2.7M cratio=14.4\n", + " compression=blosc compression_opts={'cname': 'lz4', 'clevel': 5, 'shuffle': 1, 'blocksize': 0}\n", + " values=zarr.core.Array>
01234...6263646566
00/00/00/00/00/0...0/00/00/00/00/0
10/00/00/00/00/0...0/00/00/00/00/0
20/00/00/00/00/0...0/00/00/00/00/0
......
3040470/00/00/00/00/0...0/00/00/00/00/0
3040480/00/00/00/00/0...0/00/00/00/00/0
3040490/00/00/00/00/0...0/00/00/00/00/0
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "genotypes_subset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:root] *", + "language": "python", + "name": "conda-root-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/stories/pop-structure-pca/005-Exploratory-Statistics-Sample-QC.ipynb b/stories/pop-structure-pca/005-Exploratory-Statistics-Sample-QC.ipynb new file mode 100644 index 0000000..38555ef --- /dev/null +++ b/stories/pop-structure-pca/005-Exploratory-Statistics-Sample-QC.ipynb @@ -0,0 +1,920 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exploratory Statistics - Sample QC\n", + "\n", + "This comes from the example notebook and the [tour of scikit-allel](http://alimanfoo.github.io/2016/06/10/scikit-allel-tour.html) blog post." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import zarr\n", + "import pandas as pd\n", + "import dask.array as da\n", + "import allel\n", + "import scipy\n", + "from pprint import pprint\n", + "\n", + "import matplotlib as mpl\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import seaborn as sns\n", + "sns.set_style('white')\n", + "sns.set_style('ticks')\n", + "sns.set_context('notebook')\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "distributed.scheduler - INFO - Clear task state\n", + "/opt/conda/lib/python3.7/site-packages/distributed/dashboard/core.py:72: UserWarning: \n", + "Port 8787 is already in use. \n", + "Perhaps you already have a cluster running?\n", + "Hosting the diagnostics dashboard on a random port instead.\n", + " warnings.warn(\"\\n\" + msg)\n", + "distributed.scheduler - INFO - Scheduler at: tcp://10.35.63.92:39679\n", + "distributed.scheduler - INFO - dashboard at: :42599\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1737526da1ec46ea8c511b68d5ca4588", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HTML(value='

KubeCluster

'), HBox(children=(HTML(value='\\n
\\n