Skip to content

Commit 3f61f7a

Browse files
add data simulation doc page (#427)
* add data simulation doc page Signed-off-by: William Brandler <William.Brandler@databricks.com> * add simulation notebooks Signed-off-by: William Brandler <William.Brandler@databricks.com> * refer to notebooks in doc Signed-off-by: William Brandler <William.Brandler@databricks.com> * fix typo Signed-off-by: William Brandler <William.Brandler@databricks.com> * fix typo Signed-off-by: William Brandler <William.Brandler@databricks.com> * fix typo Signed-off-by: William Brandler <William.Brandler@databricks.com> * change name, drop n_samples from 500k to 50k Signed-off-by: William Brandler <William.Brandler@databricks.com> * change name, drop n_samples from 500k to 50k Signed-off-by: William Brandler <William.Brandler@databricks.com>
1 parent 2598629 commit 3f61f7a

File tree

7 files changed

+505
-2
lines changed

7 files changed

+505
-2
lines changed

docs/dev/gen-nb-src.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
def run_cli_workspace_cmd(cli_profile, args):
    """Run a ``databricks workspace`` CLI subcommand for a given profile.

    Parameters
    ----------
    cli_profile : str
        Name of the Databricks CLI connection profile to use.
    args : list[str]
        Arguments appended after the ``workspace`` subcommand.

    Raises
    ------
    ValueError
        If the CLI exits non-zero; the CompletedProcess (with captured
        stdout/stderr) is attached for diagnosis.
    """
    cmd = ['databricks', '--profile', cli_profile, 'workspace'] + args
    res = subprocess.run(cmd, capture_output=True)
    # Use `!=`, not `is not`: identity comparison with an int literal is
    # implementation-dependent and raises a SyntaxWarning on modern Python.
    if res.returncode != 0:
        raise ValueError(res)
2626

2727

docs/source/_static/notebooks/etl/simulate_covariates_phenotypes.html

Lines changed: 43 additions & 0 deletions
Large diffs are not rendered by default.

docs/source/_static/notebooks/etl/simulate_delta_pvcf.html

Lines changed: 43 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
# Databricks notebook source
2+
# MAGIC %md
3+
# MAGIC
4+
# MAGIC ### Simulate random binary / quantitative covariates and phenotypes
5+
6+
# COMMAND ----------
7+
8+
# MAGIC %md
9+
# MAGIC ##### import libraries
10+
11+
# COMMAND ----------
12+
13+
import random
14+
import pandas as pd
15+
import numpy as np
16+
from pathlib import Path
17+
18+
# COMMAND ----------
19+
20+
# MAGIC %md
21+
# MAGIC ##### Data Generation Constants
22+
23+
# COMMAND ----------
24+
25+
# --- Data-generation constants -------------------------------------------

# Genotype matrix: number of simulated samples.
n_samples = 50000

# Phenotypes: one binary and one quantitative column.
n_binary_phenotypes = 1
n_quantitative_phenotypes = 1
n_phenotypes = n_binary_phenotypes + n_quantitative_phenotypes

# Covariates: eight quantitative plus two binary columns.
n_quantitative_covariates = 8
n_binary_covariates = 2
n_covariates = n_quantitative_covariates + n_binary_covariates

# Fraction of phenotype cells randomly blanked to NaN.
missingness = 0.7
38+
39+
# COMMAND ----------
40+
41+
# MAGIC %md
42+
# MAGIC ##### Data Storage Path Constants
43+
44+
# COMMAND ----------
45+
46+
# Resolve the current workspace user and derive their DBFS home directory
# in both URI form ("dbfs:/...", used by Spark/dbutils) and FUSE form
# ("/dbfs/...", used by local-file APIs such as pandas).
user = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags().apply('user')
dbfs_home_path_str = "dbfs:/home/{}/".format(user)
dbfs_fuse_home_path_str = "/dbfs/home/{}/".format(user)
# Reuse the strings already built above; previously each format string was
# evaluated a second time just to construct the Path objects.
dbfs_home_path = Path(dbfs_home_path_str)
dbfs_fuse_home_path = Path(dbfs_fuse_home_path_str)
51+
52+
# COMMAND ----------
53+
54+
# MAGIC %md
55+
# MAGIC ##### simulate covariates helper functions
56+
57+
# COMMAND ----------
58+
59+
def np_array_to_pandas_with_missing(np_array, missingness, n_cols, col_prefix='Q'):
    """Wrap a NumPy matrix in a DataFrame and blank out a random fraction.

    Columns are named ``<col_prefix>1`` .. ``<col_prefix><n_cols>``; each
    cell is independently replaced by NaN with probability ``missingness``.
    """
    column_names = [f"{col_prefix}{idx}" for idx in range(1, n_cols + 1)]
    pdf = pd.DataFrame(np_array, columns=column_names)
    drop_mask = np.random.choice([True, False], size=pdf.shape,
                                 p=[missingness, 1 - missingness])
    return pdf.mask(drop_mask)
63+
64+
def add_sample_index_pdf(pdf, sample="sample_id"):
    """Name the DataFrame index and stringify its values.

    Parameters
    ----------
    pdf : pd.DataFrame
        Frame whose positional index identifies samples.
    sample : str
        Name to assign to the index. Previously this parameter was
        accepted but ignored ("sample_id" was always used); it is now
        honored, with the same default so existing callers see no change.

    Returns
    -------
    pd.DataFrame
        The same frame (mutated in place) with a named, string-valued index.
    """
    pdf.index.name = sample
    pdf.index = pdf.index.map(str)
    return pdf
68+
69+
# COMMAND ----------
70+
71+
# MAGIC %md
72+
# MAGIC ##### set variables
73+
74+
# COMMAND ----------
75+
76+
dbfs_file_path = dbfs_home_path / "genomics/data/pandas/"
77+
dbutils.fs.mkdirs(str(dbfs_file_path))
78+
79+
dbfs_file_fuse_path = dbfs_fuse_home_path / "genomics/data/pandas/"
80+
simulate_file_prefix = f"simulate_{n_samples}_samples_"
81+
82+
output_covariates = str(dbfs_file_fuse_path / (simulate_file_prefix + f"{n_covariates}_covariates.csv"))
83+
output_quantitative_phenotypes = str(dbfs_file_fuse_path / (simulate_file_prefix + f"{n_quantitative_phenotypes}_quantitative_phenotypes.csv"))
84+
output_binary_phenotypes = str(dbfs_file_fuse_path / (simulate_file_prefix + f"{n_binary_phenotypes}_binary_phenotypes.csv"))
85+
86+
output_covariates, output_quantitative_phenotypes, output_binary_phenotypes
87+
88+
# COMMAND ----------
89+
90+
# MAGIC %md
91+
# MAGIC ##### simulate covariates
92+
93+
# COMMAND ----------
94+
95+
covariates_quantitative = np.random.random((n_samples, n_quantitative_covariates))
96+
covariates_quantitative_pdf = np_array_to_pandas_with_missing(covariates_quantitative, 0, n_quantitative_covariates, col_prefix='Q')
97+
98+
# COMMAND ----------
99+
100+
covariates_quantitative_pdf
101+
102+
# COMMAND ----------
103+
104+
covariates_binary = np.random.randint(0, 2, (n_samples, n_binary_covariates))
105+
covariates_binary_pdf = np_array_to_pandas_with_missing(covariates_binary, 0, n_binary_covariates, col_prefix='B')
106+
covariates_binary_pdf = covariates_binary_pdf.astype(pd.Int64Dtype())
107+
covariates_binary_pdf
108+
109+
# COMMAND ----------
110+
111+
covariates = pd.concat([covariates_binary_pdf, covariates_quantitative_pdf], axis=1)
112+
covariates = add_sample_index_pdf(covariates)
113+
covariates.head(5)
114+
115+
# COMMAND ----------
116+
117+
covariates.to_csv(output_covariates, index=True, header=True, sep = ',')
118+
119+
# COMMAND ----------
120+
121+
# MAGIC %md
122+
# MAGIC ##### simulate phenotypes
123+
124+
# COMMAND ----------
125+
126+
binary_phenotypes = np.random.randint(0, 2, (n_samples, n_binary_phenotypes))
127+
binary_phenotypes_pdf = np_array_to_pandas_with_missing(binary_phenotypes, missingness, n_binary_phenotypes, col_prefix='BP')
128+
binary_phenotypes_pdf = add_sample_index_pdf(binary_phenotypes_pdf)
129+
binary_phenotypes_pdf.head(10)
130+
131+
# COMMAND ----------
132+
133+
binary_phenotypes_pdf.to_csv(output_binary_phenotypes, index=True, header=True, sep = ',')
134+
135+
# COMMAND ----------
136+
137+
quantitative_phenotypes = np.random.normal(loc=0.0, scale=1.0, size=(n_samples, n_quantitative_phenotypes))
138+
quantitative_phenotypes_pdf = np_array_to_pandas_with_missing(quantitative_phenotypes, missingness, n_quantitative_phenotypes, col_prefix='QP')
139+
quantitative_phenotypes_pdf = add_sample_index_pdf(quantitative_phenotypes_pdf)
140+
quantitative_phenotypes_pdf.head(5)
141+
142+
# COMMAND ----------
143+
144+
quantitative_phenotypes_pdf.to_csv(output_quantitative_phenotypes, index=True, header=True, sep = ',')
Lines changed: 243 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,243 @@
1+
# Databricks notebook source
2+
# MAGIC %md
3+
# MAGIC
4+
# MAGIC #### Simulate pVCF
5+
# MAGIC
6+
# MAGIC Uses the 1000 genomes to simulate a project-level VCF at a larger scale and write to delta lake
7+
# MAGIC
8+
# MAGIC For now we manually define functions to handle hardy-weinberg allele frequency and multiallelic variants
9+
# MAGIC
10+
# MAGIC _TODO_ use sim1000G to get realistic families to test offset correction
11+
12+
# COMMAND ----------
13+
14+
# MAGIC %md
15+
# MAGIC ##### import libraries
16+
17+
# COMMAND ----------
18+
19+
import glow
20+
spark = glow.register(spark)
21+
import pyspark.sql.functions as fx
22+
from pyspark.sql.types import *
23+
24+
import random
25+
import string
26+
import pandas as pd
27+
import numpy as np
28+
import os
29+
from pathlib import Path
30+
import itertools
31+
from collections import Counter
32+
33+
# COMMAND ----------
34+
35+
# MAGIC %md
36+
# MAGIC ##### Data Generation Constants
37+
38+
# COMMAND ----------
39+
40+
# --- Data-generation constants -------------------------------------------

# Genotype matrix dimensions.
n_samples = 50000
n_variants = 1000

# Seed the RNG so the simulated pVCF is reproducible.
random_seed = 42
random.seed(random_seed)

# Variants below this minor-allele frequency are filtered out.
minor_allele_frequency_cutoff = 0.005

# Glow's whole-genome-regression leave-one-chromosome-out (LOCO) method
# requires at least two chromosomes.
chromosomes = ["21", "22"]

# Heuristic: roughly 20 variants per partition at 500k samples.
n_partitions = 5
51+
52+
# COMMAND ----------
53+
54+
# MAGIC %md
55+
# MAGIC ##### Data Storage Path Constants
56+
57+
# COMMAND ----------
58+
59+
# Resolve the current workspace user and derive their DBFS home directory
# in both URI form ("dbfs:/...", used by Spark/dbutils) and FUSE form
# ("/dbfs/...", used by local-file APIs).
user = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags().apply('user')
dbfs_home_path_str = "dbfs:/home/{}/".format(user)
dbfs_fuse_home_path_str = "/dbfs/home/{}/".format(user)
# Reuse the strings already built above; previously each format string was
# evaluated a second time just to construct the Path objects.
dbfs_home_path = Path(dbfs_home_path_str)
dbfs_fuse_home_path = Path(dbfs_fuse_home_path_str)
64+
65+
# COMMAND ----------
66+
67+
# MAGIC %md
68+
# MAGIC ##### simulate genotypes helper functions
69+
70+
# COMMAND ----------
71+
72+
def hardy_weinberg_principle(minor_allele_frequency):
    """Genotype frequencies [hom-ref, het, hom-alt] for a biallelic site.

    Applies Hardy-Weinberg equilibrium: with major-allele frequency p and
    minor-allele frequency q, the genotypes occur at p^2, 2pq and q^2.
    """
    q = minor_allele_frequency
    p = 1 - q
    return [p * p, 2 * p * q, q * q]
83+
84+
def get_allele_frequencies(minor_allele_frequency):
    """Prepend the reference-allele frequency to a list of ALT frequencies.

    The reference frequency is the probability mass left over by the ALT
    alleles, so the returned list sums to 1.
    """
    alt_total = sum(minor_allele_frequency)
    return [1 - alt_total] + minor_allele_frequency
92+
93+
def get_allele_frequency_combos(allele_frequencies):
    """Joint frequency of every ordered pair of alleles (one per haplotype).

    For k alleles, returns k*k products in itertools.product order, matching
    the call ordering of get_genotype_calls_combinations.
    """
    return [
        first * second
        for first, second in itertools.product(allele_frequencies, repeat=2)
    ]
101+
102+
def get_genotype_calls_combinations(allele_frequencies):
    """Every ordered diploid call [a, b] over the allele indices.

    For k allele frequencies this yields k*k two-element call lists, e.g.
    for k = 6 one combination is [0, 5]. Ordering matches
    get_allele_frequency_combos so calls and weights line up.
    """
    allele_indices = list(range(len(allele_frequencies)))
    return [
        [first, second]
        for first, second in itertools.product(allele_indices, repeat=2)
    ]
112+
113+
def generate_multiallelic_frequencies(minor_allele_frequency, n_samples):
    """Draw n_samples diploid genotype calls for a multiallelic site.

    Calls are sampled with replacement from every ordered pair of allele
    indices, weighted by the product of the two allele frequencies.
    """
    allele_frequencies = get_allele_frequencies(minor_allele_frequency)
    pair_weights = get_allele_frequency_combos(allele_frequencies)
    candidate_calls = get_genotype_calls_combinations(allele_frequencies)
    return random.choices(candidate_calls, k=n_samples, weights=pair_weights)
123+
124+
# Default sample identifiers: "0" .. str(n_samples - 1).
sample_id_list = [str(i) for i in range(0, n_samples)]

def simulate_genotypes(minor_allele_frequency, n_samples, sample_list=sample_id_list):
    """Simulate genotypes in Hardy-Weinberg equilibrium for one variant.

    Parameters
    ----------
    minor_allele_frequency : list[float]
        ALT allele frequencies; the first element drives the biallelic
        case, more than one element triggers the multiallelic path.
    n_samples : int-like
        Number of genotypes to draw (cast to int, since Spark passes it
        through as a Column literal).
    sample_list : list[str]
        Sample identifiers paired with the drawn calls. Bug fix: this
        parameter was previously ignored and the module-level
        sample_id_list was always used; it is now honored, with the same
        default so the existing UDF call is unaffected.

    Returns
    -------
    list[dict]
        One ``{"sampleId": ..., "calls": [...]}`` struct per sample,
        conforming to the Glow variant schema.
    """
    n_samples = int(n_samples)
    frequencies = hardy_weinberg_principle(minor_allele_frequency[0])
    calls = [[0, 0], [0, 1], [1, 1]]
    if len(minor_allele_frequency) > 1:
        genotype_list = generate_multiallelic_frequencies(minor_allele_frequency, n_samples)
    else:
        genotype_list = random.choices(calls, k=n_samples, weights=frequencies)
    genotypes = [{"sampleId": sid, "calls": call}
                 for sid, call in zip(sample_list, genotype_list)]
    return genotypes
142+
143+
simulate_genotypes_udf = udf(simulate_genotypes, ArrayType(StructType([
144+
StructField("sampleId", StringType(), True),
145+
StructField("calls", ArrayType(IntegerType(), True))
146+
])))
147+
148+
# COMMAND ----------
149+
150+
# MAGIC %md
151+
# MAGIC ##### set paths
152+
153+
# COMMAND ----------
154+
155+
vcfs_path = str(dbfs_home_path / "genomics/data/1kg-vcfs-autosomes")
156+
vcfs_path_local = str(dbfs_fuse_home_path / "genomics/data/1kg-vcfs-autosomes")
157+
158+
os.environ['vcfs_path_local'] = vcfs_path_local
159+
output_vcf_delta = str(dbfs_home_path / f'genomics/data/delta/1kg_variants_pvcf.delta')
160+
output_simulated_delta = str(dbfs_home_path / f'genomics/data/delta/simulate_{n_samples}_samples_{n_variants}_variants_pvcf.delta')
161+
vcfs_path, vcfs_path_local, output_vcf_delta, output_simulated_delta
162+
163+
# COMMAND ----------
164+
165+
# MAGIC %md
166+
# MAGIC ##### download 1000G data for chrom 21 and 22
167+
168+
# COMMAND ----------
169+
170+
# MAGIC %sh
171+
# MAGIC declare -a chroms=("21" "22")
172+
# MAGIC
173+
# MAGIC for i in "${chroms[@]}"; do wget ftp://hgdownload.cse.ucsc.edu/gbdb/hg19/1000Genomes/phase3/ALL.chr$i.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz; done
174+
# MAGIC
175+
# MAGIC mkdir -p $vcfs_path_local
176+
# MAGIC
177+
# MAGIC cp ALL*.genotypes.vcf.gz $vcfs_path_local
178+
179+
# COMMAND ----------
180+
181+
# MAGIC %md
182+
# MAGIC ##### read 1000 Genomes VCF
183+
184+
# COMMAND ----------
185+
186+
vcf = spark.read.format("vcf").load(vcfs_path) \
187+
.drop("genotypes") \
188+
.where(fx.col("INFO_AF")[0] >= minor_allele_frequency_cutoff)
189+
total_variants = vcf.count()
190+
fraction = n_variants / total_variants
191+
192+
# COMMAND ----------
193+
194+
# MAGIC %md
195+
# MAGIC ##### checkpoint to delta
196+
197+
# COMMAND ----------
198+
199+
vcf.write.mode("overwrite").format("delta").save(output_vcf_delta)
200+
201+
# COMMAND ----------
202+
203+
vcf = spark.read.format("delta").load(output_vcf_delta)
204+
205+
# COMMAND ----------
206+
207+
display(vcf)
208+
209+
# COMMAND ----------
210+
211+
simulated_vcf = vcf.sample(withReplacement=False, fraction=fraction) \
212+
.repartition(n_partitions) \
213+
.withColumn("genotypes", simulate_genotypes_udf(fx.col("INFO_AF"),
214+
fx.lit(n_samples)))
215+
216+
# COMMAND ----------
217+
218+
simulated_vcf.count()
219+
220+
# COMMAND ----------
221+
222+
display(simulated_vcf.drop("genotypes"))
223+
224+
# COMMAND ----------
225+
226+
simulated_vcf.write.mode("overwrite").format("delta").save(output_simulated_delta)
227+
228+
# COMMAND ----------
229+
230+
# MAGIC %md
231+
# MAGIC ##### check output delta table
232+
233+
# COMMAND ----------
234+
235+
delta_vcf = spark.read.format("delta").load(output_simulated_delta).drop("genotypes")
236+
237+
# COMMAND ----------
238+
239+
display(delta_vcf)
240+
241+
# COMMAND ----------
242+
243+
display(delta_vcf.groupBy("contigName").count())

0 commit comments

Comments
 (0)