Skip to content

Commit be635c7

Browse files
update quality control CI notebooks to checkpoint each step to delta (#455)
Signed-off-by: William Brandler <William.Brandler@databricks.com>
1 parent aa96a6c commit be635c7

File tree

4 files changed

+117
-39
lines changed

4 files changed

+117
-39
lines changed

docs/source/_static/notebooks/0_setup_constants_glow.html

Lines changed: 7 additions & 7 deletions
Large diffs are not rendered by default.

docs/source/_static/notebooks/tertiary/1_quality_control.html

Lines changed: 8 additions & 8 deletions
Large diffs are not rendered by default.

docs/source/_static/zzz_GENERATED_NOTEBOOK_SOURCE/0_setup_constants_glow.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,10 @@
198198

199199
# COMMAND ----------
200200

201+
reference_genome_path = '/dbfs/databricks-datasets/genomics/grch38/data/GRCh38_full_analysis_set_plus_decoy_hla.fa'
202+
output_delta_split_multiallelics = simulate_prefix + "_variants_pvcf_glow_qc_split_multiallelics.delta"
203+
output_delta_split_multiallelics_normalize = simulate_prefix + "_variants_pvcf_glow_qc_normalize_indels.delta"
204+
201205
output_delta_glow_qc_transformers = simulate_prefix + "_variants_pvcf_glow_qc_transformers.delta"
202206
output_delta_glow_qc_variants = simulate_prefix + "_variants_pvcf_glow_qc_variants.delta"
203207
output_delta_transformed = simulate_prefix + "_variants_pvcf_transformed.delta"

docs/source/_static/zzz_GENERATED_NOTEBOOK_SOURCE/tertiary/1_quality_control.py

Lines changed: 98 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
# MAGIC
66
# MAGIC By running glow transform functions `split_multiallelics`, `mean_substitute`, and `genotype_states`
77
# MAGIC
8+
# MAGIC Important: please checkpoint to parquet/delta after each step in this process
9+
# MAGIC
810
# MAGIC Then filter,
911
# MAGIC
1012
# MAGIC 1. monomorphic variants using `array_distinct`
@@ -13,34 +15,40 @@
1315

1416
# COMMAND ----------
1517

16-
# MAGIC %md
17-
# MAGIC ##### adjust spark confs
18-
# MAGIC
19-
# MAGIC see [split-multiallelics](https://glow.readthedocs.io/en/latest/etl/variant-splitter.html#split-multiallelics) docs
18+
# MAGIC %md ##### setup constants
2019

2120
# COMMAND ----------
2221

23-
spark.conf.set("spark.sql.codegen.wholeStage", False)
22+
# MAGIC %run ../0_setup_constants_glow
2423

2524
# COMMAND ----------
2625

27-
# MAGIC %md ##### setup constants
26+
# MAGIC %run ../2_setup_metadata
2827

2928
# COMMAND ----------
3029

31-
# MAGIC %run ../0_setup_constants_glow
30+
# MAGIC %md
31+
# MAGIC ##### adjust spark confs
32+
# MAGIC
33+
# MAGIC see [split-multiallelics](https://glow.readthedocs.io/en/latest/etl/variant-splitter.html#split-multiallelics) docs
3234

3335
# COMMAND ----------
3436

35-
# MAGIC %run ../2_setup_metadata
37+
spark.conf.set("spark.sql.codegen.wholeStage", False)
38+
39+
# COMMAND ----------
40+
41+
# MAGIC %md
42+
# MAGIC ##### Define QC steps
3643

3744
# COMMAND ----------
3845

3946
method = 'quality_control'
40-
step1 = 'glow_qc_transformers'
41-
step2 = 'call_summary_stats'
42-
step3 = 'variant_filter'
43-
step4 = 'sample_filter'
47+
step1 = 'split_multiallelics'
48+
step2 = 'left_normalize_indels'
49+
step3 = 'mean_substitute'
50+
step4 = 'call_summary_stats'
51+
step5 = 'variant_filter'
4452
library = 'glow'
4553
datetime = datetime.now(pytz.timezone('US/Pacific'))
4654

@@ -87,19 +95,85 @@ def calculate_pval_bonferroni_cutoff(df, cutoff=0.05):
8795
# COMMAND ----------
8896

8997
# MAGIC %md
90-
# MAGIC ##### prepare simulated delta table for GWAS using glow transformers
98+
# MAGIC ##### split simulated data
99+
# MAGIC
100+
# MAGIC 1. biallelic SNPs
101+
# MAGIC 2. multiallelic variants
102+
# MAGIC 3. indels
91103

92104
# COMMAND ----------
93105

94-
start_time_step1 = time.time()
95106
delta_vcf = spark.read.format("delta").load(output_delta)
96-
delta_gwas_vcf = (glow.transform('split_multiallelics', delta_vcf). \
97-
withColumn('values', glow.mean_substitute(glow.genotype_states('genotypes'))). \
107+
108+
# COMMAND ----------
109+
110+
display(delta_vcf.drop("genotypes"))
111+
112+
# COMMAND ----------
113+
114+
# MAGIC %md
115+
# MAGIC ##### split multiallelics
116+
# MAGIC
117+
# MAGIC write out biallelics and split multiallelics
118+
119+
# COMMAND ----------
120+
121+
start_time = time.time()
122+
multiallelic_df = delta_vcf.where(fx.size(fx.col("alternateAlleles")) > 1)
123+
multiallelic_df = glow.transform('split_multiallelics', multiallelic_df)
124+
biallelic_df = delta_vcf.where(fx.size(fx.col("alternateAlleles")) == 1)
125+
126+
multiallelic_df.write.mode("overwrite").format("delta").save(output_delta_split_multiallelics)
127+
biallelic_df.write.mode("append").format("delta").save(output_delta_split_multiallelics)
128+
129+
end_time = time.time()
130+
log_metadata(datetime, n_samples, n_variants, 0, 0, 'etl', step1, library, spark_version, node_type_id, n_workers, start_time, end_time, run_metadata_delta_path)
131+
132+
# COMMAND ----------
133+
134+
# MAGIC %md
135+
# MAGIC ##### extract indels and left-normalize
136+
137+
# COMMAND ----------
138+
139+
start_time = time.time()
140+
split_multiallelic_df = spark.read.format("delta").load(output_delta_split_multiallelics)
141+
indels_df = split_multiallelic_df.where((fx.length("referenceAllele") > 1) | (fx.length(fx.col("alternateAlleles")[0]) > 1))
142+
snps_df = split_multiallelic_df.where((fx.length("referenceAllele") == 1) & (fx.length(fx.col("alternateAlleles")[0]) == 1))
143+
144+
normalized_variants_df = glow.transform(
145+
"normalize_variants",
146+
indels_df,
147+
reference_genome_path=reference_genome_path
148+
)
149+
150+
num_variants_changed = normalized_variants_df.where(fx.col("normalizationStatus.changed") == True).count()
151+
152+
print("number of variants left normalized = " + str(num_variants_changed))
153+
154+
snps_df.write.mode("overwrite").format("delta").save(output_delta_split_multiallelics_normalize)
155+
normalized_variants_df.drop("normalizationStatus"). \
156+
write.mode("append").format("delta"). \
157+
save(output_delta_split_multiallelics_normalize)
158+
159+
end_time = time.time()
160+
log_metadata(datetime, n_samples, n_variants, 0, 0, 'etl', step2, library, spark_version, node_type_id, n_workers, start_time, end_time, run_metadata_delta_path)
161+
162+
# COMMAND ----------
163+
164+
# MAGIC %md
165+
# MAGIC ##### prepare simulated delta table for GWAS using glow transformers
166+
167+
# COMMAND ----------
168+
169+
start_time = time.time()
170+
delta_vcf = spark.read.format("delta").load(output_delta_split_multiallelics_normalize)
171+
delta_gwas_vcf = (delta_vcf.withColumn('values', glow.mean_substitute(glow.genotype_states('genotypes'))). \
98172
filter(fx.size(fx.array_distinct('values')) > 1)
99173
)
100174
delta_gwas_vcf.write.mode("overwrite").format("delta").save(output_delta_glow_qc_transformers)
101-
end_time_step1 = time.time()
102-
log_metadata(datetime, n_samples, n_variants, 0, 0, 'etl', step1, library, spark_version, node_type_id, n_workers, start_time_step1, end_time_step1, run_metadata_delta_path)
175+
end_time = time.time()
176+
log_metadata(datetime, n_samples, n_variants, 0, 0, 'etl', step3, library, spark_version, node_type_id, n_workers, start_time, end_time, run_metadata_delta_path)
103177

104178
# COMMAND ----------
105179

@@ -108,7 +182,7 @@ def calculate_pval_bonferroni_cutoff(df, cutoff=0.05):
108182

109183
# COMMAND ----------
110184

111-
start_time_step2 = time.time()
185+
start_time = time.time()
112186
delta_gwas_vcf = spark.read.format("delta").load(output_delta_glow_qc_transformers)
113187

114188
summary_stats_df = delta_gwas_vcf.select(
@@ -119,8 +193,8 @@ def calculate_pval_bonferroni_cutoff(df, cutoff=0.05):
119193
withColumn("log10pValueHwe", fx.when(fx.col("pValueHwe") == 0, 26).otherwise(-fx.log10(fx.col("pValueHwe"))))
120194
summary_stats_df.drop("genotypes").write.mode("overwrite").format("delta").save(output_delta_glow_qc_variants)
121195

122-
end_time_step2 = time.time()
123-
log_metadata(datetime, n_samples, n_variants, 0, 0, method, step2, library, spark_version, node_type_id, n_workers, start_time_step2, end_time_step2, run_metadata_delta_path)
196+
end_time = time.time()
197+
log_metadata(datetime, n_samples, n_variants, 0, 0, method, step4, library, spark_version, node_type_id, n_workers, start_time, end_time, run_metadata_delta_path)
124198

125199
# COMMAND ----------
126200

@@ -145,7 +219,7 @@ def calculate_pval_bonferroni_cutoff(df, cutoff=0.05):
145219

146220
# COMMAND ----------
147221

148-
start_time_step3 = time.time()
222+
start_time = time.time()
149223
variant_filter_df = spark.read.format("delta").load(output_delta_glow_qc_variants)
150224

151225
variant_filter_df = summary_stats_df.where((fx.col("alleleFrequencies").getItem(0) >= allele_freq_cutoff) &
@@ -155,8 +229,8 @@ def calculate_pval_bonferroni_cutoff(df, cutoff=0.05):
155229

156230
variant_filter_df.write.option("overwriteSchema", "true").mode("overwrite").format("delta").save(output_delta_transformed)
157231

158-
end_time_step3 = time.time()
159-
log_metadata(datetime, n_samples, n_variants, 0, 0, method, step3, library, spark_version, node_type_id, n_workers, start_time_step3, end_time_step3, run_metadata_delta_path)
232+
end_time = time.time()
233+
log_metadata(datetime, n_samples, n_variants, 0, 0, method, step5, library, spark_version, node_type_id, n_workers, start_time, end_time, run_metadata_delta_path)
160234

161235
# COMMAND ----------
162236

0 commit comments

Comments
 (0)