Handle samples with multiple sequencing units when generating the sample annotation

sreichl · sreichl · commit df795ef403ff · 2025-07-07T16:01:26.000+02:00
diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk
@@ -51,7 +51,7 @@ rule sample_annotation:
     run:
         multiqc_df = pd.read_csv(os.path.join(input.multiqc_stats,"multiqc_general_stats.txt"), delimiter='\t', index_col=0).loc[list(samples.keys()),:]
         # merge by sample names (index) and drop redundant or unnecessary columns
-        annot_df = pd.merge(annot, multiqc_df, left_index=True, right_index=True, how='inner').drop(['bam_file', 'sample_name'], axis=1)
+        annot_df = pd.merge(annot_samples, multiqc_df, left_index=True, right_index=True, how='inner').drop(['bam_file', 'sample_name'], axis=1)
         # make column names R compatible
         annot_df.columns = (
                 annot_df.columns
diff --git a/workflow/scripts/plot_sample_annotation.R b/workflow/scripts/plot_sample_annotation.R
@@ -22,7 +22,9 @@ sample_annotation_html_path <- snakemake@output[["sample_annotation_html"]]
 
 #### load & prepare data ####
 # load data
-sample_annotation <- data.frame(fread(file.path(sample_annotation_path), header=TRUE), row.names=1, check.names = FALSE)
+sample_annotation <- data.table::fread(file.path(sample_annotation_path), header = TRUE)
+sample_annotation <- data.frame(sample_annotation[!duplicated(sample_annotation[[1]]), ], row.names = 1, check.names = FALSE)
+
 anno <- data.frame(fread(file.path(sample_annotation_w_QC_path), header=TRUE), row.names=1)
 
 # determine QC (pipeline provided) columns