Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions db_schema_patches/3_to_4.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Updates schema from version 3 to version 4.
# Adds six columns to the QC table holding counts of positions at given
# read-depth thresholds (depth of 0, and depth of at least 2/5/10/20/100).
#
# MySQL requires each added column to have its own ADD clause (or all the
# columns to be wrapped in one parenthesised list); a bare comma-separated
# list of column definitions after a single ADD is a syntax error.
ALTER TABLE QC
    ADD COLUMN samtools_positions_with_depth_of_0 INT UNSIGNED,
    ADD COLUMN samtools_positions_with_depth_atleast_2 INT UNSIGNED,
    ADD COLUMN samtools_positions_with_depth_atleast_5 INT UNSIGNED,
    ADD COLUMN samtools_positions_with_depth_atleast_10 INT UNSIGNED,
    ADD COLUMN samtools_positions_with_depth_atleast_20 INT UNSIGNED,
    ADD COLUMN samtools_positions_with_depth_atleast_100 INT UNSIGNED;

# The WHERE clause makes the version bump a no-op on any database that is not
# currently at schema version 3.
UPDATE Version SET version=4 WHERE version=3;

16 changes: 15 additions & 1 deletion python/clockwork/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import re
import sys
import tempfile
import shutil
from operator import itemgetter
from clockwork import (
db_connection,
Expand Down Expand Up @@ -558,7 +559,9 @@ def make_qc_jobs_tsv(
output_dir = iso_dir.pipeline_dir(
row["sequence_replicate_number"], "qc", pipeline_version
)
assert not os.path.exists(output_dir)
if os.path.exists(output_dir):
print("Warning:", output_dir, "already exists. Removing.", file=sys.stderr)
shutil.rmtree(output_dir)
try:
os.makedirs(output_dir)
except:
Expand Down Expand Up @@ -1163,6 +1166,10 @@ def _update_qc_stats(self, seqrep_id, pipeline_version, pipeline_root):
samtools_stats = samtools_qc.SamtoolsQc.stats_from_report(
os.path.join(qc_dir, "samtools_qc", "samtools_qc.stats")
)
depth_stats = samtools_qc.SamtoolsQc.depth_stats(
os.path.join(qc_dir, "samtools_qc", "samtools_qc.depths")
)

fastqc_stats = fastqc.Fastqc.gather_all_stats(os.path.join(qc_dir, "fastqc"))
assert len(fastqc_stats) == 2
new_row = {"seqrep_id": seqrep_id, "pipeline_version": pipeline_version}
Expand Down Expand Up @@ -1196,6 +1203,13 @@ def _update_qc_stats(self, seqrep_id, pipeline_version, pipeline_root):
new_row["het_snp_positions"] = het_stats["Positions_used"]
new_row["het_snp_total_snps"] = het_stats["Total_SNPs"]
new_row["het_snp_het_calls"] = het_stats["Het_SNPs"]
new_row["samtools_positions_with_depth_of_0"] = depth_stats["eq_0"]
new_row["samtools_positions_with_depth_atleast_2"] = depth_stats["atleast_2"]
new_row["samtools_positions_with_depth_atleast_5"] = depth_stats["atleast_5"]
new_row["samtools_positions_with_depth_atleast_10"] = depth_stats["atleast_10"]
new_row["samtools_positions_with_depth_atleast_20"] = depth_stats["atleast_20"]
new_row["samtools_positions_with_depth_atleast_100"] = depth_stats["atleast_100"]


self.add_row_to_table("QC", new_row)

Expand Down
10 changes: 8 additions & 2 deletions python/clockwork/db_schema.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
version = 3
version = 4

tables = {
"Isolate": [
Expand Down Expand Up @@ -67,7 +67,13 @@
("het_snp_positions", "integer unsigned"),
("het_snp_total_snps", "integer unsigned"),
("het_snp_het_calls", "integer unsigned"),
],
("samtools_positions_with_depth_of_0", "integer unsigned"),
("samtools_positions_with_depth_atleast_2", "integer unsigned"),
("samtools_positions_with_depth_atleast_5", "integer unsigned"),
("samtools_positions_with_depth_atleast_10", "integer unsigned"),
("samtools_positions_with_depth_atleast_20", "integer unsigned"),
("samtools_positions_with_depth_atleast_100", "integer unsigned"),
],
"Read_counts": [
("seqrep_id", "integer"),
("original_total", "integer unsigned"),
Expand Down
36 changes: 36 additions & 0 deletions python/clockwork/samtools_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,41 @@ def __init__(self, ref_fasta, reads1, reads2, outdir):
def _map_reads(cls, ref_fasta, reads1, reads2, outfile):
read_map.map_reads(ref_fasta, reads1, reads2, outfile, markdup=True)

@classmethod
def _make_depth_stats(cls, samfile, outprefix):
    """Write per-position read depths for `samfile` to `<outprefix>.depths`.

    Runs `samtools depth -a` through `utils.syscall`, redirecting its
    standard output to the depths file.

    Args:
        samfile: path of the alignment file to pass to samtools.
        outprefix: output prefix; ".depths" is appended to form the
            output filename.
    """
    # Local import so this method does not depend on the module's import block.
    import shlex

    depth_file = outprefix + ".depths"
    # The command is a single shell string (it relies on ">" redirection), so
    # both paths are quoted: otherwise a path containing spaces or shell
    # metacharacters would be split or interpreted by the shell.
    # "-a" asks samtools to report every position, including zero depth.
    cmd = " ".join(
        [
            "samtools depth",
            "-a",
            shlex.quote(samfile),
            ">",
            shlex.quote(depth_file),
        ]
    )
    utils.syscall(cmd)

@classmethod
def depth_stats(cls, filename):
    """Count positions by read depth from a tab-separated depths file.

    Each non-blank line must have exactly three tab-separated fields; the
    first two are ignored and the third is parsed as the integer depth.

    Args:
        filename: path of the depths file to read.

    Returns:
        dict with keys "eq_0", "atleast_2", "atleast_5", "atleast_10",
        "atleast_20" and "atleast_100". "eq_0" is the number of positions
        with depth exactly 0. Each "atleast_N" is the number of positions
        with depth >= N, so the counts are cumulative (a position of depth
        110 is counted under every "atleast_*" key). Positions of depth 1
        are not counted under any key.

    Raises:
        ValueError: if a non-blank line does not have exactly three
            tab-separated fields, or its depth is not an integer.
    """
    thresholds = (2, 5, 10, 20, 100)
    depths = {"eq_0": 0}
    for threshold in thresholds:
        depths["atleast_" + str(threshold)] = 0

    with open(filename) as f:
        for line_number, line in enumerate(f, start=1):
            line = line.rstrip()
            # Tolerate blank lines (e.g. a trailing empty line) rather than
            # failing on tuple unpacking.
            if not line:
                continue

            try:
                _, _, depth = line.split("\t")
                depth = int(depth)
            except ValueError as err:
                raise ValueError(
                    "Unexpected format at line {} of depths file {}: {!r}".format(
                        line_number, filename, line
                    )
                ) from err

            if depth == 0:
                depths["eq_0"] += 1
                continue

            for threshold in thresholds:
                # Thresholds are ascending, so stop at the first one not met.
                if depth < threshold:
                    break
                depths["atleast_" + str(threshold)] += 1

    return depths

@classmethod
def _make_stats_and_plots(cls, samfile, ref_fasta, outprefix):
stats_file = outprefix + ".stats"
Expand Down Expand Up @@ -101,6 +136,7 @@ def run(self):
outprefix = os.path.join(self.outdir, "samtools_qc")
samfile = os.path.join(self.outdir, "tmp.sam")
SamtoolsQc._map_reads(self.ref_fasta, self.reads1, self.reads2, samfile)
SamtoolsQc._make_depth_stats(samfile, outprefix)
SamtoolsQc._make_stats_and_plots(samfile, self.ref_fasta, outprefix)
hsc = het_snp_caller.HetSnpCaller(
samfile, self.ref_fasta, os.path.join(self.outdir, "het_snps")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
1 1 0
1 2 1
1 3 2
1 4 3
1 5 5
1 6 7
1 7 8
1 8 0
1 9 11
1 10 110
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified python/clockwork/tests/data/nextflow_qc/Reads/reads.1.1.fq.gz
Binary file not shown.
Binary file modified python/clockwork/tests/data/nextflow_qc/Reads/reads.1.2.fq.gz
Binary file not shown.
70 changes: 38 additions & 32 deletions python/clockwork/tests/data/nextflow_qc/mysql.dump
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
-- MySQL dump 10.13 Distrib 5.7.19, for Linux (x86_64)
-- MySQL dump 10.13 Distrib 5.7.32, for Linux (x86_64)
--
-- Host: localhost Database: test_db
-- ------------------------------------------------------
-- Server version 5.7.19-0ubuntu0.17.04.1
-- Server version 5.7.32-0ubuntu0.18.04.1

/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
Expand Down Expand Up @@ -66,7 +66,7 @@ CREATE TABLE `Pipeline` (

LOCK TABLES `Pipeline` WRITE;
/*!40000 ALTER TABLE `Pipeline` DISABLE KEYS */;
INSERT INTO `Pipeline` VALUES (1,1,NULL,'0.0.1','remove_contam',1,1),(2,2,NULL,'0.0.1','remove_contam',1,1),(3,3,NULL,'0.0.1','remove_contam',1,1),(4,4,NULL,'0.0.1','remove_contam',1,1);
INSERT INTO `Pipeline` VALUES (1,1,NULL,'0.9.0','remove_contam',1,1),(2,2,NULL,'0.9.0','remove_contam',1,1),(3,3,NULL,'0.9.0','remove_contam',1,1),(4,4,NULL,'0.9.0','remove_contam',1,1);
/*!40000 ALTER TABLE `Pipeline` ENABLE KEYS */;
UNLOCK TABLES;

Expand All @@ -84,8 +84,8 @@ CREATE TABLE `QC` (
`fastqc1_adapter_content` text,
`fastqc1_basic_statistics` text,
`fastqc1_kmer_content` text,
`fastqc1_max_sequence_length` int(11) DEFAULT NULL,
`fastqc1_min_sequence_length` int(11) DEFAULT NULL,
`fastqc1_max_sequence_length` int(10) unsigned DEFAULT NULL,
`fastqc1_min_sequence_length` int(10) unsigned DEFAULT NULL,
`fastqc1_overrepresented_sequences` text,
`fastqc1_per_base_n_content` text,
`fastqc1_per_base_sequence_content` text,
Expand All @@ -94,14 +94,14 @@ CREATE TABLE `QC` (
`fastqc1_per_sequence_quality_scores` text,
`fastqc1_sequence_duplication_levels` text,
`fastqc1_sequence_length_distribution` text,
`fastqc1_sequences_flagged_as_poor_quality` int(11) DEFAULT NULL,
`fastqc1_total_sequences` int(11) DEFAULT NULL,
`fastqc1_sequences_flagged_as_poor_quality` int(10) unsigned DEFAULT NULL,
`fastqc1_total_sequences` int(10) unsigned DEFAULT NULL,
`fastqc2_gc` float DEFAULT NULL,
`fastqc2_adapter_content` text,
`fastqc2_basic_statistics` text,
`fastqc2_kmer_content` text,
`fastqc2_max_sequence_length` int(11) DEFAULT NULL,
`fastqc2_min_sequence_length` int(11) DEFAULT NULL,
`fastqc2_max_sequence_length` int(10) unsigned DEFAULT NULL,
`fastqc2_min_sequence_length` int(10) unsigned DEFAULT NULL,
`fastqc2_overrepresented_sequences` text,
`fastqc2_per_base_n_content` text,
`fastqc2_per_base_sequence_content` text,
Expand All @@ -110,23 +110,29 @@ CREATE TABLE `QC` (
`fastqc2_per_sequence_quality_scores` text,
`fastqc2_sequence_duplication_levels` text,
`fastqc2_sequence_length_distribution` text,
`fastqc2_sequences_flagged_as_poor_quality` int(11) DEFAULT NULL,
`fastqc2_total_sequences` int(11) DEFAULT NULL,
`samtools_raw_total_sequences` int(11) DEFAULT NULL,
`samtools_reads_mapped` int(11) DEFAULT NULL,
`samtools_reads_duplicated` int(11) DEFAULT NULL,
`samtools_bases_mapped_cigar` int(11) DEFAULT NULL,
`samtools_bases_trimmed` int(11) DEFAULT NULL,
`fastqc2_sequences_flagged_as_poor_quality` int(10) unsigned DEFAULT NULL,
`fastqc2_total_sequences` int(10) unsigned DEFAULT NULL,
`samtools_raw_total_sequences` int(10) unsigned DEFAULT NULL,
`samtools_reads_mapped` int(10) unsigned DEFAULT NULL,
`samtools_reads_duplicated` int(10) unsigned DEFAULT NULL,
`samtools_bases_mapped_cigar` bigint(20) unsigned DEFAULT NULL,
`samtools_bases_trimmed` bigint(20) unsigned DEFAULT NULL,
`samtools_error_rate` float DEFAULT NULL,
`samtools_average_quality` float DEFAULT NULL,
`samtools_insert_size_average` float DEFAULT NULL,
`samtools_insert_size_standard_deviation` float DEFAULT NULL,
`samtools_inward_oriented_pairs` int(11) DEFAULT NULL,
`samtools_outward_oriented_pairs` int(11) DEFAULT NULL,
`samtools_pairs_with_other_orientation` int(11) DEFAULT NULL,
`het_snp_positions` int(11) DEFAULT NULL,
`het_snp_total_snps` int(11) DEFAULT NULL,
`het_snp_het_calls` int(11) DEFAULT NULL
`samtools_inward_oriented_pairs` int(10) unsigned DEFAULT NULL,
`samtools_outward_oriented_pairs` int(10) unsigned DEFAULT NULL,
`samtools_pairs_with_other_orientation` int(10) unsigned DEFAULT NULL,
`het_snp_positions` int(10) unsigned DEFAULT NULL,
`het_snp_total_snps` int(10) unsigned DEFAULT NULL,
`het_snp_het_calls` int(10) unsigned DEFAULT NULL,
`samtools_positions_with_depth_of_0` int(10) unsigned DEFAULT NULL,
`samtools_positions_with_depth_atleast_2` int(10) unsigned DEFAULT NULL,
`samtools_positions_with_depth_atleast_5` int(10) unsigned DEFAULT NULL,
`samtools_positions_with_depth_atleast_10` int(10) unsigned DEFAULT NULL,
`samtools_positions_with_depth_atleast_20` int(10) unsigned DEFAULT NULL,
`samtools_positions_with_depth_atleast_100` int(10) unsigned DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
/*!40101 SET character_set_client = @saved_cs_client */;

Expand All @@ -148,11 +154,11 @@ DROP TABLE IF EXISTS `Read_counts`;
/*!40101 SET character_set_client = utf8 */;
CREATE TABLE `Read_counts` (
`seqrep_id` int(11) DEFAULT NULL,
`original_total` int(11) DEFAULT NULL,
`contamination` int(11) DEFAULT NULL,
`not_contamination` int(11) DEFAULT NULL,
`unmapped` int(11) DEFAULT NULL,
`total_after_remove_contam` int(11) DEFAULT NULL
`original_total` int(10) unsigned DEFAULT NULL,
`contamination` int(10) unsigned DEFAULT NULL,
`not_contamination` int(10) unsigned DEFAULT NULL,
`unmapped` int(10) unsigned DEFAULT NULL,
`total_after_remove_contam` int(10) unsigned DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
/*!40101 SET character_set_client = @saved_cs_client */;

Expand All @@ -162,7 +168,7 @@ CREATE TABLE `Read_counts` (

LOCK TABLES `Read_counts` WRITE;
/*!40000 ALTER TABLE `Read_counts` DISABLE KEYS */;
INSERT INTO `Read_counts` VALUES (1,156,12,132,12,144),(2,156,12,132,12,144),(3,156,12,132,12,144),(4,156,12,132,12,144);
INSERT INTO `Read_counts` VALUES (2,156,12,132,12,144),(4,156,12,132,12,144),(1,156,12,132,12,144),(3,156,12,132,12,144);
/*!40000 ALTER TABLE `Read_counts` ENABLE KEYS */;
UNLOCK TABLES;

Expand Down Expand Up @@ -230,7 +236,7 @@ DROP TABLE IF EXISTS `Seqrep`;
CREATE TABLE `Seqrep` (
`seqrep_id` int(11) NOT NULL AUTO_INCREMENT,
`isolate_id` int(11) DEFAULT NULL,
`sequence_replicate_number` int(11) DEFAULT NULL,
`sequence_replicate_number` bigint(20) unsigned DEFAULT NULL,
`original_reads_file_1_md5` text,
`original_reads_file_2_md5` text,
`remove_contam_reads_file_1_md5` text,
Expand All @@ -252,7 +258,7 @@ CREATE TABLE `Seqrep` (

LOCK TABLES `Seqrep` WRITE;
/*!40000 ALTER TABLE `Seqrep` DISABLE KEYS */;
INSERT INTO `Seqrep` VALUES (1,1,43,'a3b22e537be7e1fc03c29df820f65b06','1676bdb60c23159fbaccc6975aed86c4','d9ea5077779c50c82bb9852fe5582cc0','5a1ef78ffdf7a4071a2b91d7c7dde8d5',0,1,'2017-12-25',0,'Illumina HiSeq 2000',NULL,0),(2,2,45,'b08620e2b9231ee66ae56f926d97f097','808911bb28af5c5674bda3b7c71e3a75','0da67fe9d1f3b4313304c87c4dbc635f','4f20723c34220b96995550b0e627cc1f',0,1,'2017-12-25',0,'Illumina HiSeq 2000',NULL,0),(3,3,47,'9981927389818aa1b57d6285ff490d23','5ffb53b040c7491807f963c3a94ad8fb','8e59250c494f103450c55f373a28ed96','e0c29c85b29af66dd810c5b0e741d0b7',0,1,'2017-12-25',0,'Illumina HiSeq 2000',NULL,0),(4,4,49,'9306e04decb6b721daf0ea765d860c3a','fb0f64bf6f7ff2b05d1fb521634b41b4','164f99ffb7f4416b2c60d7491019d65e','fd7a1fdfe55bcc47ffc5cfa0a655b52f',0,1,'2017-12-25',0,'Illumina HiSeq 2000',NULL,0);
INSERT INTO `Seqrep` VALUES (1,1,43,'25c461ececa0271428a283c534497417','ece4028afddf2365477e130d84748c56','2096a6996aef36c5c75c0a9dba3ed3a4','e77815e19d1f81db41888900039cb1b1',0,1,'2017-12-25',0,'Illumina HiSeq 2000',NULL,0),(2,2,45,'e52a403846ab49eeb5cb9573822e1d92','3eef5d5e74295f7951526e23c7de656a','a4b21345c9ee84baeb4ce3b94aa478ca','3dc2b9875ddbf95de24990aa916492e4',0,1,'2017-12-25',0,'Illumina HiSeq 2000',NULL,0),(3,3,47,'bba6cc17819a133c18e8f0c0e099a55e','e72c48a02f49ee98a45682b6e4dfac56','f35b1d902a4fcfd475b4e3e325942683','a00056bcca5a6c99a4448a2354f6245c',0,1,'2017-12-25',0,'Illumina HiSeq 2000',NULL,0),(4,4,49,'866e1700d3c7e0d8c37edf8bb8399ffe','58ea111df4310f356460023a5483229f','e615dc85493f66a33594f8ff22292aa7','a69ad444c1b84271101f460c63e43e2b',0,1,'2017-12-25',0,'Illumina HiSeq 2000',NULL,0);
/*!40000 ALTER TABLE `Seqrep` ENABLE KEYS */;
UNLOCK TABLES;

Expand All @@ -274,7 +280,7 @@ CREATE TABLE `Version` (

LOCK TABLES `Version` WRITE;
/*!40000 ALTER TABLE `Version` DISABLE KEYS */;
INSERT INTO `Version` VALUES (1);
INSERT INTO `Version` VALUES (4);
/*!40000 ALTER TABLE `Version` ENABLE KEYS */;
UNLOCK TABLES;
/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
Expand All @@ -287,4 +293,4 @@ UNLOCK TABLES;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;

-- Dump completed on 2017-10-13 16:08:54
-- Dump completed on 2020-11-13 14:09:42
Loading