diff --git a/examples/gwas.gwas b/examples/gwas.gwas new file mode 100644 index 00000000..ce7eb4c6 --- /dev/null +++ b/examples/gwas.gwas @@ -0,0 +1,13 @@ +CHR BP SNP P BETA SE MAF +X 3001200 rs100001 2.5e-09 0.62 0.09 0.21 +X 3003450 rs100002 7.8e-08 0.55 0.10 0.23 +X 3007890 rs100003 1.1e-06 0.41 0.08 0.27 +X 3009990 rs100004 4.3e-05 0.33 0.08 0.30 +X 3032100 rs100005 9.2e-10 0.71 0.11 0.18 +X 3045600 rs100006 3.4e-08 0.59 0.10 0.20 +X 3060200 rs100007 6.7e-07 0.48 0.09 0.25 +X 3078500 rs100008 1.9e-05 0.36 0.08 0.29 +X 3100400 rs100009 8.8e-06 -0.40 0.09 0.26 +X 3150000 rs100010 2.2e-07 -0.52 0.10 0.22 +X 3195000 rs100011 1.5e-09 -0.68 0.11 0.19 +X 3199800 rs100012 5.0e-08 -0.60 0.10 0.21 \ No newline at end of file diff --git a/pygenometracks/readGwas.py b/pygenometracks/readGwas.py new file mode 100644 index 00000000..4a2089fd --- /dev/null +++ b/pygenometracks/readGwas.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- +import sys +import collections +from .utilities import to_string, InputError + + +class ReadGwas(object): + """ + Reads a GWAS file. The expected fields are: + chromosome, position, name, and pvalue. + + Example: + gwas = ReadGwas(open("file.gwas", 'r')) + for record in gwas: + print(record.chromosome, record.position, record.pvalue) + """ + + def __init__(self, file_handle, has_header=False): + """ + :param file_handle: file handle + """ + self.file_handle = file_handle + self.line_number = 0 + + # Define the fields for GWAS + self.fields = ['chromosome', 'position', 'name', 'pvalue'] + self.GwasRecord = collections.namedtuple('GwasRecord', self.fields) + + # Skip the header line if present + if has_header: + header = next(self.file_handle) + self.line_number += 1 + + def __iter__(self): + return self + + def get_no_comment_line(self): + """ + Skips comment lines starting with '#' or empty lines. + :return: a valid line + """ + line = next(self.file_handle) + line = to_string(line) + if line.startswith("#") or line.strip() == '': + line = self.get_no_comment_line() + + self.line_number += 1 + return line + + def __next__(self): + """ + :return: GwasRecord object + """ + line = self.get_no_comment_line() + return self.get_gwas_record(line) + + def get_gwas_record(self, gwas_line): + """ + Processes each line from a GWAS file and returns a namedtuple object. + + :param gwas_line: a single line from the GWAS file + :return: GwasRecord object + """ + line_data = gwas_line.strip() + line_data = to_string(line_data) + line_data = line_data.split("\t") + + if len(line_data) < 4: + raise InputError(f"Line {self.line_number} does not have 4 fields: {gwas_line}." + f"We expect at least 4 field, corresponding to: chromosome, position, name, pvalue.") + + try: + chromosome = line_data[0] + position = int(line_data[1]) + name = line_data[2] + pvalue = float(line_data[3]) + except ValueError as e: + raise InputError(f"Error parsing line {self.line_number}: {gwas_line}\n{e}") + + return self.GwasRecord(chromosome, position, name, pvalue) diff --git a/pygenometracks/tests/test_data/gwas.ini b/pygenometracks/tests/test_data/gwas.ini new file mode 100644 index 00000000..f0aa169e --- /dev/null +++ b/pygenometracks/tests/test_data/gwas.ini @@ -0,0 +1,16 @@ + +[gwas] +file = gwas_1.gwas +height = 4 +title = test_1 + +[spacer] + +[gwas_2] +file = gwas_2.gwas +file_has_header = True +height = 2 +title = test_2 +color = #50E3C2 + +[x-axis] diff --git a/pygenometracks/tests/test_data/gwas_1.gwas b/pygenometracks/tests/test_data/gwas_1.gwas new file mode 100644 index 00000000..dae8e6b2 --- /dev/null +++ b/pygenometracks/tests/test_data/gwas_1.gwas @@ -0,0 +1,12 @@ +X 3002145 rs7823451 4.7e-08 +X 3005892 rs6610293 1.3e-06 +X 3008734 rs9034128 9.2e-05 +X 3031022 rs4420981 6.1e-10 +X 3042567 rs2093847 2.8e-07 +X 3056789 rs7734012 5.5e-09 +X 3072341 rs1182736 3.9e-06 +X 3079880 rs5502918 7.4e-05 +X 3112450 rs3847291 1.8e-08 +X 3145670 rs6029184 9.9e-07 +X 3180230 rs2938471 3.2e-10 +X 3197650 rs8840123 6.3e-08 \ No newline at end of file diff --git a/pygenometracks/tests/test_data/gwas_2.gwas b/pygenometracks/tests/test_data/gwas_2.gwas new file mode 100644 index 00000000..ce7eb4c6 --- /dev/null +++ b/pygenometracks/tests/test_data/gwas_2.gwas @@ -0,0 +1,13 @@ +CHR BP SNP P BETA SE MAF +X 3001200 rs100001 2.5e-09 0.62 0.09 0.21 +X 3003450 rs100002 7.8e-08 0.55 0.10 0.23 +X 3007890 rs100003 1.1e-06 0.41 0.08 0.27 +X 3009990 rs100004 4.3e-05 0.33 0.08 0.30 +X 3032100 rs100005 9.2e-10 0.71 0.11 0.18 +X 3045600 rs100006 3.4e-08 0.59 0.10 0.20 +X 3060200 rs100007 6.7e-07 0.48 0.09 0.25 +X 3078500 rs100008 1.9e-05 0.36 0.08 0.29 +X 3100400 rs100009 8.8e-06 -0.40 0.09 0.26 +X 3150000 rs100010 2.2e-07 -0.52 0.10 0.22 +X 3195000 rs100011 1.5e-09 -0.68 0.11 0.19 +X 3199800 rs100012 5.0e-08 -0.60 0.10 0.21 \ No newline at end of file diff --git a/pygenometracks/tests/test_data/master_gwas.png b/pygenometracks/tests/test_data/master_gwas.png new file mode 100644 index 00000000..99eafe05 Binary files /dev/null and b/pygenometracks/tests/test_data/master_gwas.png differ diff --git a/pygenometracks/tests/test_gwasTrack.py b/pygenometracks/tests/test_gwasTrack.py new file mode 100644 index 00000000..336201ef --- /dev/null +++ b/pygenometracks/tests/test_gwasTrack.py @@ -0,0 +1,50 @@ +import matplotlib as mpl +from matplotlib.testing.compare import compare_images +from tempfile import NamedTemporaryFile +import os.path +import pygenometracks.plotTracks + +mpl.use('agg') + +ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), + "test_data") + +tracks = """ +[gwas] +file = gwas_1.gwas +height = 4 +title = test_1 + +[spacer] + +[gwas_2] +file = gwas_2.gwas +file_has_header = True +height = 2 +title = test_2 +color = #50E3C2 + +[x-axis] +""" + +with open(os.path.join(ROOT, "gwas.ini"), 'w') as fh: + fh.write(tracks) + +tolerance = 13 # default matplotlib pixed difference tolerance + + +def test_gwas_track(): + outfile = NamedTemporaryFile(suffix='.png', prefix='gwas_test_', + delete=False) + ini_file = os.path.join(ROOT, "gwas.ini") + region = "X:3000000-3200000" + expected_file = os.path.join(ROOT, 'master_gwas.png') + args = f"--tracks {ini_file} --region {region} " \ + "--trackLabelFraction 0.2 --dpi 130 " \ + f"--outFileName {outfile.name}".split() + pygenometracks.plotTracks.main(args) + res = compare_images(expected_file, + outfile.name, tolerance) + assert res is None, res + + os.remove(outfile.name) diff --git a/pygenometracks/tracks/GwasTrack.py b/pygenometracks/tracks/GwasTrack.py new file mode 100644 index 00000000..06834b1c --- /dev/null +++ b/pygenometracks/tracks/GwasTrack.py @@ -0,0 +1,76 @@ +from .GenomeTrack import GenomeTrack +import numpy as np +from .. readGwas import ReadGwas + +DEFAULT_GWAS_COLOR = '#ff7f00' + + +class GwasTrack(GenomeTrack): + SUPPORTED_ENDINGS = ['.gwas', '.linear', '.logistic', '.assoc', '.qassoc'] # this is used by make_tracks_file to guess the type of track based on file name + TRACK_TYPE = 'gwas' + OPTIONS_TXT = GenomeTrack.OPTIONS_TXT + f""" +# Title of the track. Usually displayed to the right as a label +title = +# Height of the track +height = +# File containing the data. We expect an IGV .gwas format file with the columns: CHR, BP, SNP and P. Optionally, extra +# annotation columns can be added. +file = +# Y label text +ylabel = +# Fontsize of the labels +fontsize = +# Color +color = +# Optional. If not given is guessed from the file ending. +file_type = {TRACK_TYPE} + """ + + DEFAULTS_PROPERTIES = {'fontsize': 12, + 'orientation': None, + 'color': DEFAULT_GWAS_COLOR, + 'border_color': 'black', + 'labels': True, + 'line_width': 0.5, + 'max_labels': 60, + 'max_value': 1, + 'min_value': 0, + 'fontstyle': 'normal', + 'y_axis_max_val': None, + 'id_fontsize': 12, + 'marker_size': 45, + 'file_has_header': False} + + NECESSARY_PROPERTIES = ['file'] + SYNONYMOUS_PROPERTIES = {} + POSSIBLE_PROPERTIES = {} + BOOLEAN_PROPERTIES = ['file_has_header'] + STRING_PROPERTIES = ['title', 'file_type', 'file', 'color'] + FLOAT_PROPERTIES = {'height': [0, np.inf], 'fontsize': [0, np.inf], 'id_fontsize': [0, np.inf], + 'marker_size': [0, np.inf], 'y_axis_max_val': [0, np.inf]} + INTEGER_PROPERTIES = {} + + def plot(self, ax, chrom, region_start, region_end): + """ + Plot a scatter plot for the GWAS data. + The p-values are transformed as -log10(pvalue), so the y-axis will show the exponents of the p-values. + + :param ax: matplotlib axis + :param chrom: chromosome name + :param region_start: start position of the region + :param region_end: end position of the region + :return: None + """ + gwas_reader = ReadGwas(open(self.properties['file'], 'r'), has_header=self.properties['file_has_header']) + + # Fill in the position and pvalues lists with data from the GWAS file + position = [] + pvalues = [] + for record in gwas_reader: + if record.chromosome == chrom and region_start <= record.position <= region_end: + position.append(record.position) + pvalues.append(-np.log10(record.pvalue) if record.pvalue > 0 else 0) # Notice the -log10 transformation + + # Plot the scatterplot + ax.scatter(position, pvalues, s=self.properties['marker_size'], color=self.properties['color'], marker='o', + edgecolors='black', linewidths=.66)