readTags_Stacks accepts gzipped files

lvclark · lvclark · commit 37c498d28150 · 2017-09-08T16:55:30.000-05:00
diff --git a/README.md b/README.md
@@ -149,7 +149,7 @@ Mrker4050,2,AGTAGGGAAAGGCCGGCAAGGCAACTAAA,
 ```
 
 ### Stacks catalog
-The program `cstacks` from the [Stacks](http://catchenlab.life.illinois.edu/stacks/) software generates three files in the format `batch_X.catalog.tags.tsv`, `batch_X.catalog.snps.tsv`, and `batch_X.catalog.alleles.tsv`.  TagDigger can read all three of these files and extract tag sequences.  Marker names will be numbers identical to the Catalog IDs in Stacks.  There is an option to ignore all non-biallelic markers.
+The program `cstacks` from the [Stacks](http://catchenlab.life.illinois.edu/stacks/) software generates three files in the format `batch_X.catalog.tags.tsv`, `batch_X.catalog.snps.tsv`, and `batch_X.catalog.alleles.tsv`.  TagDigger can read all three of these files and extract tag sequences.  Marker names will be numbers identical to the Catalog IDs in Stacks.  There is an option to ignore all non-biallelic markers.  Each of the three files is checked individually: if its name ends with `.gz`, TagDigger assumes the file is gzip-compressed; otherwise, it assumes the file is uncompressed.
 
 ### SAM files from TASSEL-GBSv2
 [TASSEL 5](http://www.maizegenetics.net/#!tassel/c17q9) includes as part of its pipeline a [SAM](https://samtools.github.io/hts-specs/SAMv1.pdf) file produced by [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) or [BWA](http://bio-bwa.sourceforge.net/).  TagDigger can read tag sequences from this file and generate SNP names in the same format as the TASSEL GBS version 2 pipeline.  Since TASSEL can output multiple SNPs from the same tag, TagDigger generates a different set of names for the tags (in the format `chromosome-position-strand_allele`) but can output a CSV file matching the TASSEL SNP names to the TagDigger marker names.  If supplying a list of markers to retain, the user should put them in the format of TASSEL SNP names (e.g. `S01_1026`).  There is also an option to ignore all non-biallelic markers.
diff --git a/tagdigger_fun.py b/tagdigger_fun.py
@@ -622,23 +622,23 @@ def readTags_Stacks(tagsfile, snpsfile, allelesfile, toKeep = None, binaryOnly=F
     '''Read tags from the catalog format produced by Stacks.'''
     try:
         alltags = dict() # keys are locus numbers, values are sequences
-        with open(tagsfile, mode = 'r') as mycon:
+        with gzip.open(tagsfile, mode = 'rt') if tagsfile.endswith('.gz') else open(tagsfile, mode = 'r') as mycon:
             tr = csv.reader(mycon, delimiter='\t')
             for row in tr:
                 if row[0].startswith("#"):
                     continue  # skip comment line
                 if toKeep == None or row[2] in toKeep:
                     alltags[row[2]] = row[9]
         alleles = list() # tuples, where first item is locus number and second is haplotype
-        with open(allelesfile, mode = 'r') as mycon:
+        with gzip.open(allelesfile, mode = 'rt') if allelesfile.endswith('.gz') else open(allelesfile, mode = 'r') as mycon:
             ar = csv.reader(mycon, delimiter='\t')
             for row in ar:
                 if row[0].startswith("#"):
                     continue
                 if toKeep == None or row[2] in toKeep:
                     alleles.append((row[2], row[3]))
         positions = dict() # keys are locus numbers, values are lists of variant positions
-        with open(snpsfile, mode = 'r') as mycon:
+        with gzip.open(snpsfile, mode = 'rt') if snpsfile.endswith('.gz') else open(snpsfile, mode = 'r') as mycon:
             sr = csv.reader(mycon, delimiter='\t')
             for row in sr:
                 if row[0].startswith("#"):