Skip to content

Commit 3a8ef5a

Browse files
committed
TSS like old code
Use get_id() not name
1 parent 99e5948 commit 3a8ef5a

File tree

9 files changed

+43
-8
lines changed

9 files changed

+43
-8
lines changed

bin/pyreference_gtf_to_json.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,10 @@
77
from collections import defaultdict, Counter
88
import gzip
99
import json
10+
import os
1011
from pyreference.settings import CHROM, START, END, STRAND, IS_CODING, CODING_FEATURES, \
1112
PYREFERENCE_JSON_VERSION_KEY, PYREFERENCE_JSON_VERSION
12-
from pyreference.utils.file_utils import name_from_file_name
13+
from pyreference.utils.file_utils import name_from_file_name, file_md5sum
1314

1415

1516
class SetEncoder(json.JSONEncoder):
@@ -158,6 +159,8 @@ def main():
158159
add_UTR_features(transcripts_by_id, transcript_cds_by_id)
159160

160161
data = {PYREFERENCE_JSON_VERSION_KEY : PYREFERENCE_JSON_VERSION,
162+
"reference_gtf" : {"path" : os.path.abspath(args.gtf),
163+
"md5sum" : file_md5sum(args.gtf)},
161164
"genes_by_id" : genes_by_id,
162165
"transcripts_by_id" : transcripts_by_id,
163166
"gene_id_by_name" : gene_id_by_name,

pyreference/gene.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def get_longest_transcript(self, coding_only=False):
6666
# We also want NM_007041 (len 2209) over NM_001001976 (len 2209)
6767
# Which is annoyingly zero padded - so use smallest ID length, then only if equal do alpha sort
6868
def min_transcript_key(t):
69-
(sys.maxint - t.length, len(t.get_id()), t.get_id())
69+
return (sys.maxint - t.length, len(t.get_id()), t.get_id())
7070

7171
longest_transcript = min(transcripts, key=min_transcript_key)
7272
return longest_transcript

pyreference/genomic_region.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,10 @@ def iv(self):
3030

3131
@lazy
3232
def tss(self):
33-
return self.iv.start_d_as_pos
33+
''' (Representative) Transcript Start Site
34+
This is NOT the most 5' position (use iv.start_d_as_pos for that) '''
35+
transcript_iv = self.get_representative_transcript().iv
36+
return transcript_iv.start_d_as_pos
3437

3538
def get_promoter_iv(self, promoter_range=1000):
3639
return iv_from_pos_range(self.tss, promoter_range)

pyreference/pyreference_config.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@ def load_params_from_config(build=None, config=None):
4242
raise_from(configparser.NoOptionError(msg), noe)
4343

4444
if not cfg.has_section(build):
45-
params = {"build" : build, "config" : config}
46-
msg = "Build='%(build)s', no section [%(build)s] in config file '%(config)s" % params
45+
msg_params = {"build" : build, "config" : config}
46+
msg = "Build='%(build)s', no section [%(build)s] in config file '%(config)s" % msg_params
4747
raise ValueError(msg)
4848

4949
for f in GLOBAL_FLAGS:
@@ -52,6 +52,9 @@ def load_params_from_config(build=None, config=None):
5252
except configparser.NoOptionError as noe:
5353
pass
5454

55+
params["build"] = build
56+
params["config"] = config
57+
5558
for k in defaults.keys():
5659
params[k] = cfg.get(build, k)
5760

pyreference/reference.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,11 @@ def __init__(self, build=None, config=None, **kwargs):
104104
if kwargs:
105105
six.raise_from(ValueError("No 'genes_json' in passed kwargs"), config_exception)
106106
raise config_exception
107+
108+
# Store this so we can ask about config later
109+
self.build = params["build"]
110+
self._args = {"build" : build, "config" : config}
111+
self._build_params = params
107112

108113

109114
@lazy
@@ -281,7 +286,7 @@ def get_transcripts_in_iv(self, iv):
281286
return list(transcripts)
282287

283288
def get_transcript_ids(self, iv):
284-
return [feature.name for feature in self.get_transcripts_in_iv(iv)]
289+
return [feature.get_id() for feature in self.get_transcripts_in_iv(iv)]
285290

286291
def get_gene_names_array(self, iv):
287292
return list(set([t.get_gene_id() for t in self.get_transcripts_in_iv(iv)]))
@@ -355,4 +360,15 @@ def has_chr(self):
355360
chrom = some_transcript["chr"]
356361
return chrom.startswith("chr")
357362

363+
def __repr__(self):
    """Return a short label for this reference, naming the build it was loaded for."""
    build = self.build
    return "PyReference (%s)" % build
365+
366+
@lazy
def config(self):
    """Describe how this reference was configured.

    Returns a dict with four entries:
      "build"         - the build stored on this instance
      "args"          - the build/config arguments stored as ``self._args``
      "reference_gtf" - the "reference_gtf" entry of the loaded genes dict
                        (presumably the path/md5sum record written by the
                        GTF-to-JSON converter; verify against the JSON)
      "build_params"  - a copy of the stored build parameters, so callers
                        cannot mutate this object's own dict through it
    """
    # Attributes are read in the same order as the dict entries below.
    build = self.build
    args = self._args
    reference_gtf = self._genes_dict["reference_gtf"]
    build_params = self._build_params.copy()
    return {
        "build": build,
        "args": args,
        "reference_gtf": reference_gtf,
        "build_params": build_params,
    }
373+
358374

pyreference/settings.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77

88
# Change this when you introduce breaking changes
9-
PYREFERENCE_JSON_VERSION = 4
9+
PYREFERENCE_JSON_VERSION = 5
1010
PYREFERENCE_JSON_VERSION_KEY = "pyreference_json_version"
1111

1212
CODING_FEATURES = {"CDS", "start_codon", "stop_codon"}

pyreference/transcript.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,3 +176,6 @@ def get_genomic_position(self, pos_on_transcript):
176176

177177
raise NotOnTranscriptException("%s didn't contain %s" % (self.get_id(), pos_on_transcript))
178178

179+
def __repr__(self):
    """Return a one-line description: id, a " (coding)" marker if coding, and length."""
    if self.is_coding:
        coding_suffix = " (coding)"
    else:
        coding_suffix = ""
    fields = (self.get_id(), coding_suffix, self.length)
    return "Transcript %s%s: length %d" % fields

pyreference/utils/file_utils.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
44
@author: dlawrence
55
'''
6+
from md5 import md5
67
import os
78

89

@@ -34,3 +35,9 @@ def file_or_file_name(f, mode='r'):
3435
return f # Already a File object
3536
else:
3637
raise ValueError("'%s' (%s) not a file or string" % (f, type(f)))
38+
39+
def file_md5sum(filename):
40+
m = md5()
41+
with open(filename, "rb") as f:
42+
m.update(f.read())
43+
return m.hexdigest()

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
setup(name = 'pyreference',
66
packages = find_packages(),
7-
version = '0.2',
7+
version = '0.3',
88
description = 'Library for working with reference genomes',
99
author = 'David Lawrence',
1010
author_email = 'davmlaw@gmail.com',

0 commit comments

Comments
 (0)