Skip to content

Commit e64ef23

Browse files
committed
#56 - Fix occasional UTA duplicated exons
1 parent 01625f8 commit e64ef23

File tree

3 files changed

+18
-23
lines changed

3 files changed

+18
-23
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
### Changed
1111

12+
- #56 - Fix occasional UTA duplicated exons
1213
- #57 - Correctly handle retrieving genomic position and dealing w/indels in GFF (thanks ltnetcase for reporting)
1314
- #60 - Fix for missing protein IDs due to Genbank / GenBank (thanks holtgrewe)
1415
- #64 - Split code/data versions. json.gz are now labelled according to data schema version (thanks holtgrewe)
Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,15 @@
1-
\copy (SELECT transcript.ac, string_agg(distinct transcript.hgnc, ',') as hgnc, string_agg(distinct origin.url, ',') as origin_url,
2-
string_agg(distinct es.alt_ac::varchar, ',') as contig,
3-
string_agg(distinct es.alt_strand::varchar, ',') as strand,
1+
\copy (SELECT transcript.ac, string_agg(distinct transcript.hgnc, ',') as hgnc, 'http://www.ncbi.nlm.nih.gov/refseq/' as origin_url,
2+
string_agg(distinct aln_v.alt_ac::varchar, ',') as contig,
3+
string_agg(distinct aln_v.alt_strand::varchar, ',') as strand,
44
transcript.cds_start_i,
55
transcript.cds_end_i,
6-
string_agg(exon.start_i::varchar, ',' order by exon.ord) as exon_starts,
7-
string_agg(exon.end_i::varchar, ',' order by exon.ord) as exon_ends,
8-
string_agg(exon_aln.cigar, ',' order by exon.ord) as cigars,
6+
string_agg(aln_v.alt_start_i::varchar, ',' order by aln_v.alt_exon_id) as exon_starts,
7+
string_agg(aln_v.alt_end_i::varchar, ',' order by aln_v.alt_exon_id) as exon_ends,
8+
string_agg(aln_v.cigar, ',' order by aln_v.alt_exon_id) as cigars,
99
string_agg(distinct aa.pro_ac, ',' order by aa.pro_ac) as protein
1010
from uta_20210129.transcript transcript
11-
inner join uta_20210129.exon_set es on (transcript.ac = es.tx_ac AND alt_aln_method = 'splign')
12-
inner join uta_20210129.origin origin on (transcript.origin_id = origin.origin_id)
13-
inner join uta_20210129.exon as exon on (es.exon_set_id = exon.exon_set_id)
14-
inner join uta_20210129.exon_aln exon_aln on (exon_aln.alt_exon_id = exon.exon_id)
11+
inner join uta_20210129.tx_exon_aln_v aln_v on (transcript.ac = aln_v.tx_ac AND alt_aln_method = 'splign')
1512
left outer join uta_20210129.associated_accessions aa on (transcript.ac = aa.tx_ac)
16-
WHERE es.alt_ac in
17-
('NC_000001.10', 'NC_000002.11', 'NC_000003.11', 'NC_000004.11', 'NC_000005.9', 'NC_000006.11', 'NC_000007.13', 'NC_000008.10', 'NC_000009.11', 'NC_000010.10', 'NC_000011.9', 'NC_000012.11', 'NC_000013.10', 'NC_000014.8', 'NC_000015.9', 'NC_000016.9', 'NC_000017.10', 'NC_000018.9', 'NC_000019.9', 'NC_000020.10', 'NC_000021.8', 'NC_000022.10', 'NC_000023.10', 'NC_000024.9') and origin.origin_id not in (10, 11)
13+
WHERE aln_v.alt_ac in
14+
('NC_000001.10', 'NC_000002.11', 'NC_000003.11', 'NC_000004.11', 'NC_000005.9', 'NC_000006.11', 'NC_000007.13', 'NC_000008.10', 'NC_000009.11', 'NC_000010.10', 'NC_000011.9', 'NC_000012.11', 'NC_000013.10', 'NC_000014.8', 'NC_000015.9', 'NC_000016.9', 'NC_000017.10', 'NC_000018.9', 'NC_000019.9', 'NC_000020.10', 'NC_000021.8', 'NC_000022.10', 'NC_000023.10', 'NC_000024.9')
1815
group by transcript.ac) TO 'uta_20210129_grch37.csv' CSV HEADER;
Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,15 @@
1-
\copy (SELECT transcript.ac, string_agg(distinct transcript.hgnc, ',') as hgnc, string_agg(distinct origin.url, ',') as origin_url,
2-
string_agg(distinct es.alt_ac::varchar, ',') as contig,
3-
string_agg(distinct es.alt_strand::varchar, ',') as strand,
1+
\copy (SELECT transcript.ac, string_agg(distinct transcript.hgnc, ',') as hgnc, 'http://www.ncbi.nlm.nih.gov/refseq/' as origin_url,
2+
string_agg(distinct aln_v.alt_ac::varchar, ',') as contig,
3+
string_agg(distinct aln_v.alt_strand::varchar, ',') as strand,
44
transcript.cds_start_i,
55
transcript.cds_end_i,
6-
string_agg(exon.start_i::varchar, ',' order by exon.ord) as exon_starts,
7-
string_agg(exon.end_i::varchar, ',' order by exon.ord) as exon_ends,
8-
string_agg(exon_aln.cigar, ',' order by exon.ord) as cigars,
6+
string_agg(aln_v.alt_start_i::varchar, ',' order by aln_v.alt_exon_id) as exon_starts,
7+
string_agg(aln_v.alt_end_i::varchar, ',' order by aln_v.alt_exon_id) as exon_ends,
8+
string_agg(aln_v.cigar, ',' order by aln_v.alt_exon_id) as cigars,
99
string_agg(distinct aa.pro_ac, ',' order by aa.pro_ac) as protein
1010
from uta_20210129.transcript transcript
11-
inner join uta_20210129.exon_set es on (transcript.ac = es.tx_ac AND alt_aln_method = 'splign')
12-
inner join uta_20210129.origin origin on (transcript.origin_id = origin.origin_id)
13-
inner join uta_20210129.exon as exon on (es.exon_set_id = exon.exon_set_id)
14-
inner join uta_20210129.exon_aln exon_aln on (exon_aln.alt_exon_id = exon.exon_id)
11+
inner join uta_20210129.tx_exon_aln_v aln_v on (transcript.ac = aln_v.tx_ac AND alt_aln_method = 'splign')
1512
left outer join uta_20210129.associated_accessions aa on (transcript.ac = aa.tx_ac)
16-
WHERE es.alt_ac in
13+
WHERE aln_v.alt_ac in
1714
('NC_000001.11', 'NC_000002.12', 'NC_000003.12', 'NC_000004.12', 'NC_000005.10', 'NC_000006.12', 'NC_000007.14', 'NC_000008.11', 'NC_000009.12', 'NC_000010.11', 'NC_000011.10', 'NC_000012.12', 'NC_000013.11', 'NC_000014.9', 'NC_000015.10', 'NC_000016.10', 'NC_000017.11', 'NC_000018.10', 'NC_000019.10', 'NC_000020.11', 'NC_000021.9', 'NC_000022.11', 'NC_000023.11', 'NC_000024.10')
1815
group by transcript.ac) TO 'uta_20210129_grch38.csv' CSV HEADER;

0 commit comments

Comments
 (0)