Skip to content

Commit ab9a69c

Browse files
committed
issue #85 - data release helper code
1 parent 6aa1db6 commit ab9a69c

File tree

3 files changed

+68
-4
lines changed

3 files changed

+68
-4
lines changed

CHANGELOG.md

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
1-
## [unreleased]
1+
## [0.2.26] 2024-08-15
2+
3+
Bumped version to 0.2.26 to catch up with data release. Only new client functionality is #81 'data_release' helper functions
4+
5+
All other changes in this release were for data (and contained in data_v0.2.26)
26

37
### Added
48

5-
- New GFFs: RefSeq RS_2023_10, Ensembl 110, 111
9+
- #81 New 'data_release' code eg 'get_latest_combo_file_urls' that looks on GitHub to find latest data
10+
- New GFFs: RefSeq RS_2023_10, Ensembl 111, 112
11+
- #79 - RefSeq MT transcripts
612
- #66 - We now store 'Note' field (thanks holtgrewe for suggestion)
713
- Added requirements.txt for 'generate_transcript_data' sections
814
- client / JSON data schema version compatibility check
@@ -15,6 +21,7 @@
1521
- #64 - Split code/data versions. json.gz are now labelled according to data schema version (thanks holtgrewe)
1622
- Renamed 'CHM13v2.0' to 'T2T-CHM13v2.0' so it could work with biocommons bioutils
1723
- #72 - Correctly handle ncRNA_gene genes (thanks holtgrewe for reporting)
24+
- #73 - HGNC ID was missing for some chrMT genes in Ensembl
1825

1926
## [0.2.21] - 2023-08-14
2027

@@ -209,7 +216,8 @@
209216

210217
- Initial commit
211218

212-
[unreleased]: https://github.com/SACGF/cdot/compare/v0.2.21...HEAD
219+
[unreleased]: https://github.com/SACGF/cdot/compare/v0.2.26...HEAD
220+
[0.2.26]: https://github.com/SACGF/cdot/compare/v0.2.21...v0.2.26
213221
[0.2.21]: https://github.com/SACGF/cdot/compare/v0.2.20...v0.2.21
214222
[0.2.20]: https://github.com/SACGF/cdot/compare/v0.2.19...v0.2.20
215223
[0.2.19]: https://github.com/SACGF/cdot/compare/v0.2.18...v0.2.19

cdot/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.2.21"
1+
__version__ = "0.2.26"
22

33

44
def get_data_schema_int(version: str) -> int:

cdot/data_release.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import re
2+
import requests
3+
import cdot
4+
5+
from cdot import get_data_schema_int
6+
7+
8+
def get_latest_data_release_tag_name():
    """ Tag name of the release returned by get_latest_data_release(), or None if there was none """
    # get_latest_data_release() returns an empty dict when nothing matched, so .get() yields None
    return get_latest_data_release().get('tag_name')
11+
12+
def _get_version_from_tag_name(tag_name, data_version=False):
13+
""" Returns None if doesn't match required prefix """
14+
release_prefix = "v"
15+
if data_version:
16+
release_prefix = "data_" + release_prefix
17+
18+
if not tag_name.startswith(release_prefix):
19+
return None
20+
return tag_name.lstrip(release_prefix)
21+
22+
23+
def get_latest_data_release():
    """ Find a GitHub data release whose data schema matches this client

        Requests the SACGF/cdot release list from the GitHub API and returns the first entry
        (in the order GitHub lists them - presumably newest first) that has a 'data_v' tag
        and the same data schema as cdot.__version__

        :return: release dict from the GitHub API, or an empty dict if no release matched
        :raises requests.RequestException: on connection failure, timeout or HTTP error status
    """
    client_data_schema = get_data_schema_int(cdot.__version__)

    url = "https://api.github.com/repos/SACGF/cdot/releases"
    # requests has no default timeout - without one a stalled connection blocks forever
    response = requests.get(url, timeout=30)
    # An error response (eg rate limited) is a JSON object not a list of releases,
    # so fail here with a clear HTTP error rather than a confusing TypeError in the loop
    response.raise_for_status()
    for release in response.json():
        tag_name = release['tag_name']  # Should look like 'v0.2.25' for code or 'data_v0.2.25' for data
        # We require a data version
        data_version = _get_version_from_tag_name(tag_name, data_version=True)
        if data_version is None:
            continue

        # Skip data releases built for a different schema than this client
        data_schema = get_data_schema_int(data_version)
        if data_schema != client_data_schema:
            continue
        return release
    return {}
41+
42+
def get_latest_combo_file_urls(annotation_consortia, genome_builds):
    """ Download URLs of files in the latest data release for the requested consortia / builds

        Only assets named 'cdot-<version>.<refseq|ensembl>.<genome_build>.json.gz' are considered

        :param annotation_consortia: iterable of names (or a single string), compared case insensitively
        :param genome_builds: iterable of genome build names (or a single string), compared case insensitively
        :return: list of 'browser_download_url' strings, empty if no release or nothing matched
    """
    # A bare string would otherwise be iterated character by character and match nothing
    if isinstance(annotation_consortia, str):
        annotation_consortia = [annotation_consortia]
    if isinstance(genome_builds, str):
        genome_builds = [genome_builds]

    # lower case everything to be case insensitive
    annotation_consortia = {x.lower() for x in annotation_consortia}
    genome_builds = {x.lower() for x in genome_builds}

    file_urls = []
    if latest_data_release := get_latest_data_release():
        for asset in latest_data_release["assets"]:
            browser_download_url = asset["browser_download_url"]
            filename = browser_download_url.rsplit("/", 1)[-1]
            # fullmatch so a name with a trailing suffix (eg '.json.gz.md5') isn't taken as a data file
            if m := re.fullmatch(r"cdot-(\d+\.\d+\.\d+)\.(refseq|ensembl)\.(.+)\.json\.gz", filename):
                _version, annotation_consortium, genome_build = m.groups()
                if annotation_consortium.lower() in annotation_consortia and genome_build.lower() in genome_builds:
                    file_urls.append(browser_download_url)
    return file_urls

0 commit comments

Comments
 (0)