Skip to content

Commit 6cd964b

Browse files
eselimnl and alimanfoo authored
Add Pf8 (#659)
* initial pf8 gcs PR ready Pf8() PR ready Pf8()
* file path change
* s3 access rewired via fsspec
* fixes the check for annotation file
* removes extra path
* altlen commented in test units
* missed fix for altlen
* fixes for PR feedbacks
* fix-2 for PR feedbacks
* fix for syntax error
* chained urls into s3 fs
* make handling of chained URLs consistent between GCS and S3
* fix bug
---------
Co-authored-by: Alistair Miles <[email protected]>
1 parent 6052cdc commit 6cd964b

File tree

7 files changed

+542
-13
lines changed

7 files changed

+542
-13
lines changed

malariagen_data/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from .amin1 import Amin1
55
from .anopheles import AnophelesDataResource, Region
66
from .pf7 import Pf7
7+
from .pf8 import Pf8
78
from .pv4 import Pv4
89
from .util import SiteClass
910

malariagen_data/pf8.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
import os
2+
3+
from .plasmodium import PlasmodiumDataResource
4+
5+
6+
class Pf8(PlasmodiumDataResource):
    """Entry point for working with the Pf8 data release.

    Parameters
    ----------
    url : str, optional
        Base location of the data. It is handed to the parent class
        unchanged; when omitted the parent decides the location (the config
        bundled with this package lists "gs://pf8-release/" as its
        ``default_url``). A local directory can be given instead if the
        release has been downloaded.
    data_config : str, optional
        Path to a JSON file describing the layout of the Pf8 data resource.
        When missing or empty, the ``pf8_config.json`` file located next to
        this module is used.
    **kwargs
        Accepted by this constructor.
        NOTE(review): these are not forwarded to the parent initialiser
        here, so they currently have no effect in this class -- confirm
        whether they should reach the file system setup.

    Examples
    --------
    Use the default data location:

        >>> import malariagen_data
        >>> pf8 = malariagen_data.Pf8()

    Use a copy of the release stored on the local file system:

        >>> pf8 = malariagen_data.Pf8("/local/path/to/pf8-release/")

    """

    def __init__(self, url=None, data_config=None, **kwargs):
        # Any falsy value (None, empty string) selects the config file that
        # ships alongside this module.
        data_config = data_config or os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "pf8_config.json",
        )
        super().__init__(data_config=data_config, url=url)

malariagen_data/pf8_config.json

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
{
2+
"default_url": "gs://pf8-release/",
3+
"metadata_path": "metadata/Pf8_samples.txt",
4+
"reference_path": "reference/PlasmoDB-54-Pfalciparum3D7-Genome.zarr/",
5+
"reference_contigs": [
6+
"Pf3D7_01_v3",
7+
"Pf3D7_02_v3",
8+
"Pf3D7_03_v3",
9+
"Pf3D7_04_v3",
10+
"Pf3D7_05_v3",
11+
"Pf3D7_06_v3",
12+
"Pf3D7_07_v3",
13+
"Pf3D7_08_v3",
14+
"Pf3D7_09_v3",
15+
"Pf3D7_10_v3",
16+
"Pf3D7_11_v3",
17+
"Pf3D7_12_v3",
18+
"Pf3D7_13_v3",
19+
"Pf3D7_14_v3",
20+
"Pf3D7_API_v3",
21+
"Pf3D7_MIT_v3"
22+
],
23+
"annotations_path": "annotations/PlasmoDB-55_Pfalciparum3D7.gff.gz",
24+
"variant_calls_zarr_path": "zarr/",
25+
"default_variant_variables": {
26+
"FILTER_PASS": ["variants"],
27+
"is_snp": ["variants"],
28+
"numalt": ["variants"],
29+
"CDS": ["variants"]
30+
},
31+
"extended_calldata_variables": {
32+
"DP": ["variants", "samples"],
33+
"GQ": ["variants", "samples"],
34+
"MIN_DP": ["variants", "samples"],
35+
"PGT": ["variants", "samples"],
36+
"PID": ["variants", "samples"],
37+
"PS": ["variants", "samples"],
38+
"RGQ": ["variants", "samples"],
39+
"PL": ["variants", "samples", "genotypes"],
40+
"SB": ["variants", "samples", "sb_statistics"]
41+
},
42+
"extended_variant_fields": {
43+
"AC": ["variants", "alt_alleles"],
44+
"AF": ["variants", "alt_alleles"],
45+
"AN": ["variants"],
46+
"ANN_AA_length": ["variants", "alt_alleles"],
47+
"ANN_AA_pos": ["variants", "alt_alleles"],
48+
"ANN_Allele": ["variants", "alt_alleles"],
49+
"ANN_Annotation": ["variants", "alt_alleles"],
50+
"ANN_Annotation_Impact": ["variants", "alt_alleles"],
51+
"ANN_CDS_length": ["variants", "alt_alleles"],
52+
"ANN_CDS_pos": ["variants", "alt_alleles"],
53+
"ANN_Distance": ["variants", "alt_alleles"],
54+
"ANN_Feature_ID": ["variants", "alt_alleles"],
55+
"ANN_Feature_Type": ["variants", "alt_alleles"],
56+
"ANN_Gene_ID": ["variants", "alt_alleles"],
57+
"ANN_Gene_Name": ["variants", "alt_alleles"],
58+
"ANN_HGVS_c": ["variants", "alt_alleles"],
59+
"ANN_HGVS_p": ["variants", "alt_alleles"],
60+
"ANN_Rank": ["variants", "alt_alleles"],
61+
"ANN_Transcript_BioType": ["variants", "alt_alleles"],
62+
"ANN_cDNA_length": ["variants", "alt_alleles"],
63+
"ANN_cDNA_pos": ["variants", "alt_alleles"],
64+
"AS_BaseQRankSum": ["variants", "alt_alleles"],
65+
"AS_FS": ["variants", "alt_alleles"],
66+
"AS_InbreedingCoeff": ["variants", "alt_alleles"],
67+
"AS_MQ": ["variants", "alt_alleles"],
68+
"AS_MQRankSum": ["variants", "alt_alleles"],
69+
"AS_QD": ["variants", "alt_alleles"],
70+
"AS_ReadPosRankSum": ["variants", "alt_alleles"],
71+
"AS_SOR": ["variants", "alt_alleles"],
72+
"BaseQRankSum": ["variants"],
73+
"DP": ["variants"],
74+
"DS": ["variants"],
75+
"END": ["variants"],
76+
"ExcessHet": ["variants"],
77+
"FILTER_Apicoplast": ["variants"],
78+
"FILTER_Centromere": ["variants"],
79+
"FILTER_InternalHypervariable": ["variants"],
80+
"FILTER_LowQual": ["variants"],
81+
"FILTER_Low_VQSLOD": ["variants"],
82+
"FILTER_Mitochondrion": ["variants"],
83+
"FILTER_SubtelomericHypervariable": ["variants"],
84+
"FILTER_SubtelomericRepeat": ["variants"],
85+
"FILTER_VQSRTrancheINDEL99.50to99.60": ["variants"],
86+
"FILTER_VQSRTrancheINDEL99.60to99.80": ["variants"],
87+
"FILTER_VQSRTrancheINDEL99.80to99.90": ["variants"],
88+
"FILTER_VQSRTrancheINDEL99.90to99.95": ["variants"],
89+
"FILTER_VQSRTrancheINDEL99.95to100.00+": ["variants"],
90+
"FILTER_VQSRTrancheINDEL99.95to100.00": ["variants"],
91+
"FILTER_VQSRTrancheSNP99.50to99.60": ["variants"],
92+
"FILTER_VQSRTrancheSNP99.60to99.80": ["variants"],
93+
"FILTER_VQSRTrancheSNP99.80to99.90": ["variants"],
94+
"FILTER_VQSRTrancheSNP99.90to99.95": ["variants"],
95+
"FILTER_VQSRTrancheSNP99.95to100.00+": ["variants"],
96+
"FILTER_VQSRTrancheSNP99.95to100.00": ["variants"],
97+
"FS": ["variants"],
98+
"ID": ["variants"],
99+
"InbreedingCoeff": ["variants"],
100+
"LOF": ["variants"],
101+
"MLEAC": ["variants", "alt_alleles"],
102+
"MLEAF": ["variants", "alt_alleles"],
103+
"MQ": ["variants"],
104+
"MQRankSum": ["variants"],
105+
"NEGATIVE_TRAIN_SITE": ["variants"],
106+
"NMD": ["variants"],
107+
"POSITIVE_TRAIN_SITE": ["variants"],
108+
"QD": ["variants"],
109+
"QUAL": ["variants"],
110+
"RAW_MQandDP": ["variants", "ploidy"],
111+
"ReadPosRankSum": ["variants"],
112+
"RegionType": ["variants"],
113+
"SOR": ["variants"],
114+
"VQSLOD": ["variants"],
115+
"culprit": ["variants"],
116+
"set": ["variants"]
117+
}
118+
}

malariagen_data/plasmodium.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,7 @@ def genome_sequence(self, region="*", inline_array=True, chunks="native"):
298298
)
299299
return d
300300

301-
def genome_features(self, attributes=("ID", "Parent", "Name", "alias")):
301+
def genome_features(self, attributes=("ID", "Parent", "Name")):
302302
"""Access genome feature annotations.
303303
304304
Parameters

malariagen_data/util.py

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -450,18 +450,42 @@ def init_filesystem(url, **kwargs):
450450
scopes=["https://www.googleapis.com/auth/cloud-platform"]
451451
)
452452

453-
# Ensure credentials are passed through to gcsfs.
453+
kwargs.setdefault("token", credentials)
454+
455+
# Ensure options are passed through to gcsfs, even if URL is chained.
454456
if url.startswith("gs://") or url.startswith("gcs://"):
455-
kwargs["token"] = credentials
457+
storage_options = kwargs
456458
elif "gs://" in url:
457459
# Chained URL.
458-
kwargs["gs"] = dict(token=credentials)
460+
storage_options = {"gs": kwargs}
459461
elif "gcs://" in url:
460462
# Chained URL.
461-
kwargs["gcs"] = dict(token=credentials)
463+
storage_options = {"gcs": kwargs}
464+
465+
elif "s3://" in url:
466+
# N.B., we currently assume any S3 URLs refer to buckets hosted at Sanger.
467+
config = {
468+
"signature_version": "s3",
469+
"s3": {"addressing_style": "virtual"},
470+
}
471+
472+
# Create an S3FileSystem with custom endpoint if specified.
473+
kwargs.setdefault("anon", True) # Default to anonymous access.
474+
kwargs.setdefault("endpoint_url", "https://cog.sanger.ac.uk")
475+
kwargs.setdefault("config_kwargs", config)
476+
477+
if url.startswith("s3://"):
478+
storage_options = kwargs
479+
else:
480+
# Chained URL.
481+
storage_options = {"s3": kwargs}
482+
483+
else:
484+
# Some other kind of URL, pass through kwargs as-is.
485+
storage_options = kwargs
462486

463487
# Process the URL using fsspec.
464-
fs, path = url_to_fs(url, **kwargs)
488+
fs, path = url_to_fs(url, **storage_options)
465489

466490
# Path compatibility, fsspec/gcsfs behaviour varies between versions.
467491
while path.endswith("/"):

0 commit comments

Comments
 (0)