Skip to content

Commit 99154c3

Browse files
authored
Merge pull request #201 from jmchilton/idc_2
Enhancements to the IDC scripts
2 parents bacbf87 + cf586a9 commit 99154c3

16 files changed

+929
-28
lines changed

setup.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,11 @@ def get_var(var_name):
4343
install_tool_deps=ephemeris.install_tool_deps:main
4444
install-tool-deps=ephemeris.install_tool_deps:main
4545
set-library-permissions=ephemeris.set_library_permissions:main
46-
"""
46+
_idc-lint=ephemeris._idc_lint:main
47+
_idc-split-data-manager-genomes=ephemeris._idc_split_data_manager_genomes:main
48+
_idc-data-managers-to-tools=ephemeris._idc_data_managers_to_tools:main
49+
"""
50+
4751
PACKAGE_DATA = {
4852
# Be sure to update MANIFEST.in for source dist.
4953
}

src/ephemeris/__init__.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import os
2+
13
import yaml
24
from bioblend import galaxy
35

@@ -11,6 +13,14 @@
1113
RAW_CONTENT_URL = f"https://raw.github.com/{PROJECT_USERAME}/{PROJECT_NAME}/master/"
1214

1315

16+
def get_or_create_history(history_name: str, gi: galaxy.GalaxyInstance):
    """Return a history named ``history_name``, creating it when none exists.

    Histories are looked up by name through ``gi.histories.get_histories``.
    The first match is returned; when the lookup yields nothing, a new
    history with that name is created via ``gi.histories.create_history``
    and its result is returned instead.
    """
    matching_histories = gi.histories.get_histories(name=history_name)
    if not matching_histories:
        return gi.histories.create_history(name=history_name)
    return matching_histories[0]
22+
23+
1424
def check_url(url, log=None):
1525
if not url.startswith("http"):
1626
if log:
@@ -32,7 +42,7 @@ def get_galaxy_connection(args, file=None, log=None, login_required=True):
3242

3343
url = args.galaxy or file_content.get("galaxy_instance")
3444
galaxy_url = check_url(url, log)
35-
api_key = args.api_key or file_content.get("api_key")
45+
api_key = args.api_key or file_content.get("api_key") or os.environ.get("EPHEMERIS_API_KEY")
3646

3747
if args.user and args.password:
3848
return galaxy.GalaxyInstance(url=galaxy_url, email=args.user, password=args.password)

src/ephemeris/_config_models.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
from pathlib import Path
2+
from typing import (
3+
Dict,
4+
List,
5+
Optional,
6+
Union,
7+
)
8+
9+
import yaml
10+
from pydantic import (
11+
BaseModel,
12+
Extra,
13+
)
14+
15+
StrOrPath = Union[Path, str]
16+
17+
18+
class RepositoryInstallTarget(BaseModel):
    """A single repository entry of a tools.yml-style install configuration.

    ``name`` and ``owner`` are required. Every other field is optional and
    carries an explicit ``None`` default so that it stays optional without
    relying on how the installed pydantic release treats a bare
    ``Optional[...]`` annotation.
    """

    name: str
    owner: str
    tool_shed_url: Optional[str] = None
    tool_panel_section_id: Optional[str] = None
    tool_panel_section_label: Optional[str] = None
    revisions: Optional[List[str]] = None
    install_tool_dependencies: Optional[bool] = None
    install_repository_dependencies: Optional[bool] = None
    install_resolver_dependencies: Optional[bool] = None
28+
29+
30+
class RepositoryInstallTargets(BaseModel):
    """Top-level model of a tools.yml-style install configuration.

    ``tools`` is the required list of repositories to install. ``api_key``
    and ``galaxy_instance`` are optional and default to ``None`` explicitly
    so they remain optional without relying on pydantic's implicit handling
    of bare ``Optional[...]`` annotations.
    """

    api_key: Optional[str] = None
    galaxy_instance: Optional[str] = None
    tools: List[RepositoryInstallTarget]
36+
37+
38+
class DataManager(BaseModel, extra=Extra.forbid):
    """One data manager entry; keys other than those declared here are rejected."""

    tags: List[str]
    tool_id: str
41+
42+
43+
class DataManagers(BaseModel, extra=Extra.forbid):
    """Root-level mapping from a string key to its ``DataManager`` entry."""

    __root__: Dict[str, DataManager]
45+
46+
47+
class Genome(BaseModel):
    """One genome entry of a genomes.yml file.

    Only ``id`` and ``description`` are required. All other fields are
    optional and default to ``None`` explicitly, so they stay optional
    without relying on pydantic's implicit handling of bare
    ``Optional[...]`` annotations.
    """

    id: str  # The unique id of the data in Galaxy
    description: str  # The description of the data, including its taxonomy, version and date
    dbkey: Optional[str] = None
    # The source of the data. Can be: 'ucsc', an NCBI accession number or a URL to a fasta file.
    source: Optional[str] = None

    # The following fields are currently purely for human consumption and unused by
    # IDC infrastructure.
    doi: Optional[str] = None  # Any DOI associated with the data
    blob: Optional[str] = None  # A blob for any other pertinent information
    checksum: Optional[str] = None  # A SHA256 checksum of the original
    version: Optional[str] = None  # Any version information associated with the data

    # Description of actions (data managers) to run on target genome.
    # indexers to run - keyed on repository name - see data_managers.yml for how to resolve these to tools
    indexers: Optional[List[str]] = None
    # unimplemented: but if we implement classes of indexers, these will be ones to skip
    skiplist: Optional[List[str]] = None
65+
66+
67+
class Genomes(BaseModel):
    """Top-level model of a genomes file: a ``genomes`` list of ``Genome`` entries."""

    genomes: List[Genome]
69+
70+
71+
def _read_yaml(path: StrOrPath):
    """Parse the YAML file at ``path`` with ``yaml.safe_load`` and return the result.

    The file is opened as UTF-8 explicitly so parsing does not depend on the
    platform's default text encoding.
    """
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)
74+
75+
76+
def read_data_managers(path: StrOrPath) -> DataManagers:
    """Load the YAML file at ``path`` and validate it as a ``DataManagers`` model."""
    raw_mapping = _read_yaml(path)
    return DataManagers(__root__=raw_mapping)
78+
79+
80+
def read_genomes(path: StrOrPath) -> Genomes:
    """Load the YAML file at ``path`` and validate it as a ``Genomes`` model."""
    raw_document = _read_yaml(path)
    return Genomes(**raw_document)
82+
83+
84+
def read_tools(path: StrOrPath) -> RepositoryInstallTargets:
    """Load the YAML file at ``path`` and validate it as ``RepositoryInstallTargets``."""
    raw_document = _read_yaml(path)
    return RepositoryInstallTargets(**raw_document)
src/ephemeris/_idc_data_managers_to_tools.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
#!/usr/bin/env python
2+
"""Helper script for IDC - not yet meant for public consumption.
3+
4+
This script takes a data_managers.yml configuration describing the
5+
set of data managers the IDC configuration targets and builds a
6+
tools.yml file from it for use with shed_tools.
7+
"""
8+
import argparse
9+
import logging
10+
from typing import (
11+
Dict,
12+
List,
13+
NamedTuple,
14+
)
15+
16+
import yaml
17+
18+
from ._config_models import (
19+
read_data_managers,
20+
RepositoryInstallTargets,
21+
)
22+
from .common_parser import (
23+
add_log_file_argument,
24+
add_verbosity_argument,
25+
)
26+
from .ephemeris_log import (
27+
disable_external_library_logging,
28+
setup_global_logger,
29+
)
30+
31+
32+
class DataManager(NamedTuple):
    """Immutable record describing one configured data manager."""

    tool_id: str
    repository_name: str
    tags: List[str]
36+
37+
38+
def read_data_managers_configuration(path: str) -> Dict[str, DataManager]:
    """Read the data managers file at ``path`` into ``DataManager`` records.

    The returned dict is keyed on the same keys as the file's root mapping
    (used here as the repository name). A falsy ``tags`` value is
    normalised to an empty list.
    """
    configured = read_data_managers(path).__root__
    return {
        repository_name: DataManager(
            tool_id=entry.tool_id,
            repository_name=repository_name,
            tags=entry.tags or [],
        )
        for repository_name, entry in configured.items()
    }
49+
50+
51+
def build_shed_install_conf(path: str) -> dict:
    """Build a tools.yml-style install configuration from a data managers file.

    Each configured data manager contributes one entry whose repository
    owner and name are taken from the third and fourth "/"-separated
    segments of its tool ID.

    Args:
        path: Location of the data managers configuration file.

    Returns:
        A dict of the form ``{"tools": [entry, ...]}``.

    Raises:
        ValueError: If a tool ID has fewer than four "/"-separated segments
            or an empty owner or repository segment.
    """
    data_managers = read_data_managers_configuration(path)
    tools = []
    for data_manager in data_managers.values():
        tool_id = data_manager.tool_id
        tool_id_parts = tool_id.split("/")
        # Fail with a descriptive error instead of a bare IndexError (or an
        # entry with an empty owner/name) when the tool ID is malformed.
        if len(tool_id_parts) < 4 or not tool_id_parts[2] or not tool_id_parts[3]:
            raise ValueError(
                f"Cannot determine repository owner and name from data manager tool ID {tool_id!r}"
            )
        repo_owner, repo_name = tool_id_parts[2:4]
        tools.append(
            {
                "name": repo_name,
                "owner": repo_owner,
                "tool_panel_section_label": None,
                # NOTE(review): the tool shed is hard-coded rather than read
                # from the tool ID - confirm every tool ID targets this shed.
                "tool_shed_url": "toolshed.g2.bx.psu.edu",
            }
        )
    return {"tools": tools}
68+
69+
70+
def write_shed_install_conf(data_manager_conf_path: str, output_path: str) -> None:
    """Generate the install configuration and write it as YAML to ``output_path``.

    The generated dict is first passed through ``RepositoryInstallTargets``
    so that an invalid structure raises before anything is written.
    """
    shed_install_conf = build_shed_install_conf(data_manager_conf_path)

    # The model instance is discarded; constructing it is the validation step.
    RepositoryInstallTargets(**shed_install_conf)

    with open(output_path, "w") as output_handle:
        yaml.safe_dump(shed_install_conf, output_handle)
78+
79+
80+
def _parser():
    """Build and return the argument parser for this script."""
    arg_parser = argparse.ArgumentParser(add_help=False)

    general_options = arg_parser.add_argument_group("General options")
    add_verbosity_argument(general_options)
    add_log_file_argument(general_options)

    # Input and output locations, each with a default file name.
    for option, default_path in (
        ("--data-managers-conf", "data_managers.yml"),
        ("--shed-install-output-conf", "tools.yml"),
    ):
        arg_parser.add_argument(option, default=default_path)
    return arg_parser
90+
91+
92+
def main():
    """Command-line entry point: parse arguments, set up logging, write the output file."""
    disable_external_library_logging()
    args = _parser().parse_args()
    log = setup_global_logger(name=__name__, log_file=args.log_file)
    log.setLevel(logging.DEBUG if args.verbose else logging.INFO)
    write_shed_install_conf(args.data_managers_conf, args.shed_install_output_conf)


if __name__ == "__main__":
    main()

src/ephemeris/_idc_lint.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import os
2+
from pathlib import Path
3+
4+
import yaml
5+
6+
from ._config_models import (
7+
read_data_managers,
8+
read_genomes,
9+
)
10+
11+
12+
def read_yaml(path: Path):
    """Parse the YAML file at ``path`` with ``yaml.safe_load`` and return the result.

    The file is opened as UTF-8 explicitly so parsing does not depend on the
    platform's default text encoding.
    """
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)
15+
16+
17+
def lint_idc_directory(directory: Path):
    """Check the IDC configuration files found in ``directory``.

    Expects ``genomes.yml`` and ``data_managers.yml`` directly inside
    ``directory``. Verifies that every data manager tool ID starts with
    ``toolshed.g2.bx.psu.edu/`` and that every indexer referenced by a
    genome is a key of the data managers file. Each genome is printed as it
    is checked.

    Args:
        directory: Directory containing the two configuration files.

    Raises:
        FileNotFoundError: If either configuration file is missing.
        Exception: If a tool ID does not come from the main tool shed, or a
            genome references an indexer that is not a known data manager.
    """
    genomes_path = directory / "genomes.yml"
    data_managers_path = directory / "data_managers.yml"
    # Raise explicitly rather than assert: assertions are stripped when
    # Python runs with -O, and the message names the missing file.
    for required_path in (genomes_path, data_managers_path):
        if not required_path.exists():
            raise FileNotFoundError(f"Required IDC configuration file not found: {required_path}")
    data_managers = read_data_managers(data_managers_path).__root__
    genomes = read_genomes(genomes_path)

    for data_manager in data_managers.values():
        data_manager_tool_id = data_manager.tool_id
        if not data_manager_tool_id.startswith("toolshed.g2.bx.psu.edu/"):
            raise Exception(
                f"Expected a data manager repository from main Galaxy tool shed but discovered tool ID {data_manager_tool_id}"
            )

    for genome in genomes.genomes:
        print(genome)
        for indexer in genome.indexers or []:
            if indexer not in data_managers:
                raise Exception(f"Failed to find data manager {indexer} referenced for genome {genome}")
37+
38+
39+
def main():
    """Command-line entry point: lint the IDC files in the current directory."""
    current_directory = Path(os.curdir)
    lint_idc_directory(current_directory)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)