
Commit c2febff

Merge branch 'main' into timeout-arg

2 parents: 111e53f + 18f2cf8


48 files changed (+373, -258 lines)

Snakefile

Lines changed: 6 additions & 6 deletions
@@ -64,10 +64,10 @@ def write_parameter_log(algorithm, param_label, logfile):
 def write_dataset_log(dataset, logfile):
     dataset_contents = get_dataset(_config.config.datasets,dataset)

-    # safe_dump gives RepresenterError for an OrderedDict
-    # config file has to convert the dataset from OrderedDict to dict to avoid this
-    with open(logfile,'w') as f:
-        yaml.safe_dump(dataset_contents,f)
+    # safe_dump gives RepresenterError for a DatasetSchema
+    # config file has to convert the dataset to a dict to avoid this
+    with open(logfile, 'w') as f:
+        yaml.safe_dump(dict(dataset_contents), f)

 # Choose the final files expected according to the config file options.
 def make_final_input(wildcards):

@@ -158,9 +158,9 @@ rule log_datasets:
 # Input preparation needs to be rerun if these files are modified
 def get_dataset_dependencies(wildcards):
     dataset = _config.config.datasets[wildcards.dataset]
-    all_files = dataset["node_files"] + dataset["edge_files"] + dataset["other_files"]
+    all_files = dataset.node_files + dataset.edge_files + dataset.other_files
     # Add the relative file path
-    all_files = [dataset["data_dir"] + SEP + data_file for data_file in all_files]
+    all_files = [dataset.data_dir + SEP + data_file for data_file in all_files]

     return all_files
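This Snakefile change follows from datasets now being stored as DatasetSchema pydantic models rather than plain dicts (see spras/config/dataset.py below), so fields are read as attributes instead of keys. A minimal standalone sketch of the two access styles, using a hypothetical MiniDataset model rather than the real schema:

    # Hypothetical stand-in for DatasetSchema; illustrates why the Snakefile
    # switched from dataset["node_files"] to dataset.node_files.
    from pydantic import BaseModel

    class MiniDataset(BaseModel):
        node_files: list[str]
        data_dir: str

    ds = MiniDataset(node_files=["nodes.txt"], data_dir="input")
    print(ds.node_files)           # attribute access on the model
    print(dict(ds)["node_files"])  # key access after converting to a plain dict,
                                   # which is what write_dataset_log now does for yaml.safe_dump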

docs/htcondor.rst

Lines changed: 3 additions & 2 deletions
@@ -54,10 +54,11 @@ might look like:

 .. code:: bash

-   apptainer build spras-v0.6.0.sif docker://reedcompbio/spras:v0.6.0
+   apptainer build spras-v0.6.0.sif docker://reedcompbio/spras:0.6.0

 After running this command, a new file called ``spras-v0.6.0.sif`` will
-exist in the directory where the command was run.
+exist in the directory where the command was run. Note that the Docker
+image does not use a "v" in the tag.

 Submitting All Jobs to a Single EP
 ----------------------------------

spras/allpairs.py

Lines changed: 0 additions & 2 deletions
@@ -35,8 +35,6 @@ def generate_inputs(data: Dataset, filename_map):
     # Get sources and targets for node input file
     # Borrowed code from pathlinker.py
     sources_targets = data.get_node_columns(["sources", "targets"])
-    if sources_targets is None:
-        raise ValueError("All Pairs Shortest Paths requires sources and targets")

     both_series = sources_targets.sources & sources_targets.targets
     for _index, row in sources_targets[both_series].iterrows():
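The removed None check suggests that missing source/target columns are now reported upstream by get_node_columns itself. The surviving lines select nodes marked as both a source and a target by ANDing two boolean columns; a standalone pandas illustration of that intersection (not SPRAS code):

    import pandas as pd

    sources_targets = pd.DataFrame({
        'NODEID': ['A', 'B', 'C'],
        'sources': [True, True, False],
        'targets': [True, False, True],
    })
    # element-wise AND keeps only nodes flagged True in both columns
    both_series = sources_targets.sources & sources_targets.targets
    print(sources_targets[both_series])  # only node 'A' survives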

spras/analysis/ml.py

Lines changed: 4 additions & 1 deletion
@@ -459,8 +459,11 @@ def jaccard_similarity_eval(summary_df: pd.DataFrame, output_file: str | PathLik
     ax.set_yticklabels(algorithms)
     plt.colorbar(cax, ax=ax)
     # annotate each cell with the corresponding similarity value
+    # where we set the precision to be lower as the number of algorithms increases
+    n = 2
+    if len(algorithms) > 10: n = 1
     for i in range(len(algorithms)):
         for j in range(len(algorithms)):
-            ax.text(j, i, f'{jaccard_matrix.values[i, j]:.2f}', ha='center', va='center', color='white')
+            ax.text(j, i, f'{jaccard_matrix.values[i, j]:.{n}f}', ha='center', va='center', color='white')
     plt.savefig(output_png, bbox_inches="tight", dpi=DPI)
     plt.close()
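The replacement line relies on Python allowing the precision of a format spec to be supplied by a variable nested inside an f-string. A quick standalone check:

    value = 0.98765
    for n in (2, 1):
        print(f'{value:.{n}f}')  # prints 0.99, then 1.0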

spras/btb.py

Lines changed: 2 additions & 13 deletions
@@ -44,19 +44,8 @@ def generate_inputs(data, filename_map):

     # Get sources and write to file, repeat for targets
     # Does not check whether a node is a source and a target
-    for node_type in ['sources', 'targets']:
-        nodes = data.get_node_columns([node_type])
-        if nodes is None:
-            raise ValueError(f'No {node_type} found in the node files')
-
-        # TODO test whether this selection is needed, what values could the column contain that we would want to
-        # include or exclude?
-        nodes = nodes.loc[nodes[node_type]]
-        if node_type == "sources":
-            nodes.to_csv(filename_map["sources"], sep= '\t', index=False, columns=['NODEID'], header=False)
-        elif node_type == "targets":
-            nodes.to_csv(filename_map["targets"], sep= '\t', index=False, columns=['NODEID'], header=False)
-
+    for node_type, nodes in data.get_node_columns_separate(['sources', 'targets']).items():
+        nodes.to_csv(filename_map[node_type], sep='\t', index=False, columns=['NODEID'], header=False)

     # Create network file
     edges = data.get_interactome()
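Judging from this diff alone, get_node_columns_separate appears to return a mapping from each requested node type to a DataFrame with a NODEID column; the real implementation lives elsewhere in this commit. A hedged sketch of the consuming pattern, with the return value mocked:

    import pandas as pd

    # mocked stand-in for data.get_node_columns_separate(['sources', 'targets'])
    mock_result = {
        'sources': pd.DataFrame({'NODEID': ['A', 'B']}),
        'targets': pd.DataFrame({'NODEID': ['C']}),
    }
    filename_map = {'sources': 'sources.txt', 'targets': 'targets.txt'}  # hypothetical paths
    for node_type, nodes in mock_result.items():
        nodes.to_csv(filename_map[node_type], sep='\t', index=False, columns=['NODEID'], header=False)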

spras/config/config.py

Lines changed: 20 additions & 19 deletions
@@ -14,17 +14,17 @@

 import copy as copy
 import itertools as it
-import os
 import warnings
+from pathlib import Path
 from typing import Any, Optional

 import numpy as np
 import yaml
 from pytimeparse import parse

 from spras.config.container_schema import ProcessedContainerSettings
-from spras.config.schema import RawConfig
-from spras.util import NpHashEncoder, hash_params_sha1_base32
+from spras.config.schema import DatasetSchema, RawConfig
+from spras.util import LoosePathLike, NpHashEncoder, hash_params_sha1_base32

 config = None

@@ -35,19 +35,7 @@ def init_global(config_dict):

 def init_from_file(filepath):
     global config
-
-    # Handle opening the file and parsing the yaml
-    filepath = os.path.abspath(filepath)
-    try:
-        with open(filepath, 'r') as yaml_file:
-            config_dict = yaml.safe_load(yaml_file)
-    except FileNotFoundError as e:
-        raise RuntimeError(f"Error: The specified config '{filepath}' could not be found.") from e
-    except yaml.YAMLError as e:
-        raise RuntimeError(f"Error: Failed to parse config '{filepath}'") from e
-
-    # And finally, initialize
-    config = Config(config_dict)
+    config = Config.from_file(filepath)


 class Config:

@@ -65,7 +53,7 @@ def __init__(self, raw_config: dict[str, Any]):
         # Directory used for storing output
         self.out_dir = parsed_raw_config.reconstruction_settings.locations.reconstruction_dir
         # A dictionary to store configured datasets against which SPRAS will be run
-        self.datasets = None
+        self.datasets: dict[str, DatasetSchema] = {}
         # A dictionary to store configured gold standard data against output of SPRAS runs
         self.gold_standards = None
         # The hash length SPRAS will use to identify parameter combinations.

@@ -106,6 +94,20 @@ def __init__(self, raw_config: dict[str, Any]):

         self.process_config(parsed_raw_config)

+    @classmethod
+    def from_file(cls, filepath: LoosePathLike):
+        # Handle opening the file and parsing the yaml
+        filepath = Path(filepath).absolute()
+        try:
+            with open(filepath, 'r') as yaml_file:
+                config_dict = yaml.safe_load(yaml_file)
+        except FileNotFoundError as e:
+            raise RuntimeError(f"Error: The specified config '{filepath}' could not be found.") from e
+        except yaml.YAMLError as e:
+            raise RuntimeError(f"Error: Failed to parse config '{filepath}'") from e
+
+        return cls(config_dict)
+
     def process_datasets(self, raw_config: RawConfig):
         """
         Parse dataset information

@@ -118,12 +120,11 @@ def process_datasets(self, raw_config: RawConfig):
         # Currently assumes all datasets have a label and the labels are unique
         # When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts
         # Convert to dicts to simplify the yaml logging
-        self.datasets = {}
         for dataset in raw_config.datasets:
            label = dataset.label
            if label.lower() in [key.lower() for key in self.datasets.keys()]:
                raise ValueError(f"Datasets must have unique case-insensitive labels, but the label {label} appears at least twice.")
-            self.datasets[label] = dict(dataset)
+            self.datasets[label] = dataset

         # parse gold standard information
         self.gold_standards = {gold_standard.label: dict(gold_standard) for gold_standard in raw_config.gold_standards}
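With the YAML loading moved into a classmethod, callers no longer need to go through the module-level init_from_file to build a standalone Config. A minimal usage sketch (the config path is hypothetical):

    from spras.config.config import Config

    config = Config.from_file("config/config.yaml")  # hypothetical path
    print(config.out_dir)                            # parsed reconstruction directory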

spras/config/dataset.py

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+from typing import Annotated
+
+from pydantic import AfterValidator, BaseModel, ConfigDict
+
+from spras.config.util import label_validator
+from spras.util import LoosePathLike
+
+
+class DatasetSchema(BaseModel):
+    """
+    Collection of information related to `Dataset` objects in the configuration.
+    """
+
+    # We prefer AfterValidator here to allow pydantic to run its own
+    # validation & coercion logic before we check it against our own
+    # requirements
+    label: Annotated[str, AfterValidator(label_validator("Dataset"))]
+    node_files: list[LoosePathLike]
+    edge_files: list[LoosePathLike]
+    other_files: list[LoosePathLike]
+    data_dir: LoosePathLike
+
+    model_config = ConfigDict(extra='forbid')
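A sketch of what this schema enforces (field values are hypothetical): well-formed datasets parse cleanly, while the label validator and extra='forbid' reject malformed input with a pydantic ValidationError.

    from pydantic import ValidationError
    from spras.config.dataset import DatasetSchema

    ds = DatasetSchema(label="data0", node_files=["node-prizes.txt"],
                       edge_files=["network.txt"], other_files=[], data_dir="input")

    try:
        DatasetSchema(label="bad label!", node_files=[], edge_files=[],
                      other_files=[], data_dir="input")
    except ValidationError as e:
        print(e)  # labels may only contain letters, numbers, or underscores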

spras/config/schema.py

Lines changed: 3 additions & 27 deletions
@@ -10,14 +10,14 @@
 - `CaseInsensitiveEnum` (see ./util.py)
 """

-import re
 from typing import Annotated

 from pydantic import AfterValidator, BaseModel, ConfigDict

 from spras.config.algorithms import AlgorithmUnion
 from spras.config.container_schema import ContainerSettings
-from spras.config.util import CaseInsensitiveEnum
+from spras.config.dataset import DatasetSchema
+from spras.config.util import CaseInsensitiveEnum, label_validator

 # Most options here have an `include` property,
 # which is meant to make disabling parts of the configuration easier.

@@ -79,30 +79,6 @@ class Analysis(BaseModel):
 # The default length of the truncated hash used to identify parameter combinations
 DEFAULT_HASH_LENGTH = 7

-def label_validator(name: str):
-    """
-    A validator takes in a label
-    and ensures that it contains only letters, numbers, or underscores.
-    """
-    label_pattern = r'^\w+$'
-    def validate(label: str):
-        if not bool(re.match(label_pattern, label)):
-            raise ValueError(f"{name} label '{label}' contains invalid values. {name} labels can only contain letters, numbers, or underscores.")
-        return label
-    return validate
-
-class Dataset(BaseModel):
-    # We prefer AfterValidator here to allow pydantic to run its own
-    # validation & coercion logic before we check it against our own
-    # requirements
-    label: Annotated[str, AfterValidator(label_validator("Dataset"))]
-    node_files: list[str]
-    edge_files: list[str]
-    other_files: list[str]
-    data_dir: str
-
-    model_config = ConfigDict(extra='forbid')
-
 class GoldStandard(BaseModel):
     label: Annotated[str, AfterValidator(label_validator("Gold Standard"))]
     node_files: list[str] = []

@@ -131,7 +107,7 @@ class RawConfig(BaseModel):

     # See algorithms.py for more information about AlgorithmUnion
     algorithms: list[AlgorithmUnion] # type: ignore - pydantic allows this.
-    datasets: list[Dataset]
+    datasets: list[DatasetSchema]
     gold_standards: list[GoldStandard] = []
     analysis: Analysis = Analysis()

spras/config/util.py

Lines changed: 13 additions & 0 deletions
@@ -4,13 +4,26 @@
 only import this config file.
 """

+import re
 from enum import Enum
 from typing import Any

 import yaml
 from pydantic import BaseModel, ConfigDict


+def label_validator(name: str):
+    """
+    A validator takes in a label
+    and ensures that it contains only letters, numbers, or underscores.
+    """
+    label_pattern = r'^\w+$'
+    def validate(label: str):
+        if not bool(re.match(label_pattern, label)):
+            raise ValueError(f"{name} label '{label}' contains invalid values. {name} labels can only contain letters, numbers, or underscores.")
+        return label
+    return validate
+
 # https://stackoverflow.com/a/76883868/7589775
 class CaseInsensitiveEnum(str, Enum):
     """
