
Commit f69a0f3

Merge branch 'main' into hash

2 parents: c7262ed + 18f2cf8


48 files changed: +367, -247 lines

Snakefile

Lines changed: 6 additions & 6 deletions

@@ -60,10 +60,10 @@ def write_parameter_log(algorithm, param_label, logfile):
 def write_dataset_log(dataset, logfile):
     dataset_contents = get_dataset(_config.config.datasets,dataset)

-    # safe_dump gives RepresenterError for an OrderedDict
-    # config file has to convert the dataset from OrderedDict to dict to avoid this
-    with open(logfile,'w') as f:
-        yaml.safe_dump(dataset_contents,f)
+    # safe_dump gives RepresenterError for a DatasetSchema
+    # config file has to convert the dataset to a dict to avoid this
+    with open(logfile, 'w') as f:
+        yaml.safe_dump(dict(dataset_contents), f)

 # Choose the final files expected according to the config file options.
 def make_final_input(wildcards):

@@ -154,9 +154,9 @@ rule log_datasets:
 # Input preparation needs to be rerun if these files are modified
 def get_dataset_dependencies(wildcards):
     dataset = _config.config.datasets[wildcards.dataset]
-    all_files = dataset["node_files"] + dataset["edge_files"] + dataset["other_files"]
+    all_files = dataset.node_files + dataset.edge_files + dataset.other_files
     # Add the relative file path
-    all_files = [dataset["data_dir"] + SEP + data_file for data_file in all_files]
+    all_files = [dataset.data_dir + SEP + data_file for data_file in all_files]

     return all_files
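
The dict() conversion in the first hunk matters because yaml.safe_dump only knows how to represent plain Python types. A minimal sketch of the failure mode, using a toy pydantic model as a stand-in for the real DatasetSchema:

    import yaml
    from pydantic import BaseModel

    class ToyDataset(BaseModel):  # stand-in for DatasetSchema; fields are illustrative
        label: str = "egfr"
        data_dir: str = "input"

    contents = ToyDataset()
    # yaml.safe_dump(contents)             # raises yaml.representer.RepresenterError
    print(yaml.safe_dump(dict(contents)))  # dict(model) iterates (field, value) pairs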

docs/htcondor.rst

Lines changed: 3 additions & 2 deletions

@@ -54,10 +54,11 @@ might look like:

 .. code:: bash

-    apptainer build spras-v0.6.0.sif docker://reedcompbio/spras:v0.6.0
+    apptainer build spras-v0.6.0.sif docker://reedcompbio/spras:0.6.0

 After running this command, a new file called ``spras-v0.6.0.sif`` will
-exist in the directory where the command was run.
+exist in the directory where the command was run. Note that the Docker
+image does not use a "v" in the tag.

 Submitting All Jobs to a Single EP
 ----------------------------------

spras/allpairs.py

Lines changed: 0 additions & 2 deletions

@@ -35,8 +35,6 @@ def generate_inputs(data: Dataset, filename_map):
     # Get sources and targets for node input file
     # Borrowed code from pathlinker.py
     sources_targets = data.get_node_columns(["sources", "targets"])
-    if sources_targets is None:
-        raise ValueError("All Pairs Shortest Paths requires sources and targets")

     both_series = sources_targets.sources & sources_targets.targets
     for _index, row in sources_targets[both_series].iterrows():
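
The surviving both_series line relies on pandas boolean indexing to find nodes flagged as both sources and targets; a self-contained illustration with made-up node data:

    import pandas as pd

    sources_targets = pd.DataFrame({
        "NODEID": ["A", "B", "C"],
        "sources": [True, True, False],
        "targets": [True, False, True],
    })
    both_series = sources_targets.sources & sources_targets.targets
    print(sources_targets[both_series])  # only A is both a source and a target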

spras/analysis/ml.py

Lines changed: 4 additions & 1 deletion

@@ -459,8 +459,11 @@ def jaccard_similarity_eval(summary_df: pd.DataFrame, output_file: str | PathLik
     ax.set_yticklabels(algorithms)
     plt.colorbar(cax, ax=ax)
     # annotate each cell with the corresponding similarity value
+    # where we set the precision to be lower as the number of algorithms increases
+    n = 2
+    if len(algorithms) > 10: n = 1
     for i in range(len(algorithms)):
         for j in range(len(algorithms)):
-            ax.text(j, i, f'{jaccard_matrix.values[i, j]:.2f}', ha='center', va='center', color='white')
+            ax.text(j, i, f'{jaccard_matrix.values[i, j]:.{n}f}', ha='center', va='center', color='white')
     plt.savefig(output_png, bbox_inches="tight", dpi=DPI)
     plt.close()
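
A runnable sketch of the adaptive-precision annotation added here, with random stand-in values in place of the real Jaccard matrix:

    import numpy as np
    import matplotlib.pyplot as plt

    algorithms = [f"alg{i}" for i in range(12)]
    jaccard = np.random.default_rng(0).uniform(size=(12, 12))

    fig, ax = plt.subplots()
    cax = ax.matshow(jaccard)
    plt.colorbar(cax, ax=ax)
    n = 2
    if len(algorithms) > 10:
        n = 1  # one decimal keeps the labels legible in smaller cells
    for i in range(len(algorithms)):
        for j in range(len(algorithms)):
            ax.text(j, i, f'{jaccard[i, j]:.{n}f}', ha='center', va='center', color='white')
    plt.savefig("jaccard.png", bbox_inches="tight")
    plt.close()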

spras/btb.py

Lines changed: 2 additions & 13 deletions

@@ -44,19 +44,8 @@ def generate_inputs(data, filename_map):

     # Get sources and write to file, repeat for targets
     # Does not check whether a node is a source and a target
-    for node_type in ['sources', 'targets']:
-        nodes = data.get_node_columns([node_type])
-        if nodes is None:
-            raise ValueError(f'No {node_type} found in the node files')
-
-        # TODO test whether this selection is needed, what values could the column contain that we would want to
-        # include or exclude?
-        nodes = nodes.loc[nodes[node_type]]
-        if node_type == "sources":
-            nodes.to_csv(filename_map["sources"], sep= '\t', index=False, columns=['NODEID'], header=False)
-        elif node_type == "targets":
-            nodes.to_csv(filename_map["targets"], sep= '\t', index=False, columns=['NODEID'], header=False)
-
+    for node_type, nodes in data.get_node_columns_separate(['sources', 'targets']).items():
+        nodes.to_csv(filename_map[node_type], sep='\t', index=False, columns=['NODEID'], header=False)

     # Create network file
     edges = data.get_interactome()
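
The new get_node_columns_separate helper is not shown in this commit; from the call site above it appears to return a mapping from each requested column name to a DataFrame ready for writing. A hypothetical sketch of that contract (the filtering logic is inferred, not confirmed by this diff):

    import pandas as pd

    # Hypothetical: one DataFrame per boolean node column, keeping only the True rows
    def get_node_columns_separate_sketch(node_table, node_types):
        return {t: node_table.loc[node_table[t], ["NODEID"]] for t in node_types}

    nodes = pd.DataFrame({"NODEID": ["A", "B"],
                          "sources": [True, False],
                          "targets": [False, True]})
    for node_type, selected in get_node_columns_separate_sketch(nodes, ["sources", "targets"]).items():
        selected.to_csv(f"{node_type}.txt", sep="\t", index=False, columns=["NODEID"], header=False)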

spras/config/config.py

Lines changed: 21 additions & 22 deletions

@@ -17,7 +17,6 @@
 import hashlib
 import importlib.metadata
 import itertools as it
-import os
 import subprocess
 import tomllib
 import warnings

@@ -28,8 +27,8 @@
 import yaml

 from spras.config.container_schema import ProcessedContainerSettings
-from spras.config.schema import RawConfig
-from spras.util import NpHashEncoder, hash_params_sha1_base32
+from spras.config.schema import DatasetSchema, RawConfig
+from spras.util import LoosePathLike, NpHashEncoder, hash_params_sha1_base32

 config = None

@@ -93,19 +92,7 @@ def init_global(config_dict):

 def init_from_file(filepath):
     global config
-
-    # Handle opening the file and parsing the yaml
-    filepath = os.path.abspath(filepath)
-    try:
-        with open(filepath, 'r') as yaml_file:
-            config_dict = yaml.safe_load(yaml_file)
-    except FileNotFoundError as e:
-        raise RuntimeError(f"Error: The specified config '{filepath}' could not be found.") from e
-    except yaml.YAMLError as e:
-        raise RuntimeError(f"Error: Failed to parse config '{filepath}'") from e
-
-    # And finally, initialize
-    config = Config(config_dict)
+    config = Config.from_file(filepath)


 class Config:

@@ -123,7 +110,7 @@ def __init__(self, raw_config: dict[str, Any]):
         # Directory used for storing output
         self.out_dir = parsed_raw_config.reconstruction_settings.locations.reconstruction_dir
         # A dictionary to store configured datasets against which SPRAS will be run
-        self.datasets = None
+        self.datasets: dict[str, DatasetSchema] = {}
         # A dictionary to store configured gold standard data against output of SPRAS runs
         self.gold_standards = None
         # The hash length SPRAS will use to identify parameter combinations.

@@ -162,6 +149,20 @@ def __init__(self, raw_config: dict[str, Any]):

         self.process_config(parsed_raw_config)

+    @classmethod
+    def from_file(cls, filepath: LoosePathLike):
+        # Handle opening the file and parsing the yaml
+        filepath = Path(filepath).absolute()
+        try:
+            with open(filepath, 'r') as yaml_file:
+                config_dict = yaml.safe_load(yaml_file)
+        except FileNotFoundError as e:
+            raise RuntimeError(f"Error: The specified config '{filepath}' could not be found.") from e
+        except yaml.YAMLError as e:
+            raise RuntimeError(f"Error: Failed to parse config '{filepath}'") from e
+
+        return cls(config_dict)
+
     def process_datasets(self, raw_config: RawConfig):
         """
         Parse dataset information

@@ -176,16 +177,14 @@ def process_datasets(self, raw_config: RawConfig):
         # Convert to dicts to simplify the yaml logging

         for dataset in raw_config.datasets:
+            label = dataset.label
+            if label.lower() in [key.lower() for key in self.datasets.keys()]:
+                raise ValueError(f"Datasets must have unique case-insensitive labels, but the label {label} appears at least twice.")
             dataset.label = attach_spras_revision(dataset.label)
         for gold_standard in raw_config.gold_standards:
             gold_standard.label = attach_spras_revision(gold_standard.label)

-        self.datasets = {}
-        for dataset in raw_config.datasets:
-            label = dataset.label
-            if label.lower() in [key.lower() for key in self.datasets.keys()]:
-                raise ValueError(f"Datasets must have unique case-insensitive labels, but the label {label} appears at least twice.")
-            self.datasets[label] = dict(dataset)

         # parse gold standard information
         self.gold_standards = {gold_standard.label: dict(gold_standard) for gold_standard in raw_config.gold_standards}
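
With the parsing moved into a classmethod, a Config can be built directly from a path; a minimal usage sketch (the path is illustrative):

    from spras.config.config import Config

    # Raises RuntimeError if the file is missing or is not valid YAML
    config = Config.from_file("config/config.yaml")

init_from_file keeps its old behavior of setting the module-level global, but now delegates all of the file handling to Config.from_file.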

spras/config/dataset.py

Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
+from typing import Annotated
+
+from pydantic import AfterValidator, BaseModel, ConfigDict
+
+from spras.config.util import label_validator
+from spras.util import LoosePathLike
+
+
+class DatasetSchema(BaseModel):
+    """
+    Collection of information related to `Dataset` objects in the configuration.
+    """
+
+    # We prefer AfterValidator here to allow pydantic to run its own
+    # validation & coercion logic before we check it against our own
+    # requirements
+    label: Annotated[str, AfterValidator(label_validator("Dataset"))]
+    node_files: list[LoosePathLike]
+    edge_files: list[LoosePathLike]
+    other_files: list[LoosePathLike]
+    data_dir: LoosePathLike
+
+    model_config = ConfigDict(extra='forbid')
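
A short validation sketch for the new model; the field values are illustrative:

    from spras.config.dataset import DatasetSchema

    dataset = DatasetSchema(
        label="egfr",              # must match ^\w+$, enforced by label_validator
        node_files=["nodes.txt"],
        edge_files=["edges.txt"],
        other_files=[],
        data_dir="input",
    )

    # extra='forbid' means an unknown key (e.g. a typo) raises pydantic.ValidationError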

spras/config/schema.py

Lines changed: 3 additions & 27 deletions

@@ -10,14 +10,14 @@
 - `CaseInsensitiveEnum` (see ./util.py)
 """

-import re
 from typing import Annotated

 from pydantic import AfterValidator, BaseModel, ConfigDict

 from spras.config.algorithms import AlgorithmUnion
 from spras.config.container_schema import ContainerSettings
-from spras.config.util import CaseInsensitiveEnum
+from spras.config.dataset import DatasetSchema
+from spras.config.util import CaseInsensitiveEnum, label_validator

 # Most options here have an `include` property,
 # which is meant to make disabling parts of the configuration easier.

@@ -79,30 +79,6 @@ class Analysis(BaseModel):
 # The default length of the truncated hash used to identify parameter combinations
 DEFAULT_HASH_LENGTH = 7

-def label_validator(name: str):
-    """
-    A validator takes in a label
-    and ensures that it contains only letters, numbers, or underscores.
-    """
-    label_pattern = r'^\w+$'
-    def validate(label: str):
-        if not bool(re.match(label_pattern, label)):
-            raise ValueError(f"{name} label '{label}' contains invalid values. {name} labels can only contain letters, numbers, or underscores.")
-        return label
-    return validate
-
-class Dataset(BaseModel):
-    # We prefer AfterValidator here to allow pydantic to run its own
-    # validation & coercion logic before we check it against our own
-    # requirements
-    label: Annotated[str, AfterValidator(label_validator("Dataset"))]
-    node_files: list[str]
-    edge_files: list[str]
-    other_files: list[str]
-    data_dir: str
-
-    model_config = ConfigDict(extra='forbid')
-
 class GoldStandard(BaseModel):
     label: Annotated[str, AfterValidator(label_validator("Gold Standard"))]
     node_files: list[str] = []

@@ -131,7 +107,7 @@ class RawConfig(BaseModel):

     # See algorithms.py for more information about AlgorithmUnion
     algorithms: list[AlgorithmUnion] # type: ignore - pydantic allows this.
-    datasets: list[Dataset]
+    datasets: list[DatasetSchema]
     gold_standards: list[GoldStandard] = []
     analysis: Analysis = Analysis()
spras/config/util.py

Lines changed: 13 additions & 0 deletions

@@ -4,13 +4,26 @@
 only import this config file.
 """

+import re
 from enum import Enum
 from typing import Any

 import yaml
 from pydantic import BaseModel, ConfigDict


+def label_validator(name: str):
+    """
+    A validator takes in a label
+    and ensures that it contains only letters, numbers, or underscores.
+    """
+    label_pattern = r'^\w+$'
+    def validate(label: str):
+        if not bool(re.match(label_pattern, label)):
+            raise ValueError(f"{name} label '{label}' contains invalid values. {name} labels can only contain letters, numbers, or underscores.")
+        return label
+    return validate
+
 # https://stackoverflow.com/a/76883868/7589775
 class CaseInsensitiveEnum(str, Enum):
     """
