Skip to content

Commit a212606

Browse files
Bug fixes, preparation for v1.2.4, and road planning for v2.0
1 parent a40d59f commit a212606

File tree

10 files changed

+47
-18
lines changed

10 files changed

+47
-18
lines changed

CHANGELOG.md

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,37 @@
1-
# Change Log
1+
# Change Log and Look-Ahead
22

33
## [Planned - Long-term project ideas]
44

55
- [ ] Multi-threading support for pre-solving (Snakemake as backbone)
66
- [ ] Make (more) deterministic ([Issue #6](https://github.com/kalininalab/DataSAIL/issues/6))
77
- [ ] Reports of results with plots and tables as PDF and/or HTML
8-
- [ ] Generalization to R-dimensional datasets (see [paper](https://doi.org/10.1101/2023.11.15.566305))
9-
- [ ] Input from config files
108
- [ ] Replace GraKel with something "modern" and fully "conda-installable" to make DataSAIL fully conda-installable
119
- [ ] Include [MashMap3](https://github.com/marbl/MashMap)
1210
- [ ] Include MASH for amino acid sequences
11+
12+
## Roadmap to DataSAIL v2.0
13+
14+
### Must-have features for v2.0
15+
16+
- [ ] Input from config files
17+
- [ ] Generalization to R-dimensional datasets (see [paper](https://doi.org/10.1101/2023.11.15.566305))
18+
- [ ] Support for large molecular datasets, based on sampling and heuristic assignments
19+
- [ ] UMAP (or tSNE?)-based splitting
20+
- [ ] MCES-based splitting
21+
22+
### Nice-to-have features for v2.0
23+
1324
- [ ] Custom clustering methods ([Issue #25](https://github.com/kalininalab/DataSAIL/issues/25))
25+
- [ ] Support for multi-modal data, i.e., combination of different metrics (e.g., sequence similarity and structural similarity)
26+
- [ ] "Stratification" based on continuous labels (e.g., regression tasks)
27+
28+
## Change Log
29+
30+
## v1.2.4 (2025-11-??)
31+
32+
- New ``output`` parameter for Python API to specify output directory and save cluster assignments and splits there.
33+
- Default values of ``epsilon`` and ``delta`` in CLI and API aligned to `0.05`.
34+
- Two bugs in parameter handling fixed.
1435

1536
## v1.2.3 (2025-10-21)
1637

base_recipe.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
package:
2-
version: '1.2.3'
2+
version: '1.2.4'
33

44
source:
55
path: ..

datasail/parsers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,15 +123,15 @@ def parse_datasail_args(args) -> Dict[str, object]:
123123
split.add_argument(
124124
"-d",
125125
"--delta",
126-
default=0.3,
126+
default=0.05,
127127
type=float,
128128
dest=KW_DELTA,
129129
help="Relative error for stratification. This is only used if stratification is provided."
130130
)
131131
split.add_argument(
132132
"-e",
133133
"--epsilon",
134-
default=0.3,
134+
default=0.05,
135135
type=float,
136136
dest=KW_EPSILON,
137137
help="Relative error how much the limits of the splits can be exceeded.",

datasail/reader/utils.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ class DataSet:
2828
names: Optional[List[str]] = None
2929
id_map: Optional[Dict[str, str]] = None
3030
cluster_names: Optional[List[str]] = None
31-
num_clusters: int = 50
31+
num_clusters: Optional[int] = 50
3232
data: Optional[Dict[str, Union[str, np.ndarray]]] = None
3333
cluster_map: Optional[Dict[str, str]] = None
3434
location: Optional[Path] = None
@@ -259,7 +259,7 @@ def read_data(
259259
dist: MATRIX_INPUT,
260260
inter: Optional[List[Tuple[str, str]]],
261261
index: Optional[int],
262-
num_clusters: int,
262+
num_clusters: Optional[int],
263263
tool_args: str,
264264
dataset: DataSet,
265265
) -> DataSet:
@@ -346,7 +346,7 @@ def read_data(
346346
# tmp_classes.add(value)
347347
# dataset.classes = {s: i for i, s in enumerate(tmp_classes)}
348348
# dataset.class_oh = np.eye(len(dataset.classes))
349-
# dataset.num_clusters = num_clusters
349+
dataset.num_clusters = num_clusters
350350

351351
dataset.args = validate_user_args(dataset.type, dataset.format, sim, dist, tool_args)
352352

datasail/report.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@ def save_matrix_tsne(
239239
n_components=2,
240240
learning_rate="auto",
241241
init="random",
242-
perplexity=max(min(math.sqrt(len(distances)), 50), 5),
242+
perplexity=min(math.sqrt(len(distances)), 50), # max(min(math.sqrt(len(distances)), 50), 5),
243243
random_state=42,
244244
).fit_transform(distances)
245245

datasail/routine.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from datasail.reader.read import read_data
1111
from datasail.reader.utils import DataSet
1212
from datasail.report import report
13-
from datasail.settings import DIM_1, LOGGER, KW_INTER, KW_TECHNIQUES, KW_EPSILON, KW_RUNS, KW_SPLITS, KW_NAMES, \
13+
from datasail.settings import DIM_1, KW_CLI, LOGGER, KW_INTER, KW_TECHNIQUES, KW_EPSILON, KW_RUNS, KW_SPLITS, KW_NAMES, \
1414
KW_MAX_SEC, KW_SOLVER, KW_LOGDIR, NOT_ASSIGNED, KW_OUTDIR, MODE_E, MODE_F, DIM_2, SRC_CL, KW_DELTA, \
1515
KW_E_CLUSTERS, KW_F_CLUSTERS, KW_CC, CDHIT, INSTALLED, FOLDSEEK, TMALIGN, CDHIT_EST, DIAMOND, MMSEQS, MASH, TEC_R, TEC_I1, TEC_C1, TEC_I2, TEC_C2, MODE_E, MODE_F, KW_LINKAGE, KW_OVERFLOW
1616
from datasail.solver.overflow import check_dataset
@@ -199,7 +199,7 @@ def datasail_main(**kwargs) -> Optional[Tuple[Dict, Dict, Dict]]:
199199
output_dir=kwargs[KW_OUTDIR],
200200
split_names=kwargs[KW_NAMES],
201201
)
202-
else:
202+
if not kwargs[KW_CLI]:
203203
full_e_name_split_map = fill_split_maps(e_dataset, e_name_split_map)
204204
full_f_name_split_map = fill_split_maps(f_dataset, f_name_split_map)
205205
return full_e_name_split_map, full_f_name_split_map, inter_split_map

datasail/sail.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -159,11 +159,16 @@ def validate_args(**kwargs) -> Dict[str, object]:
159159
return kwargs
160160

161161

162+
def to_path(x):
163+
return Path(x) if isinstance(x, str) and x not in ALGOS else x
164+
165+
162166
def datasail(
163167
techniques: Union[str, List[str], Callable[..., List[str]], Generator[str, None, None]] = None,
164168
inter: Optional[
165169
Union[str, Path, List[Tuple[str, str]], Callable[..., List[str]], Generator[str, None, None]]
166170
] = None,
171+
output: Optional[Union[str, Path]] = None,
167172
max_sec: int = 100,
168173
verbose: str = "W",
169174
splits: List[float] = None,
@@ -200,6 +205,7 @@ def datasail(
200205
Args:
201206
techniques: List of techniques to split based on
202207
inter: Filepath to a TSV file storing interactions of the e-entities and f-entities.
208+
output: Output directory to store the results in.
203209
max_sec: Maximal number of seconds to take for optimizing a found solution.
204210
verbose: Verbosity level for logging.
205211
splits: List of splits, have to add up to one, otherwise scaled accordingly.
@@ -233,11 +239,8 @@ def datasail(
233239
Three dictionaries mapping techniques to another dictionary. The inner dictionary maps input id to their splits.
234240
"""
235241

236-
def to_path(x):
237-
return Path(x) if isinstance(x, str) and x not in ALGOS else x
238-
239242
kwargs = validate_args(
240-
output=None, techniques=techniques, inter=to_path(inter), max_sec=max_sec, verbosity=verbose,
243+
output=to_path(output), techniques=techniques, inter=to_path(inter), max_sec=max_sec, verbosity=verbose,
241244
splits=splits, names=names, delta=delta, epsilon=epsilon, runs=runs, solver=solver, cache=cache,
242245
cache_dir=to_path(cache_dir), linkage=linkage, e_type=e_type, e_data=to_path(e_data),
243246
e_weights=to_path(e_weights), e_strat=to_path(e_strat), e_sim=to_path(e_sim), e_dist=to_path(e_dist),
@@ -257,5 +260,9 @@ def sail(args=None, **kwargs) -> None:
257260
kwargs = parse_datasail_args(args or sys.argv[1:])
258261
kwargs = {key: (kwargs[key] if key in kwargs else val) for key, val in DEFAULT_KWARGS.items()}
259262
kwargs[KW_CLI] = True
263+
for kwarg in [KW_OUTDIR, KW_INTER, KW_CACHE_DIR, KW_E_DATA, KW_E_WEIGHTS, KW_E_STRAT,
264+
KW_E_SIM, KW_E_DIST, KW_F_DATA, KW_F_WEIGHTS, KW_F_STRAT, KW_F_SIM, KW_F_DIST]:
265+
if kwarg in kwargs:
266+
kwargs[kwarg] = to_path(kwargs[kwarg])
260267
kwargs = validate_args(**kwargs)
261268
datasail_main(**kwargs)

datasail/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.2.3"
1+
__version__ = "1.2.4"

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "datasail"
3-
version = "1.2.3"
3+
version = "1.2.4"
44
repository = "https://github.com/kalininalab/DataSAIL"
55
readme = "README.md"
66
description = "A package to compute hard out-of-distribution data splits for machine learning, challenging generalization of models."

tests/test_overflow.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ def test_overflow_assign():
6767
assert e_split_names == {"C1e": ["S2", "S3", "S4", "S5"]}
6868

6969

70+
@pytest.mark.todo
7071
@pytest.mark.parametrize("overflow", ["break", "assign"])
7172
def test_overflow_full(overflow):
7273
e_splits, _, _ = datasail(

0 commit comments

Comments
 (0)