Bug fixes, preparation for v1.2.4, and roadmap planning for v2.0

Old-Shatterhand · Old-Shatterhand · commit a212606c4ad9 · 2025-11-29T01:10:28.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,16 +1,37 @@
-# Change Log
+# Change Log and Look-Ahead
 
 ## [Planned - Long-term project ideas]
 
 - [ ] Multi-threading support for pre-solving (Snakemake as backbone)
 - [ ] Make (more) deterministic ([Issue #6](https://github.com/kalininalab/DataSAIL/issues/6))
 - [ ] Reports of results with plots and tables as PDF and/or HTML
-- [ ] Generalization to R-dimensional datasets (see [paper](https://doi.org/10.1101/2023.11.15.566305))
-- [ ] Input from config files
 - [ ] Replace GraKel with something "modern" and fully "conda-installable" to make DataSAIL fully conda-installable
 - [ ] Include [MashMap3](https://github.com/marbl/MashMap)
 - [ ] Include MASH for amino acid sequences
+
+## Roadmap to DataSAIL v2.0
+
+### Must-have features for v2.0
+
+- [ ] Input from config files
+- [ ] Generalization to R-dimensional datasets (see [paper](https://doi.org/10.1101/2023.11.15.566305))
+- [ ] Support for large molecular datasets, based on sampling and heuristic assignments
+- [ ] UMAP (or tSNE?)-based splitting
+- [ ] MCES-based splitting
+
+### Nice-to-have features for v2.0
+
 - [ ] Custom clustering methods ([Issue #25](https://github.com/kalininalab/DataSAIL/issues/25))
+- [ ] Support for multi-modal data, i.e., combination of different metrics (e.g., sequence similarity and structural similarity)
+- [ ] "Stratification" based on continuous labels (e.g., regression tasks)
+
+## Change Log
+
+## v1.2.4 (2025-11-??)
+
+- New ``output`` parameter for Python API to specify output directory and save cluster assignments and splits there.
+- Default values of ``epsilon`` and ``delta`` in CLI and API aligned to `0.05`.
+- Two bugs in parameter handling fixed.
 
 ## v1.2.3 (2025-10-21)
 
diff --git a/base_recipe.yaml b/base_recipe.yaml
@@ -1,5 +1,5 @@
 package:
-  version: '1.2.3'
+  version: '1.2.4'
 
 source:
   path: ..
diff --git a/datasail/parsers.py b/datasail/parsers.py
@@ -123,15 +123,15 @@ def parse_datasail_args(args) -> Dict[str, object]:
     split.add_argument(
         "-d",
         "--delta",
-        default=0.3,
+        default=0.05,
         type=float,
         dest=KW_DELTA,
         help="Relative error for stratification. This is only used if stratification is provided."
     )
     split.add_argument(
         "-e",
         "--epsilon",
-        default=0.3,
+        default=0.05,
         type=float,
         dest=KW_EPSILON,
         help="Relative error how much the limits of the splits can be exceeded.",
diff --git a/datasail/reader/utils.py b/datasail/reader/utils.py
@@ -28,7 +28,7 @@ class DataSet:
     names: Optional[List[str]] = None
     id_map: Optional[Dict[str, str]] = None
     cluster_names: Optional[List[str]] = None
-    num_clusters: int = 50
+    num_clusters: Optional[int] = 50
     data: Optional[Dict[str, Union[str, np.ndarray]]] = None
     cluster_map: Optional[Dict[str, str]] = None
     location: Optional[Path] = None
@@ -259,7 +259,7 @@ def read_data(
         dist: MATRIX_INPUT,
         inter: Optional[List[Tuple[str, str]]],
         index: Optional[int],
-        num_clusters: int,
+        num_clusters: Optional[int],
         tool_args: str,
         dataset: DataSet,
 ) -> DataSet:
@@ -346,7 +346,7 @@ def read_data(
     #         tmp_classes.add(value)
     # dataset.classes = {s: i for i, s in enumerate(tmp_classes)}
     # dataset.class_oh = np.eye(len(dataset.classes))
-    # dataset.num_clusters = num_clusters
+    dataset.num_clusters = num_clusters
 
     dataset.args = validate_user_args(dataset.type, dataset.format, sim, dist, tool_args)
 
diff --git a/datasail/report.py b/datasail/report.py
@@ -239,7 +239,7 @@ def save_matrix_tsne(
         n_components=2,
         learning_rate="auto",
         init="random",
-        perplexity=max(min(math.sqrt(len(distances)), 50), 5),
+        perplexity=min(math.sqrt(len(distances)), 50),  # max(min(math.sqrt(len(distances)), 50), 5),
         random_state=42,
     ).fit_transform(distances)
 
diff --git a/datasail/routine.py b/datasail/routine.py
@@ -10,7 +10,7 @@
 from datasail.reader.read import read_data
 from datasail.reader.utils import DataSet
 from datasail.report import report
-from datasail.settings import DIM_1, LOGGER, KW_INTER, KW_TECHNIQUES, KW_EPSILON, KW_RUNS, KW_SPLITS, KW_NAMES, \
+from datasail.settings import DIM_1, KW_CLI, LOGGER, KW_INTER, KW_TECHNIQUES, KW_EPSILON, KW_RUNS, KW_SPLITS, KW_NAMES, \
     KW_MAX_SEC, KW_SOLVER, KW_LOGDIR, NOT_ASSIGNED, KW_OUTDIR, MODE_E, MODE_F, DIM_2, SRC_CL, KW_DELTA, \
     KW_E_CLUSTERS, KW_F_CLUSTERS, KW_CC, CDHIT, INSTALLED, FOLDSEEK, TMALIGN, CDHIT_EST, DIAMOND, MMSEQS, MASH, TEC_R, TEC_I1, TEC_C1, TEC_I2, TEC_C2, MODE_E, MODE_F, KW_LINKAGE, KW_OVERFLOW
 from datasail.solver.overflow import check_dataset
@@ -199,7 +199,7 @@ def datasail_main(**kwargs) -> Optional[Tuple[Dict, Dict, Dict]]:
             output_dir=kwargs[KW_OUTDIR],
             split_names=kwargs[KW_NAMES],
         )
-    else:
+    if not kwargs[KW_CLI]:
         full_e_name_split_map = fill_split_maps(e_dataset, e_name_split_map)
         full_f_name_split_map = fill_split_maps(f_dataset, f_name_split_map)
         return full_e_name_split_map, full_f_name_split_map, inter_split_map
diff --git a/datasail/sail.py b/datasail/sail.py
@@ -159,11 +159,16 @@ def validate_args(**kwargs) -> Dict[str, object]:
     return kwargs
 
 
+def to_path(x):
+    return Path(x) if isinstance(x, str) and x not in ALGOS else x
+
+
 def datasail(
         techniques: Union[str, List[str], Callable[..., List[str]], Generator[str, None, None]] = None,
         inter: Optional[
             Union[str, Path, List[Tuple[str, str]], Callable[..., List[str]], Generator[str, None, None]]
         ] = None,
+        output: Optional[Union[str, Path]] = None,
         max_sec: int = 100,
         verbose: str = "W",
         splits: List[float] = None,
@@ -200,6 +205,7 @@ def datasail(
     Args:
         techniques: List of techniques to split based on
         inter: Filepath to a TSV file storing interactions of the e-entities and f-entities.
+        output: Output directory to store the results in.
         max_sec: Maximal number of seconds to take for optimizing a found solution.
         verbose: Verbosity level for logging.
         splits: List of splits, have to add up to one, otherwise scaled accordingly.
@@ -233,11 +239,8 @@ def datasail(
         Three dictionaries mapping techniques to another dictionary. The inner dictionary maps input id to their splits.
     """
 
-    def to_path(x):
-        return Path(x) if isinstance(x, str) and x not in ALGOS else x
-
     kwargs = validate_args(
-        output=None, techniques=techniques, inter=to_path(inter), max_sec=max_sec, verbosity=verbose,
+        output=to_path(output), techniques=techniques, inter=to_path(inter), max_sec=max_sec, verbosity=verbose,
         splits=splits, names=names, delta=delta, epsilon=epsilon, runs=runs, solver=solver, cache=cache,
         cache_dir=to_path(cache_dir), linkage=linkage, e_type=e_type, e_data=to_path(e_data),
         e_weights=to_path(e_weights), e_strat=to_path(e_strat), e_sim=to_path(e_sim), e_dist=to_path(e_dist),
@@ -257,5 +260,9 @@ def sail(args=None, **kwargs) -> None:
         kwargs = parse_datasail_args(args or sys.argv[1:])
     kwargs = {key: (kwargs[key] if key in kwargs else val) for key, val in DEFAULT_KWARGS.items()}
     kwargs[KW_CLI] = True
+    for kwarg in [KW_OUTDIR, KW_INTER, KW_CACHE_DIR, KW_E_DATA, KW_E_WEIGHTS, KW_E_STRAT,
+                  KW_E_SIM, KW_E_DIST, KW_F_DATA, KW_F_WEIGHTS, KW_F_STRAT, KW_F_SIM, KW_F_DIST]:
+        if kwarg in kwargs:
+            kwargs[kwarg] = to_path(kwargs[kwarg])
     kwargs = validate_args(**kwargs)
     datasail_main(**kwargs)
diff --git a/datasail/version.py b/datasail/version.py
@@ -1 +1 @@
-__version__ = "1.2.3"
+__version__ = "1.2.4"
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "datasail"
-version = "1.2.3"
+version = "1.2.4"
 repository = "https://github.com/kalininalab/DataSAIL"
 readme = "README.md"
 description = "A package to compute hard out-of-distribution data splits for machine learning, challenging generalization of models."
diff --git a/tests/test_overflow.py b/tests/test_overflow.py
@@ -67,6 +67,7 @@ def test_overflow_assign():
     assert e_split_names == {"C1e": ["S2", "S3", "S4", "S5"]}
 
 
+@pytest.mark.todo
 @pytest.mark.parametrize("overflow", ["break", "assign"])
 def test_overflow_full(overflow):
     e_splits, _, _ = datasail(

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "1.2.3"`
	`1`	`+__version__ = "1.2.4"`