Merged
2 changes: 2 additions & 0 deletions .gitignore
@@ -19,3 +19,5 @@ bin/.vscode/
.nf-test/
prototype/
*.ipynb
CLAUDE.md
.claude
19 changes: 2 additions & 17 deletions README.md
@@ -21,24 +21,9 @@

## Introduction

**nf-core/deepmodeloptim** is a bioinformatics end-to-end pipeline designed to facilitate the testing and development of deep learning models for genomics.
**nf-core/deepmodeloptim** augments your biological data toward an optimal, task-specific training set.

Deep learning model development in natural science is an empirical and costly process. Despite the existence of generic tools for the tuning of hyperparameters and the training of the models, the connection between these procedures and the impact coming from the data is often overlooked, or at least not easily automated. Indeed, researchers must define a pre-processing pipeline, an architecture, find the best parameters for said architecture and iterate over this process, often manually.

Leveraging the power of Nextflow (polyglotism, container integration, scalable on the cloud), this pipeline will help users to 1) automate the testing of the model, 2) gain useful insights with respect to the learning behaviour of the model, and hence 3) accelerate the development.

## Pipeline summary

It takes as input:

- A dataset
- A configuration file to describe the data pre-processing steps to be performed
- A user-defined PyTorch model
- A configuration file describing the range of parameters for the PyTorch model

It then transforms the data according to all possible pre-processing steps, finds the best architecture parameters for each of the transformed datasets, performs sanity checks on the models and trains a minimal deep learning version for each dataset/architecture.

Those experiments are then compiled into an intuitive report, making it easier for scientists to pick the best design choice to be sent to large scale training.
Deep learning methods are largely equivalent (see the neural scaling laws literature); most of the performance is driven by the training data.

<picture>
<source media="(prefers-color-scheme: dark)" srcset="assets/metromap.png">
2 changes: 1 addition & 1 deletion conf/modules.config
@@ -90,7 +90,7 @@ process {
// main config
// ==============================================================================

withName: "STIMULUS_SPLIT_TRANSFORM" {
withName: "STIMULUS_SPLIT_YAML" {
publishDir = [
path: { "${params.outdir}/configs/${meta.id}" },
mode: params.publish_dir_mode,
6 changes: 5 additions & 1 deletion modules/local/custom/modify_model_config/main.nf
@@ -20,7 +20,11 @@ process CUSTOM_MODIFY_MODEL_CONFIG {
meta_updated = meta + ["n_trials": "${n_trials}"]
"""
# substitute the line containing n_trials in the config file with n_trials: \${n_trials};
# if n_trials is an empty list ([]), keep the config unchanged
awk -v n_trials=${n_trials} '/n_trials: [0-9]+/ {gsub(/n_trials: [0-9]+/, "n_trials: " n_trials)}1' ${config} > ${prefix}.yaml
if [ "${n_trials}" = "[]" ]; then
cp "${config}" "${prefix}.yaml"
else
awk -v n_trials="${n_trials}" '/n_trials: [0-9]+/ {gsub(/n_trials: [0-9]+/, "n_trials: " n_trials)}1' "${config}" > "${prefix}.yaml"
fi

cat <<-END_VERSIONS > versions.yml
"${task.process}":
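The guard added above can be exercised outside Nextflow. A minimal sketch of the same logic, with Nextflow's `${n_trials}` and `${config}` interpolations replaced by plain shell variables (the sample config content is hypothetical):

```shell
#!/bin/sh
# Hypothetical model config containing an n_trials line
printf 'model: mlp\nn_trials: 10\nlr: 0.001\n' > config.yaml

n_trials=50   # in the pipeline this comes from the Nextflow channel

if [ "$n_trials" = "[]" ]; then
    # An empty Groovy list stringifies to "[]": keep the config unchanged
    cp config.yaml out.yaml
else
    # Rewrite only lines matching "n_trials: <number>"; the trailing 1 prints every line
    awk -v n_trials="$n_trials" \
        '/n_trials: [0-9]+/ {gsub(/n_trials: [0-9]+/, "n_trials: " n_trials)}1' \
        config.yaml > out.yaml
fi

cat out.yaml
```

Running this leaves `model` and `lr` untouched and rewrites the `n_trials` line to `n_trials: 50`.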
9 changes: 4 additions & 5 deletions modules/local/stimulus/check_model/main.nf
@@ -6,11 +6,10 @@ process CHECK_MODEL {
container "docker.io/mathysgrapotte/stimulus-py:dev"

input:
tuple val(meta), path(data_config)
tuple val(meta2), path(data)
tuple val(meta3), path(model)
tuple val(meta4), path(model_config)
tuple val(meta5), path(initial_weights)
tuple val(meta1), path(data)
tuple val(meta2), path(model)
tuple val(meta3), path(model_config)
tuple val(meta4), path(initial_weights)

output:
stdout emit: standardout
2 changes: 1 addition & 1 deletion modules/local/stimulus/predict/main.nf
@@ -5,7 +5,7 @@ process STIMULUS_PREDICT {

input:
tuple val(meta) , path(model), path(model_config), path(weigths)
tuple val(meta2), path(data), path(config)
tuple val(meta2), path(data)

output:
tuple val(meta), path("${prefix}-pred.safetensors"), emit: predictions
37 changes: 0 additions & 37 deletions modules/local/stimulus/split_split/main.nf

This file was deleted.

37 changes: 0 additions & 37 deletions modules/local/stimulus/split_transform/main.nf

This file was deleted.

41 changes: 41 additions & 0 deletions modules/local/stimulus/split_yaml/main.nf
@@ -0,0 +1,41 @@
process STIMULUS_SPLIT_YAML {

tag "$meta.id"
label 'process_low'
// TODO: push image to nf-core quay.io
container "docker.io/mathysgrapotte/stimulus-py:dev"

input:
tuple val(meta), path(data_config)

output:
tuple val(meta), path("*_encode.yaml") , emit: encode_config
tuple val(meta), path("*_split.yaml") , emit: split_config
tuple val(meta), path("*_transform.yaml") , emit: transform_config
path "versions.yml" , emit: versions

script:
"""
stimulus split-yaml -y ${data_config} --out-dir ./

cat <<-END_VERSIONS > versions.yml
"${task.process}":
stimulus: \$(stimulus -v | cut -d ' ' -f 3)
END_VERSIONS
"""

stub:
def prefix = data_config.baseName
"""
touch ${prefix}_encode.yaml
touch ${prefix}_RandomSplit_70-30_split.yaml
touch ${prefix}_noise_std0.1_transform.yaml
touch ${prefix}_noise_std0.2_transform.yaml
touch ${prefix}_noise_std0.3_transform.yaml

cat <<-END_VERSIONS > versions.yml
"${task.process}":
stimulus: \$(stimulus -v | cut -d ' ' -f 3)
END_VERSIONS
"""
}
3 changes: 1 addition & 2 deletions modules/local/stimulus/tune/main.nf
@@ -4,7 +4,7 @@ process STIMULUS_TUNE {
container "docker.io/mathysgrapotte/stimulus-py:dev"

input:
tuple val(meta), path(transformed_data), path(data_sub_config)
tuple val(meta), path(transformed_data)
tuple val(meta2), path(model), path(model_config), path(initial_weights)

output:
@@ -15,7 +15,6 @@
path "versions.yml" , emit: versions
// now we need to output these in this format for the predict module - this will have to be changed!
tuple val(meta), path(model), path("best_config.json"), path("${prefix}-best-model.safetensors"), emit: model_tmp
tuple val(meta), path(data_sub_config) , emit: data_config_tmp

script:
prefix = task.ext.prefix ?: meta.id
2 changes: 0 additions & 2 deletions subworkflows/local/check_model/main.nf
@@ -17,7 +17,6 @@ workflow CHECK_MODEL_WF {

take:
ch_data
ch_data_config
ch_model
ch_model_config
ch_initial_weights
Expand All @@ -27,7 +26,6 @@ workflow CHECK_MODEL_WF {
ch_versions = Channel.empty()

CHECK_MODEL(
ch_data_config,
ch_data,
ch_model,
ch_model_config,
41 changes: 0 additions & 41 deletions subworkflows/local/split_data_config_split/main.nf

This file was deleted.

40 changes: 0 additions & 40 deletions subworkflows/local/split_data_config_transform/main.nf

This file was deleted.

58 changes: 58 additions & 0 deletions subworkflows/local/split_data_config_unified/main.nf
@@ -0,0 +1,58 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
IMPORT NF-CORE MODULES/SUBWORKFLOWS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

include { STIMULUS_SPLIT_YAML } from '../../../modules/local/stimulus/split_yaml'

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
RUN MAIN WORKFLOW
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

workflow SPLIT_DATA_CONFIG_UNIFIED_WF {
take:
ch_data_config

main:

ch_versions = Channel.empty()

STIMULUS_SPLIT_YAML( ch_data_config )
ch_versions = ch_versions.mix(STIMULUS_SPLIT_YAML.out.versions)

// Process split configs - transpose and add split_id to meta
ch_split_configs = STIMULUS_SPLIT_YAML.out.split_config
.transpose()
.map { meta, yaml ->
// Extract split info from descriptive filename
def split_id = yaml.baseName.replaceAll(/.*_([^_]+_[^_]+)_split$/, '$1')
[ meta + [split_id: split_id], yaml]
}

// Process transform configs - transpose and add transform_id to meta
ch_transform_configs = STIMULUS_SPLIT_YAML.out.transform_config
.transpose()
.map { meta, yaml ->
// Extract transform info from descriptive filename
def transform_id = yaml.baseName.replaceAll(/.*_([^_]+_[^_]+)_transform$/, '$1')
[ meta + [transform_id: transform_id], yaml]
}

// Encoding configs don't need transposition as there's only one per input
ch_encoding_configs = STIMULUS_SPLIT_YAML.out.encode_config

emit:
split_config = ch_split_configs // channel: [ meta + [split_id: split_id], yaml ]
transform_config = ch_transform_configs // channel: [ meta + [transform_id: transform_id], yaml ]
encode_config = ch_encoding_configs // channel: [ meta, yaml ]
versions = ch_versions // channel: [ versions.yml ]
}

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
THE END
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/