Dev: split.DatasetSplitter implementation (todo)

niklases · niklases · commit e4dc48dd4faf · 2025-07-11T19:47:57.000+02:00
diff --git a/README.md b/README.md
@@ -71,9 +71,9 @@ pypef --help
 After installation, a rudimentary graphical user interface (GUI) can be invoked using the command
 
 ```bash
-pypef-gui       # loading takes some seconds
-# command for extra debug/tqdm progress terminal window:
-#pypef-gui-cli  
+pypef-gui      # loading takes a few seconds
+# or
+pypef-gui-cli  # keeps the background debug/tqdm progress output visible in the terminal on Windows
 ```
 
 <p align="center">
diff --git a/pypef/utils/split.py b/pypef/utils/split.py
@@ -15,12 +15,11 @@
 class DatasetSplitter:
     def __init__(
             self, 
-            csv_file: str | PathLike, 
+            df_or_csv_file: str | PathLike | pd.DataFrame, 
+            n_cv: int | None = None,
             mutation_column: str | None = None, 
-            separator: str | None = None, 
-            n_cv: int | None = None
+            separator: str | None = None
     ):
-        self.csv_file = csv_file
         if mutation_column is None:
             mutation_column = 'mutant'
         self.mutation_column = mutation_column
@@ -30,7 +29,10 @@ def __init__(
         if n_cv is None:
             n_cv = 5
         self.n_cv = n_cv
-        self.df = pd.read_csv(self.csv_file, sep=self.separator)
+        if type(df_or_csv_file) == pd.DataFrame:
+            self.df = df_or_csv_file
+        else:
+            self.df = pd.read_csv(self.csv_file, sep=self.separator)
         self.random_splits_train_indices_combined, self.random_splits_test_indices_combined = None, None
         self.modulo_splits_train_indices_combined, self.modulo_splits_test_indices_combined = None, None
         self.cont_splits_train_indices_combined, self.cont_splits_test_indices_combined = None, None
diff --git a/pypef/utils/utils_run.py b/pypef/utils/utils_run.py
@@ -24,8 +24,8 @@
     create_split_files, make_combinations_double_all_diverse,
     make_combinations_triple_all_diverse, make_combinations_quadruple_all_diverse,
     make_ssm_singles
-)   # not yet implemented: make_combinations_double_all_diverse_and_all_positions
-
+)
+from pypef.utils.split import DatasetSplitter
 from pypef.utils.directed_evolution import DirectedEvolution
 from pypef.utils.sto2a2m import convert_sto2a2m
 
@@ -34,49 +34,62 @@
 
 
 def run_pypef_utils(arguments):
-    if arguments['mklsts']:
+    if arguments['mklsts'] or ['mklsts_rnd'] or ['mklsts_mod'] or ['mklsts_cont'] or ['mklsts_plot']:
         wt_sequence = get_wt_sequence(arguments['--wt'])
         t_drop = float(arguments['--drop'])
         ls_proportion = arguments['--ls_proportion']
-
         logger.info(f'Length of provided sequence: {len(wt_sequence)} amino acids.')
         logger.info(f'Training set proportion (--ls_proportion): {ls_proportion}.')
         df = drop_rows(arguments['--input'], amino_acids, t_drop, 
                        arguments['--sep'], arguments['--mutation_sep'])
         no_rnd = arguments['--numrnd']
-
         single_variants, single_values, higher_variants, higher_values = get_variants(
             df, amino_acids, wt_sequence, arguments['--mutation_sep']
         )
         if len(single_variants) == 0:
             logger.info('Found no single substitution variants for possible recombination!')
-        sub_ls, val_ls, sub_ts, val_ts = make_sub_ls_ts(
-            single_variants, single_values, 
-            higher_variants, higher_values, 
-            ls_proportion
-        )
-        logger.info('Tip: You can edit your LS and TS datasets just by '
-                    'cutting/pasting between the LS and TS fasta datasets.')
 
-        make_fasta_ls_ts('LS.fasl', wt_sequence, sub_ls, val_ls)
-        make_fasta_ls_ts('TS.fasl', wt_sequence, sub_ts, val_ts)
+        if arguments['mklsts']:
+            sub_ls, val_ls, sub_ts, val_ts = make_sub_ls_ts(
+                single_variants, single_values, 
+                higher_variants, higher_values, 
+                ls_proportion
+            )
+            logger.info('Tip: You can edit your LS and TS datasets just by '
+                        'cutting/pasting between the LS and TS fasta datasets.')
 
-        try:
-            no_rnd = int(no_rnd)
-        except ValueError:
-            no_rnd = 0
-        if no_rnd != 0:
-            random_set_counter = 1
-            no_rnd = int(no_rnd)
-            while random_set_counter <= no_rnd:
-                sub_ls, val_ls, sub_ts, val_ts = make_sub_ls_ts_randomly(
-                    single_variants, single_values,
-                    higher_variants, higher_values,
-                    ls_proportion
-                )
-                make_fasta_ls_ts('LS_random_' + str(random_set_counter) + '.fasl', wt_sequence, sub_ls, val_ls)
-                make_fasta_ls_ts('TS_random_' + str(random_set_counter) + '.fasl', wt_sequence, sub_ts, val_ts)
-                random_set_counter += 1
+            make_fasta_ls_ts('LS.fasl', wt_sequence, sub_ls, val_ls)
+            make_fasta_ls_ts('TS.fasl', wt_sequence, sub_ts, val_ts)
+
+            try:
+                no_rnd = int(no_rnd)
+            except ValueError:
+                no_rnd = 0
+            if no_rnd != 0:
+                random_set_counter = 1
+                no_rnd = int(no_rnd)
+                while random_set_counter <= no_rnd:
+                    sub_ls, val_ls, sub_ts, val_ts = make_sub_ls_ts_randomly(
+                        single_variants, single_values,
+                        higher_variants, higher_values,
+                        ls_proportion
+                    )
+                    make_fasta_ls_ts('LS_random_' + str(random_set_counter) + '.fasl', wt_sequence, sub_ls, val_ls)
+                    make_fasta_ls_ts('TS_random_' + str(random_set_counter) + '.fasl', wt_sequence, sub_ts, val_ts)
+                    random_set_counter += 1
+        else:
+            ds = DatasetSplitter(df)
+            if arguments['mklsts_rnd']:
+                pass # TODO
+
+            elif arguments['mklsts_mod']:
+                pass # TODO
+
+            elif arguments['mklsts_cont']:
+                pass # TODO   
+
+            elif arguments['mklsts_plot']:
+                pass # TODO  
 
     elif arguments['mkps']:
         wt_sequence = get_wt_sequence(arguments['--wt'])