Skip to content

Commit e4dc48d

Browse files
committed
Dev: split.DatasetSplitter implementation (todo)
1 parent 10658b7 commit e4dc48d

File tree

3 files changed

+53
-38
lines changed

3 files changed

+53
-38
lines changed

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,9 +71,9 @@ pypef --help
7171
After installation, a rudimentary graphical user interface (GUI) can be invoked using the command
7272

7373
```bash
74-
pypef-gui # loading takes some seconds
75-
# command for extra debug/tqdm progress terminal window:
76-
#pypef-gui-cli
74+
pypef-gui # loading takes some seconds
75+
# or
76+
pypef-gui-cli # variant that keeps background debug/tqdm progress output visible in the terminal (on Windows)
7777
```
7878

7979
<p align="center">

pypef/utils/split.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,11 @@
1515
class DatasetSplitter:
1616
def __init__(
1717
self,
18-
csv_file: str | PathLike,
18+
df_or_csv_file: str | PathLike | pd.DataFrame,
19+
n_cv: int | None = None,
1920
mutation_column: str | None = None,
20-
separator: str | None = None,
21-
n_cv: int | None = None
21+
separator: str | None = None
2222
):
23-
self.csv_file = csv_file
2423
if mutation_column is None:
2524
mutation_column = 'mutant'
2625
self.mutation_column = mutation_column
@@ -30,7 +29,10 @@ def __init__(
3029
if n_cv is None:
3130
n_cv = 5
3231
self.n_cv = n_cv
33-
self.df = pd.read_csv(self.csv_file, sep=self.separator)
32+
if type(df_or_csv_file) == pd.DataFrame:
33+
self.df = df_or_csv_file
34+
else:
35+
self.df = pd.read_csv(self.csv_file, sep=self.separator)
3436
self.random_splits_train_indices_combined, self.random_splits_test_indices_combined = None, None
3537
self.modulo_splits_train_indices_combined, self.modulo_splits_test_indices_combined = None, None
3638
self.cont_splits_train_indices_combined, self.cont_splits_test_indices_combined = None, None

pypef/utils/utils_run.py

Lines changed: 43 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@
2424
create_split_files, make_combinations_double_all_diverse,
2525
make_combinations_triple_all_diverse, make_combinations_quadruple_all_diverse,
2626
make_ssm_singles
27-
) # not yet implemented: make_combinations_double_all_diverse_and_all_positions
28-
27+
)
28+
from pypef.utils.split import DatasetSplitter
2929
from pypef.utils.directed_evolution import DirectedEvolution
3030
from pypef.utils.sto2a2m import convert_sto2a2m
3131

@@ -34,49 +34,62 @@
3434

3535

3636
def run_pypef_utils(arguments):
37-
if arguments['mklsts']:
37+
if arguments['mklsts'] or ['mklsts_rnd'] or ['mklsts_mod'] or ['mklsts_cont'] or ['mklsts_plot']:
3838
wt_sequence = get_wt_sequence(arguments['--wt'])
3939
t_drop = float(arguments['--drop'])
4040
ls_proportion = arguments['--ls_proportion']
41-
4241
logger.info(f'Length of provided sequence: {len(wt_sequence)} amino acids.')
4342
logger.info(f'Training set proportion (--ls_proportion): {ls_proportion}.')
4443
df = drop_rows(arguments['--input'], amino_acids, t_drop,
4544
arguments['--sep'], arguments['--mutation_sep'])
4645
no_rnd = arguments['--numrnd']
47-
4846
single_variants, single_values, higher_variants, higher_values = get_variants(
4947
df, amino_acids, wt_sequence, arguments['--mutation_sep']
5048
)
5149
if len(single_variants) == 0:
5250
logger.info('Found no single substitution variants for possible recombination!')
53-
sub_ls, val_ls, sub_ts, val_ts = make_sub_ls_ts(
54-
single_variants, single_values,
55-
higher_variants, higher_values,
56-
ls_proportion
57-
)
58-
logger.info('Tip: You can edit your LS and TS datasets just by '
59-
'cutting/pasting between the LS and TS fasta datasets.')
6051

61-
make_fasta_ls_ts('LS.fasl', wt_sequence, sub_ls, val_ls)
62-
make_fasta_ls_ts('TS.fasl', wt_sequence, sub_ts, val_ts)
52+
if arguments['mklsts']:
53+
sub_ls, val_ls, sub_ts, val_ts = make_sub_ls_ts(
54+
single_variants, single_values,
55+
higher_variants, higher_values,
56+
ls_proportion
57+
)
58+
logger.info('Tip: You can edit your LS and TS datasets just by '
59+
'cutting/pasting between the LS and TS fasta datasets.')
6360

64-
try:
65-
no_rnd = int(no_rnd)
66-
except ValueError:
67-
no_rnd = 0
68-
if no_rnd != 0:
69-
random_set_counter = 1
70-
no_rnd = int(no_rnd)
71-
while random_set_counter <= no_rnd:
72-
sub_ls, val_ls, sub_ts, val_ts = make_sub_ls_ts_randomly(
73-
single_variants, single_values,
74-
higher_variants, higher_values,
75-
ls_proportion
76-
)
77-
make_fasta_ls_ts('LS_random_' + str(random_set_counter) + '.fasl', wt_sequence, sub_ls, val_ls)
78-
make_fasta_ls_ts('TS_random_' + str(random_set_counter) + '.fasl', wt_sequence, sub_ts, val_ts)
79-
random_set_counter += 1
61+
make_fasta_ls_ts('LS.fasl', wt_sequence, sub_ls, val_ls)
62+
make_fasta_ls_ts('TS.fasl', wt_sequence, sub_ts, val_ts)
63+
64+
try:
65+
no_rnd = int(no_rnd)
66+
except ValueError:
67+
no_rnd = 0
68+
if no_rnd != 0:
69+
random_set_counter = 1
70+
no_rnd = int(no_rnd)
71+
while random_set_counter <= no_rnd:
72+
sub_ls, val_ls, sub_ts, val_ts = make_sub_ls_ts_randomly(
73+
single_variants, single_values,
74+
higher_variants, higher_values,
75+
ls_proportion
76+
)
77+
make_fasta_ls_ts('LS_random_' + str(random_set_counter) + '.fasl', wt_sequence, sub_ls, val_ls)
78+
make_fasta_ls_ts('TS_random_' + str(random_set_counter) + '.fasl', wt_sequence, sub_ts, val_ts)
79+
random_set_counter += 1
80+
else:
81+
ds = DatasetSplitter(df)
82+
if arguments['mklsts_rnd']:
83+
pass # TODO
84+
85+
elif arguments['mklsts_mod']:
86+
pass # TODO
87+
88+
elif arguments['mklsts_cont']:
89+
pass # TODO
90+
91+
elif arguments['mklsts_plot']:
92+
pass # TODO
8093

8194
elif arguments['mkps']:
8295
wt_sequence = get_wt_sequence(arguments['--wt'])

0 commit comments

Comments
 (0)