Skip to content

Commit 6c46c00

Browse files
FFroehlich and dweindl authored
improve flatten overrides (#42)
* Update core.py
* fix replacement
* fixup
* fix grouping
* fixup
* fixup
* fixup

Co-authored-by: Daniel Weindl <[email protected]>
1 parent fd34613 commit 6c46c00

File tree

2 files changed

+77
-129
lines changed

2 files changed

+77
-129
lines changed

petab/core.py

Lines changed: 45 additions & 117 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22

33
import logging
44
import os
5+
import re
56
from typing import Iterable, Optional, Callable, Union, Any, Sequence, List
67
from warnings import warn
78

@@ -124,125 +125,52 @@ def flatten_timepoint_specific_output_overrides(
124125
petab_problem:
125126
PEtab problem to work on
126127
"""
127-
measurement_df = petab_problem.measurement_df
128-
129-
# remember if columns exist
130-
has_obs_par = OBSERVABLE_PARAMETERS in measurement_df
131-
has_noise_par = NOISE_PARAMETERS in measurement_df
132-
has_preeq = PREEQUILIBRATION_CONDITION_ID in measurement_df
133-
134-
# fill in optional columns to avoid special cases later
135-
if not has_obs_par \
136-
or np.all(measurement_df[OBSERVABLE_PARAMETERS].isnull()):
137-
measurement_df[OBSERVABLE_PARAMETERS] = ''
138-
if not has_noise_par \
139-
or np.all(measurement_df[NOISE_PARAMETERS].isnull()):
140-
measurement_df[NOISE_PARAMETERS] = ''
141-
if not has_preeq \
142-
or np.all(measurement_df[PREEQUILIBRATION_CONDITION_ID].isnull()):
143-
measurement_df[PREEQUILIBRATION_CONDITION_ID] = ''
144-
# convert to str row by row
145-
for irow, row in measurement_df.iterrows():
146-
if is_empty(row[OBSERVABLE_PARAMETERS]):
147-
measurement_df.at[irow, OBSERVABLE_PARAMETERS] = ''
148-
if is_empty(row[NOISE_PARAMETERS]):
149-
measurement_df.at[irow, NOISE_PARAMETERS] = ''
150-
if is_empty(row[PREEQUILIBRATION_CONDITION_ID]):
151-
measurement_df.at[irow, PREEQUILIBRATION_CONDITION_ID] = ''
152-
153-
# Create empty df -> to be filled with replicate-specific observables
154-
df_new = pd.DataFrame()
155-
156-
# Get observableId, preequilibrationConditionId
157-
# and simulationConditionId columns in measurement df
158-
cols = get_notnull_columns(
159-
measurement_df,
160-
[OBSERVABLE_ID, PREEQUILIBRATION_CONDITION_ID,
161-
SIMULATION_CONDITION_ID]
162-
)
163-
df = measurement_df[cols]
164-
165-
# Get unique combinations of observableId, preequilibrationConditionId
166-
# and simulationConditionId
167-
df_unique_values = df.drop_duplicates()
168-
169-
# replaced observables: new ID => old ID
170-
replacements = dict()
171-
172-
# Loop over each unique combination
173-
for irow in df_unique_values.index:
174-
df = measurement_df.loc[
175-
(measurement_df[OBSERVABLE_ID] ==
176-
df_unique_values.loc[irow, OBSERVABLE_ID])
177-
& (measurement_df[PREEQUILIBRATION_CONDITION_ID] ==
178-
df_unique_values.loc[irow, PREEQUILIBRATION_CONDITION_ID])
179-
& (measurement_df[SIMULATION_CONDITION_ID] ==
180-
df_unique_values.loc[irow, SIMULATION_CONDITION_ID])
181-
]
182-
183-
# Get list of unique observable parameters
184-
unique_sc = df[OBSERVABLE_PARAMETERS].unique()
185-
# Get list of unique noise parameters
186-
unique_noise = df[NOISE_PARAMETERS].unique()
187-
188-
# Loop
189-
for i_noise, cur_noise in enumerate(unique_noise):
190-
for i_sc, cur_sc in enumerate(unique_sc):
191-
# Find the position of all instances of cur_noise
192-
# and unique_sc[j] in their corresponding column
193-
# (full-string matches are denoted by zero)
194-
idxs = (
195-
df[NOISE_PARAMETERS].astype(str).str.find(cur_noise) +
196-
df[OBSERVABLE_PARAMETERS].astype(str).str.find(cur_sc)
128+
new_measurement_dfs = []
129+
new_observable_dfs = []
130+
possible_groupvars = [OBSERVABLE_ID, OBSERVABLE_PARAMETERS,
131+
NOISE_PARAMETERS, SIMULATION_CONDITION_ID,
132+
PREEQUILIBRATION_CONDITION_ID]
133+
groupvars = get_notnull_columns(petab_problem.measurement_df,
134+
possible_groupvars)
135+
for groupvar, measurements in \
136+
petab_problem.measurement_df.groupby(groupvars, dropna=False):
137+
obs_id = groupvar[groupvars.index(OBSERVABLE_ID)]
138+
# construct replacement id
139+
replacement_id = ''
140+
for field in possible_groupvars:
141+
if field in groupvars:
142+
val = groupvar[groupvars.index(field)
143+
].replace(';', '_').replace('.', '_')
144+
if replacement_id == '':
145+
replacement_id = val
146+
elif val != '':
147+
replacement_id += f'__{val}'
148+
149+
logger.debug(f'Creating synthetic observable {obs_id}')
150+
if replacement_id in petab_problem.observable_df.index:
151+
raise RuntimeError('could not create synthetic observables '
152+
f'since {replacement_id} was already '
153+
'present in observable table')
154+
observable = petab_problem.observable_df.loc[obs_id].copy()
155+
observable.name = replacement_id
156+
for field, parname, target in [
157+
(NOISE_PARAMETERS, 'noiseParameter', NOISE_FORMULA),
158+
(OBSERVABLE_PARAMETERS, 'observableParameter', OBSERVABLE_FORMULA)
159+
]:
160+
if field in measurements:
161+
observable[target] = re.sub(
162+
fr'{parname}([0-9]+)_{obs_id}',
163+
f'{parname}\\1_{replacement_id}',
164+
observable[target]
197165
)
198-
tmp_ = df.loc[idxs == 0, OBSERVABLE_ID]
199-
# Create replicate-specific observable name
200-
tmp = tmp_ + "_" + str(i_noise + i_sc + 1)
201-
# Check if replicate-specific observable name already exists
202-
# in df. If true, rename replicate-specific observable
203-
counter = 2
204-
while (df[OBSERVABLE_ID].str.find(
205-
tmp.to_string()
206-
) == 0).any():
207-
tmp = tmp_ + counter * "_" + str(i_noise + i_sc + 1)
208-
counter += 1
209-
if not tmp_.empty:
210-
replacements[tmp.values[0]] = tmp_.values[0]
211-
df.loc[idxs == 0, OBSERVABLE_ID] = tmp
212-
# Append the result in a new df
213-
df_new = df_new.append(df.loc[idxs == 0])
214-
# Restore the observable name in the original df
215-
# (for continuation of the loop)
216-
df.loc[idxs == 0, OBSERVABLE_ID] = tmp
217-
218-
# remove previously non-existent columns again
219-
if not has_obs_par:
220-
df_new.drop(columns=OBSERVABLE_PARAMETERS, inplace=True)
221-
if not has_noise_par:
222-
df_new.drop(columns=NOISE_PARAMETERS, inplace=True)
223-
if not has_preeq:
224-
df_new.drop(columns=PREEQUILIBRATION_CONDITION_ID, inplace=True)
225-
226-
# Update/Redefine measurement df with replicate-specific observables
227-
petab_problem.measurement_df = df_new
228-
229-
observable_df = petab_problem.observable_df
230-
231-
# Update observables table
232-
for replacement, replacee in replacements.items():
233-
new_obs = observable_df.loc[replacee].copy()
234-
new_obs.name = replacement
235-
new_obs[OBSERVABLE_FORMULA] = new_obs[OBSERVABLE_FORMULA].replace(
236-
replacee, replacement)
237-
new_obs[NOISE_FORMULA] = new_obs[NOISE_FORMULA].replace(
238-
replacee, replacement)
239-
observable_df = observable_df.append(
240-
new_obs
241-
)
242166

243-
petab_problem.observable_df = observable_df
244-
petab_problem.observable_df.drop(index=set(replacements.values()),
245-
inplace=True)
167+
measurements[OBSERVABLE_ID] = replacement_id
168+
new_measurement_dfs.append(measurements)
169+
new_observable_dfs.append(observable)
170+
171+
petab_problem.observable_df = pd.concat(new_observable_dfs, axis=1).T
172+
petab_problem.observable_df.index.name = OBSERVABLE_ID
173+
petab_problem.measurement_df = pd.concat(new_measurement_dfs)
246174

247175

248176
def concat_tables(

tests/test_petab.py

Lines changed: 32 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -299,14 +299,27 @@ def test_flatten_timepoint_specific_output_overrides():
299299
observable_df.set_index(OBSERVABLE_ID, inplace=True)
300300

301301
observable_df_expected = pd.DataFrame(data={
302-
OBSERVABLE_ID: ['obs1_1', 'obs1_2', 'obs1_3'],
302+
OBSERVABLE_ID: [
303+
'obs1__obsParOverride1_1_0__noiseParOverride1__condition1',
304+
'obs1__obsParOverride2_1_0__noiseParOverride1__condition1',
305+
'obs1__obsParOverride2_1_0__noiseParOverride2__condition1',
306+
],
303307
OBSERVABLE_FORMULA: [
304-
'observableParameter1_obs1_1 + observableParameter2_obs1_1',
305-
'observableParameter1_obs1_2 + observableParameter2_obs1_2',
306-
'observableParameter1_obs1_3 + observableParameter2_obs1_3'],
307-
NOISE_FORMULA: ['noiseParameter1_obs1_1',
308-
'noiseParameter1_obs1_2',
309-
'noiseParameter1_obs1_3']
308+
'observableParameter1_obs1__obsParOverride1_1_0__'
309+
'noiseParOverride1__condition1 + observableParameter2_obs1'
310+
'__obsParOverride1_1_0__noiseParOverride1__condition1',
311+
'observableParameter1_obs1__obsParOverride2_1_0__noiseParOverride1'
312+
'__condition1 + observableParameter2_obs1__obsParOverride2_1_0'
313+
'__noiseParOverride1__condition1',
314+
'observableParameter1_obs1__obsParOverride2_1_0'
315+
'__noiseParOverride2__condition1 + observableParameter2_obs1__'
316+
'obsParOverride2_1_0__noiseParOverride2__condition1'],
317+
NOISE_FORMULA: ['noiseParameter1_obs1__obsParOverride1_1_0__'
318+
'noiseParOverride1__condition1',
319+
'noiseParameter1_obs1__obsParOverride2_1_0__'
320+
'noiseParOverride1__condition1',
321+
'noiseParameter1_obs1__obsParOverride2_1_0__'
322+
'noiseParOverride2__condition1']
310323
})
311324
observable_df_expected.set_index(OBSERVABLE_ID, inplace=True)
312325

@@ -332,7 +345,10 @@ def test_flatten_timepoint_specific_output_overrides():
332345

333346
measurement_df_expected = pd.DataFrame(data={
334347
OBSERVABLE_ID:
335-
['obs1_1', 'obs1_2', 'obs1_3', 'obs1_3'],
348+
['obs1__obsParOverride1_1_0__noiseParOverride1__condition1',
349+
'obs1__obsParOverride2_1_0__noiseParOverride1__condition1',
350+
'obs1__obsParOverride2_1_0__noiseParOverride2__condition1',
351+
'obs1__obsParOverride2_1_0__noiseParOverride2__condition1'],
336352
SIMULATION_CONDITION_ID:
337353
['condition1', 'condition1', 'condition1', 'condition1'],
338354
PREEQUILIBRATION_CONDITION_ID:
@@ -384,12 +400,13 @@ def test_flatten_timepoint_specific_output_overrides_special_cases():
384400
observable_df.set_index(OBSERVABLE_ID, inplace=True)
385401

386402
observable_df_expected = pd.DataFrame(data={
387-
OBSERVABLE_ID: ['obs1_1', 'obs1_2'],
403+
OBSERVABLE_ID: ['obs1__noiseParOverride1__condition1',
404+
'obs1__noiseParOverride2__condition1'],
388405
OBSERVABLE_FORMULA: [
389406
'species1',
390407
'species1'],
391-
NOISE_FORMULA: ['noiseParameter1_obs1_1',
392-
'noiseParameter1_obs1_2']
408+
NOISE_FORMULA: ['noiseParameter1_obs1__noiseParOverride1__condition1',
409+
'noiseParameter1_obs1__noiseParOverride2__condition1']
393410
})
394411
observable_df_expected.set_index(OBSERVABLE_ID, inplace=True)
395412

@@ -410,7 +427,10 @@ def test_flatten_timepoint_specific_output_overrides_special_cases():
410427

411428
measurement_df_expected = pd.DataFrame(data={
412429
OBSERVABLE_ID:
413-
['obs1_1', 'obs1_1', 'obs1_2', 'obs1_2'],
430+
['obs1__noiseParOverride1__condition1',
431+
'obs1__noiseParOverride1__condition1',
432+
'obs1__noiseParOverride2__condition1',
433+
'obs1__noiseParOverride2__condition1'],
414434
SIMULATION_CONDITION_ID:
415435
['condition1', 'condition1', 'condition1', 'condition1'],
416436
TIME:

0 commit comments

Comments
 (0)