Skip to content

Commit fb15200

Browse files
authored
Feature synthetic data (#472)
* Base class that implements noise generation for simulated data, and an interface for synthetic data generation, optionally with noise * Noise scaling factor
1 parent 292c804 commit fb15200

File tree

3 files changed

+378
-0
lines changed

3 files changed

+378
-0
lines changed

petab/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from .problem import * # noqa: F403, F401, E402
2222
from .sampling import * # noqa: F403, F401, E402
2323
from .sbml import * # noqa: F403, F401, E402
24+
from .simulate import * # noqa: F403, F401, E402
2425
from .yaml import * # noqa: F403, F401, E402
2526
from .version import __version__ # noqa: F401, E402
2627
from .format_version import __format_version__ # noqa: F401, E402

petab/simulate.py

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
import abc
2+
import numpy as np
3+
import pathlib
4+
import pandas as pd
5+
import petab
6+
import shutil
7+
import sympy as sp
8+
import tempfile
9+
from typing import Dict, Optional, Union
10+
11+
12+
class Simulator(abc.ABC):
13+
"""
14+
Base class that specific simulators should inherit.
15+
Specific simulators should minimally implement the
16+
`simulate_without_noise` method.
17+
Example (AMICI): https://bit.ly/33SUSG4
18+
19+
Attributes:
20+
noise_formulas:
21+
The formulae that will be used to calculate the scale of noise
22+
distributions.
23+
petab_problem:
24+
A PEtab problem, which will be simulated.
25+
rng:
26+
A NumPy random generator, used to sample from noise distributions.
27+
temporary_working_dir:
28+
Whether `working_dir` is a temporary directory, which can be
29+
deleted without significant consequence.
30+
working_dir:
31+
All simulator-specific output files will be saved here. This
32+
directory and its contents may be modified and deleted, and
33+
should be considered ephemeral.
34+
"""
35+
def __init__(self,
36+
petab_problem: petab.Problem,
37+
working_dir: Optional[Union[pathlib.Path, str]] = None):
38+
"""
39+
Initialize the simulator with sufficient information to perform a
40+
simulation. If no working directory is specified, a temporary one is
41+
created.
42+
43+
Arguments:
44+
petab_problem:
45+
A PEtab problem.
46+
working_dir:
47+
All simulator-specific output files will be saved here. This
48+
directory and its contents may be modified and deleted, and
49+
should be considered ephemeral.
50+
"""
51+
self.petab_problem = petab_problem
52+
53+
self.temporary_working_dir = False
54+
if working_dir is None:
55+
working_dir = tempfile.mkdtemp()
56+
self.temporary_working_dir = True
57+
if not isinstance(working_dir, pathlib.Path):
58+
working_dir = pathlib.Path(working_dir)
59+
self.working_dir = working_dir
60+
self.working_dir.mkdir(parents=True, exist_ok=True)
61+
62+
self.noise_formulas = petab.calculate.get_symbolic_noise_formulas(
63+
self.petab_problem.observable_df)
64+
self.rng = np.random.default_rng()
65+
66+
def remove_working_dir(self, force: bool = False, **kwargs) -> None:
67+
"""
68+
Remove the simulator working directory and all files within (see the
69+
`__init__` method arguments).
70+
71+
Arguments:
72+
force:
73+
If True, the working directory is removed regardless of
74+
whether it is a temporary directory.
75+
"""
76+
if force or self.temporary_working_dir:
77+
shutil.rmtree(self.working_dir, **kwargs)
78+
if self.working_dir.is_dir():
79+
print('Failed to remove the working directory: '
80+
+ str(self.working_dir))
81+
else:
82+
print('By default, specified working directories are not removed. '
83+
'Please call this method with `force=True`, or manually '
84+
f'delete the working directory: {self.working_dir}')
85+
86+
@abc.abstractmethod
87+
def simulate_without_noise(self) -> pd.DataFrame:
88+
"""
89+
Simulate a PEtab problem. This is an abstract method that should be
90+
implemented in a simulation package. Links to examples of this are in
91+
the class docstring.
92+
93+
Returns:
94+
Simulated data, as a PEtab measurements table, which should be
95+
equivalent to replacing all values in the `petab.C.MEASUREMENT`
96+
column of the measurements table (of the PEtab problem supplied to
97+
the `__init__` method), with simulated values.
98+
"""
99+
100+
def simulate(
101+
self,
102+
noise: bool = False,
103+
noise_scaling_factor: float = 1,
104+
**kwargs
105+
) -> pd.DataFrame:
106+
"""Simulate a PEtab problem, optionally with noise.
107+
108+
Arguments:
109+
noise: If True, noise is added to simulated data.
110+
noise_scaling_factor:
111+
A multiplier of the scale of the noise distribution.
112+
113+
Returns:
114+
Simulated data, as a PEtab measurements table.
115+
"""
116+
simulation_df = self.simulate_without_noise(**kwargs)
117+
if noise:
118+
simulation_df = self.add_noise(simulation_df, noise_scaling_factor)
119+
return simulation_df
120+
121+
def add_noise(
122+
self,
123+
simulation_df: pd.DataFrame,
124+
noise_scaling_factor: float = 1,
125+
) -> pd.DataFrame:
126+
"""Add noise to simulated data.
127+
128+
Arguments:
129+
simulation_df:
130+
A PEtab measurements table that contains simulated data.
131+
noise_scaling_factor:
132+
A multiplier of the scale of the noise distribution.
133+
134+
Returns:
135+
Simulated data with noise, as a PEtab measurements table.
136+
"""
137+
simulation_df_with_noise = simulation_df.copy()
138+
simulation_df_with_noise[petab.C.MEASUREMENT] = [
139+
sample_noise(
140+
self.petab_problem,
141+
row,
142+
row[petab.C.MEASUREMENT],
143+
self.noise_formulas,
144+
self.rng,
145+
noise_scaling_factor,
146+
)
147+
for _, row in simulation_df_with_noise.iterrows()
148+
]
149+
return simulation_df_with_noise
150+
151+
152+
def sample_noise(
153+
petab_problem: petab.Problem,
154+
measurement_row: pd.Series,
155+
simulated_value: float,
156+
noise_formulas: Optional[Dict[str, sp.Expr]] = None,
157+
rng: Optional[np.random.Generator] = None,
158+
noise_scaling_factor: float = 1,
159+
) -> float:
160+
"""Generate a sample from a PEtab noise distribution.
161+
162+
Arguments:
163+
petab_problem:
164+
The PEtab problem used to generate the simulated value.
165+
Instance of `petab.Problem`.
166+
measurement_row:
167+
The row in the PEtab problem measurement table that corresponds
168+
to the simulated value.
169+
simulated_value:
170+
A simulated value without noise.
171+
noise_formulas:
172+
Processed noise formulas from the PEtab observables table, in the
173+
form output by the `petab.calculate.get_symbolic_noise_formulas`
174+
method.
175+
rng:
176+
A NumPy random generator.
177+
noise_scaling_factor:
178+
A multiplier of the scale of the noise distribution.
179+
180+
Returns:
181+
The sample from the PEtab noise distribution.
182+
"""
183+
if noise_formulas is None:
184+
noise_formulas = petab.calculate.get_symbolic_noise_formulas(
185+
petab_problem.observable_df)
186+
if rng is None:
187+
rng = np.random.default_rng()
188+
189+
noise_value = petab.calculate.evaluate_noise_formula(
190+
measurement_row,
191+
noise_formulas,
192+
petab_problem.parameter_df,
193+
simulated_value
194+
)
195+
196+
# default noise distribution is petab.C.NORMAL
197+
noise_distribution = (
198+
petab_problem
199+
.observable_df
200+
.loc[measurement_row[petab.C.OBSERVABLE_ID]]
201+
.get(petab.C.NOISE_DISTRIBUTION, petab.C.NORMAL)
202+
)
203+
204+
# below is e.g.: `np.random.normal(loc=simulation, scale=noise_value)`
205+
return getattr(rng, noise_distribution)(
206+
loc=simulated_value,
207+
scale=noise_value * noise_scaling_factor
208+
)

tests/test_simulate.py

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
"""Tests for petab/simulate.py."""
2+
import functools
3+
import numpy as np
4+
import pandas as pd
5+
import pathlib
6+
import pytest
7+
import scipy.stats
8+
from typing import Callable
9+
10+
import petab
11+
from petab.C import MEASUREMENT
12+
13+
14+
class TestSimulator(petab.simulate.Simulator):
15+
"""Dummy simulator."""
16+
17+
__test__ = False
18+
19+
def simulate_without_noise(self) -> pd.DataFrame:
20+
"""Dummy simulation."""
21+
return self.petab_problem.measurement_df
22+
23+
24+
@pytest.fixture
25+
def petab_problem() -> petab.Problem:
26+
"""Create a PEtab problem for use in tests."""
27+
petab_yaml_path = pathlib.Path(__file__).parent.absolute() / \
28+
'../doc/example/example_Fujita/Fujita.yaml'
29+
return petab.Problem.from_yaml(str(petab_yaml_path))
30+
31+
32+
def test_remove_working_dir(petab_problem):
33+
"""Test creation and removal of a non-empty temporary working directory."""
34+
simulator = TestSimulator(petab_problem)
35+
# The working directory exists
36+
assert pathlib.Path(simulator.working_dir).is_dir()
37+
synthetic_data_df = simulator.simulate()
38+
synthetic_data_df.to_csv(f'{simulator.working_dir}/test.csv', sep='\t')
39+
simulator.remove_working_dir()
40+
# The (non-empty) working directory is removed
41+
assert not pathlib.Path(simulator.working_dir).is_dir()
42+
43+
# Test creation and removal of a specified working directory
44+
working_dir = 'tests/test_simulate_working_dir'
45+
simulator = TestSimulator(petab_problem, working_dir=working_dir)
46+
# The working directory is as specified
47+
assert working_dir == str(simulator.working_dir)
48+
# The working directory exists
49+
assert pathlib.Path(simulator.working_dir).is_dir()
50+
# A user-specified working directory should not be removed unless
51+
# `force=True`.
52+
simulator.remove_working_dir()
53+
# The user-specified working directory is not removed without `force=True`
54+
assert pathlib.Path(simulator.working_dir).is_dir()
55+
simulator.remove_working_dir(force=True)
56+
# The user-specified working directory is removed with `force=True`
57+
assert not pathlib.Path(simulator.working_dir).is_dir()
58+
59+
# Test creation and removal of a specified non-empty working directory
60+
simulator = TestSimulator(petab_problem, working_dir=working_dir)
61+
synthetic_data_df = simulator.simulate()
62+
synthetic_data_df.to_csv(f'{simulator.working_dir}/test.csv', sep='\t')
63+
simulator.remove_working_dir(force=True)
64+
# The non-empty, user-specified directory is removed with `force=True`
65+
assert not pathlib.Path(simulator.working_dir).is_dir()
66+
67+
68+
def test_add_noise(petab_problem):
69+
"""Test the noise generating method."""
70+
71+
tested_noise_distributions = {'normal', 'laplace'}
72+
assert set(petab.C.NOISE_MODELS) == tested_noise_distributions, (
73+
'The noise generation methods have only been tested for '
74+
f'{tested_noise_distributions}. Please edit this test '
75+
'to include this distribution in its tested distributions. The '
76+
'appropriate SciPy distribution will need to be added to '
77+
'`petab_numpy2scipy_distribution` in `_test_add_noise`.'
78+
)
79+
80+
for distribution in tested_noise_distributions:
81+
petab_problem.observable_df[petab.C.NOISE_DISTRIBUTION] = distribution
82+
_test_add_noise(petab_problem)
83+
84+
85+
def _test_add_noise(petab_problem) -> None:
86+
"""Test the noise generating method."""
87+
n_samples = 100
88+
noise_scaling_factor = 0.5
89+
ks_1samp_pvalue_threshold = 0.05
90+
minimum_fraction_above_threshold = 0.9
91+
petab_numpy2scipy_distribution = {
92+
'normal': 'norm',
93+
'laplace': 'laplace',
94+
}
95+
96+
simulator = TestSimulator(petab_problem)
97+
synthetic_data_df = simulator.simulate()
98+
99+
# Generate samples of noisy data
100+
samples = []
101+
for _ in range(n_samples):
102+
samples.append(
103+
simulator.add_noise(
104+
synthetic_data_df,
105+
noise_scaling_factor=noise_scaling_factor,
106+
)[MEASUREMENT]
107+
)
108+
samples = np.array(samples)
109+
110+
expected_noise_values = [
111+
noise_scaling_factor *
112+
petab.calculate.evaluate_noise_formula(
113+
row,
114+
simulator.noise_formulas,
115+
petab_problem.parameter_df,
116+
row[MEASUREMENT],
117+
)
118+
for _, row in synthetic_data_df.iterrows()
119+
]
120+
expected_noise_distributions = [
121+
petab_problem
122+
.observable_df
123+
.loc[row[petab.C.OBSERVABLE_ID]]
124+
.get(petab.C.NOISE_DISTRIBUTION, petab.C.NORMAL)
125+
for _, row in synthetic_data_df.iterrows()
126+
]
127+
128+
def row2cdf(row, index) -> Callable:
129+
"""
130+
Get and customize the appropriate cumulative distribution function from
131+
`scipy.stats`.
132+
133+
Arguments:
134+
row:
135+
A row from a PEtab measurement table.
136+
index:
137+
The index of the row.
138+
139+
Returns:
140+
The appropriate SciPy cumulative distribution function, setup to
141+
produce noisy simulated data.
142+
"""
143+
return functools.partial(
144+
getattr(
145+
scipy.stats,
146+
petab_numpy2scipy_distribution[
147+
expected_noise_distributions[index]]
148+
).cdf, loc=row[MEASUREMENT], scale=expected_noise_values[index])
149+
150+
# Test whether the distribution of the samples is equal to the expected
151+
# distribution, for each measurement.
152+
results = []
153+
for index, row in synthetic_data_df.iterrows():
154+
r = scipy.stats.ks_1samp(
155+
samples[:, index],
156+
row2cdf(row, index)
157+
)
158+
results.append(r)
159+
observed_fraction_above_threshold = (
160+
sum(r.pvalue > ks_1samp_pvalue_threshold for r in results) /
161+
len(results)
162+
)
163+
# Sufficient distributions of measurement samples are sufficiently similar
164+
# to the expected distribution
165+
assert (
166+
observed_fraction_above_threshold > minimum_fraction_above_threshold)
167+
168+
simulator.remove_working_dir()
169+
assert not pathlib.Path(simulator.working_dir).is_dir()

0 commit comments

Comments
 (0)