diff --git a/causal_testing/estimation/genetic_programming_regression_fitter.py b/causal_testing/estimation/genetic_programming_regression_fitter.py index 49aa4dfc..00977265 100644 --- a/causal_testing/estimation/genetic_programming_regression_fitter.py +++ b/causal_testing/estimation/genetic_programming_regression_fitter.py @@ -149,7 +149,6 @@ def __init__( ) self.sympy_conversions[name] = conversion - print(self.pset.mapping) creator.create("FitnessMin", base.Fitness, weights=(-1.0,)) creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMin) @@ -235,6 +234,8 @@ def simplify(self, expression: gp.PrimitiveTree) -> sympy.core.Expr: :return: The simplified expression as a sympy Expr object. """ + if isinstance(expression, str): + expression = creator.Individual(gp.PrimitiveTree.from_string(expression, self.pset)) return sympy.simplify(self._stringify_for_sympy(expression)) def repair(self, expression: gp.PrimitiveTree) -> gp.PrimitiveTree: @@ -248,7 +249,7 @@ def repair(self, expression: gp.PrimitiveTree) -> gp.PrimitiveTree: """ eq = f"{self.outcome} ~ {' + '.join(str(x) for x in self.split(expression))}" try: - # Create model, fit (run) it, give estimates from it] + # Create model, fit (run) it, give estimates from it model = smf.ols(eq, self.df) res = model.fit() @@ -278,16 +279,25 @@ def fitness(self, expression: gp.PrimitiveTree) -> float: """ old_settings = np.seterr(all="raise") try: - # Create model, fit (run) it, give estimates from it] + if isinstance(expression, str): + expression = creator.Individual(gp.PrimitiveTree.from_string(expression, self.pset)) + + # Create model, fit (run) it, give estimates from it func = gp.compile(expression, self.pset) - y_estimates = pd.Series([func(**x) for _, x in self.df[self.features].iterrows()]) + y_estimates = pd.Series( + [func(**x) for _, x in self.df[self.features].iterrows()], + index=self.df.index, + ) - # Calc errors using an improved normalised mean squared + # Calculate errors using the normalised root mean square error (nrmse), + # which is normalised with respect to the range sqerrors = (self.df[self.outcome] - y_estimates) ** 2 - mean_squared = sqerrors.sum() / len(self.df) - nmse = mean_squared / (self.df[self.outcome].sum() / len(self.df)) + nrmse = np.sqrt(sqerrors.sum() / len(self.df)) / (self.df[self.outcome].max() - self.df[self.outcome].min()) + + if pd.isnull(nrmse) or nrmse.real != nrmse: + return (float("inf"),) - return (nmse,) + return (nrmse,) # Fitness value of infinite if error - not return 1 except ( @@ -321,7 +331,15 @@ def make_offspring(self, population: list, num_offspring: int) -> list: offspring.append(child) return offspring - def run_gp(self, ngen: int, pop_size: int = 20, num_offspring: int = 10, seeds: list = None) -> gp.PrimitiveTree: + # pylint: disable=too-many-arguments + def run_gp( + self, + ngen: int, + pop_size: int = 20, + num_offspring: int = 10, + seeds: list = None, + repair: bool = True, + ) -> gp.PrimitiveTree: """ Execute Genetic Programming to find the best expression using a mu+lambda algorithm. @@ -329,10 +347,13 @@ def run_gp(self, ngen: int, pop_size: int = 20, num_offspring: int = 10, seeds: :param pop_size: The population size. :param num_offspring: The number of new individuals per generation. :param seeds: Seed individuals for the initial population. + :param repair: Whether to run the linear regression repair operator (defaults to True). :return: The best candididate expression. """ - population = [self.toolbox.repair(ind) for ind in self.toolbox.population(n=pop_size)] + population = self.toolbox.population(n=pop_size) + if repair: + population = [self.toolbox.repair(ind) for ind in population] if seeds is not None: for seed in seeds: ind = creator.Individual(gp.PrimitiveTree.from_string(seed, self.pset)) @@ -348,7 +369,8 @@ def run_gp(self, ngen: int, pop_size: int = 20, num_offspring: int = 10, seeds: for _ in range(1, ngen + 1): # Vary the population offspring = self.make_offspring(population, num_offspring) - offspring = [self.toolbox.repair(ind) for ind in offspring] + if repair: + offspring = [self.toolbox.repair(ind) for ind in offspring] # Evaluate the individuals with an invalid fitness for ind in offspring: diff --git a/tests/estimation_tests/test_genetic_programming_regression_fitter.py b/tests/estimation_tests/test_genetic_programming_regression_fitter.py index 4382b819..13cfe9f7 100644 --- a/tests/estimation_tests/test_genetic_programming_regression_fitter.py +++ b/tests/estimation_tests/test_genetic_programming_regression_fitter.py @@ -1,10 +1,43 @@ import unittest import pandas as pd +from operator import sub from causal_testing.estimation.genetic_programming_regression_fitter import GP +def root(x): + return x**0.5 + + class TestGP(unittest.TestCase): def test_init_invalid_fun_name(self): with self.assertRaises(ValueError): GP(df=pd.DataFrame(), features=[], outcome="", max_order=2, sympy_conversions={"power_1": ""}) + + def test_simplify_string(self): + gp = GP( + df=None, + features=["x1"], + outcome=None, + max_order=1, + ) + self.assertEqual(str(gp.simplify("power_1(x1)")), "x1") + + def test_fitness(self): + gp = GP( + df=pd.DataFrame({"x1": [1, 2, 3], "outcome": [2, 3, 4]}), + features=["x1"], + outcome="outcome", + max_order=0, + ) + self.assertEqual(gp.fitness("add(x1, 1)"), (0,)) + + def test_fitness_inf(self): + gp = GP( + df=pd.DataFrame({"x1": [1, 2, 3], "outcome": [2, 3, 4]}), + features=["x1"], + outcome="outcome", + max_order=0, + extra_operators=[(root, 1)], + ) + self.assertEqual(gp.fitness("root(-1)"), (float("inf"),))