diff --git a/amlb/benchmark.py b/amlb/benchmark.py
index 071709e0b..c703561b2 100644
--- a/amlb/benchmark.py
+++ b/amlb/benchmark.py
@@ -279,7 +279,7 @@ class TaskConfig:

     def __init__(self, name, fold, metrics, seed,
                  max_runtime_seconds, cores, max_mem_size_mb, min_vol_size_mb,
-                 input_dir, output_dir):
+                 input_dir, output_dir, extensions):
         self.framework = None
         self.framework_params = None
         self.type = None
@@ -295,6 +295,7 @@ def __init__(self, name, fold, metrics, seed,
         self.input_dir = input_dir
         self.output_dir = output_dir
         self.output_predictions_file = os.path.join(output_dir, "predictions.csv")
+        self.extensions = extensions

     def __json__(self):
         return self.__dict__
@@ -350,6 +351,7 @@ def __init__(self, benchmark: Benchmark, task_def, fold):
             min_vol_size_mb=task_def.min_vol_size_mb,
             input_dir=rconfig().input_dir,
             output_dir=benchmark.output_dirs.session,
+            extensions=rconfig().extensions_files,
         )
         # allowing to override some task parameters through command line, e.g.: -Xt.max_runtime_seconds=60
         if rconfig()['t'] is not None:
diff --git a/amlb/results.py b/amlb/results.py
index 89f5312fb..b20ebe8f9 100644
--- a/amlb/results.py
+++ b/amlb/results.py
@@ -17,6 +17,7 @@
 from .datautils import accuracy_score, confusion_matrix, f1_score, log_loss, balanced_accuracy_score, mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score, roc_auc_score, read_csv, write_csv, is_data_frame, to_data_frame
 from .resources import get as rget, config as rconfig, output_dirs
 from .utils import Namespace, backup_file, cached, datetime_iso, memoize, profile
+from frameworks.shared.callee import get_extension

 log = logging.getLogger(__name__)

@@ -323,6 +324,13 @@ def __init__(self, predictions_df, info=None):
     def evaluate(self, metric):
         if hasattr(self, metric):
             return getattr(self, metric)()
+        else:
+            # A metric may be defined twice, once for the automl system to use (e.g.
+            # as a scikit-learn scorer), and once in the amlb-compatible format.
+            # The amlb-compatible format is marked with a trailing underscore.
+            custom_metric = get_extension(rconfig().extensions_files, f"{metric}_")
+            if custom_metric is not None:
+                return custom_metric(self)
         # raise ValueError("Metric {metric} is not supported for {type}.".format(metric=metric, type=self.type))
         log.warning("Metric %s is not supported for %s!", metric, self.type)
         return nan
diff --git a/frameworks/AutoGluon/exec.py b/frameworks/AutoGluon/exec.py
index 8dfb4004e..3336b2251 100644
--- a/frameworks/AutoGluon/exec.py
+++ b/frameworks/AutoGluon/exec.py
@@ -12,7 +12,7 @@
 from autogluon.utils.tabular.utils.savers import save_pd, save_pkl
 import autogluon.utils.tabular.metrics as metrics

-from frameworks.shared.callee import call_run, result, output_subdir, utils
+from frameworks.shared.callee import call_run, get_extension, result, output_subdir, utils

 log = logging.getLogger(__name__)

@@ -32,7 +32,8 @@ def run(dataset, config):
         rmse=metrics.mean_squared_error,  # for now, we can let autogluon optimize training on mse: anyway we compute final score from predictions.
     )

-    perf_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
+    perf_metric = (metrics_mapping[config.metric] if config.metric in metrics_mapping
+                   else get_extension(config.extensions, config.metric))
     if perf_metric is None:
         # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
         log.warning("Performance metric %s not supported.", config.metric)
diff --git a/frameworks/TPOT/exec.py b/frameworks/TPOT/exec.py
index 0bfdf096a..986d3a3ab 100644
--- a/frameworks/TPOT/exec.py
+++ b/frameworks/TPOT/exec.py
@@ -12,7 +12,7 @@
 os.environ['MKL_NUM_THREADS'] = '1'

 from tpot import TPOTClassifier, TPOTRegressor
-from frameworks.shared.callee import call_run, result, output_subdir, utils
+from frameworks.shared.callee import call_run, get_extension, result, output_subdir, utils

 log = logging.getLogger(__name__)

@@ -34,7 +34,8 @@ def run(dataset, config):
         r2='r2',
         rmse='neg_mean_squared_error',  # TPOT can score on mse, as app computes rmse independently on predictions
     )
-    scoring_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
+    scoring_metric = (metrics_mapping[config.metric] if config.metric in metrics_mapping
+                      else get_extension(config.extensions, config.metric))
     if scoring_metric is None:
         raise ValueError("Performance metric {} not supported.".format(config.metric))

diff --git a/frameworks/autosklearn/exec.py b/frameworks/autosklearn/exec.py
index 7d3c5140a..130e7cfe0 100644
--- a/frameworks/autosklearn/exec.py
+++ b/frameworks/autosklearn/exec.py
@@ -13,7 +13,7 @@
 import autosklearn.metrics as metrics
 from packaging import version

-from frameworks.shared.callee import call_run, result, output_subdir, utils
+from frameworks.shared.callee import call_run, get_extension, result, output_subdir, utils

 log = logging.getLogger(__name__)

@@ -36,7 +36,8 @@ def run(dataset, config):
         rmse=metrics.mean_squared_error,  # autosklearn can optimize on mse, and we compute rmse independently on predictions
         r2=metrics.r2
     )
-    perf_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
+    perf_metric = (metrics_mapping[config.metric] if config.metric in metrics_mapping
+                   else get_extension(config.extensions, config.metric))
     if perf_metric is None:
         # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
         log.warning("Performance metric %s not supported.", config.metric)
diff --git a/frameworks/shared/callee.py b/frameworks/shared/callee.py
index 6d355da20..5a6c182bc 100644
--- a/frameworks/shared/callee.py
+++ b/frameworks/shared/callee.py
@@ -1,3 +1,4 @@
+import linecache
 import json
 import logging
 import os
@@ -44,7 +45,38 @@ def output_subdir(name, config):
     return subdir


-data_keys = re.compile("^(X|y|data)(_.+)?$")
+_extensions_ = {}
+
+
+def get_extension(files, name=None, default=None):
+    files = [files] if isinstance(files, str) else files
+
+    extensions = []
+    for file in files:
+        if file in _extensions_:
+            extensions.append(_extensions_.get(file, {}))
+        elif os.path.isfile(file):
+            try:
+                with open(file) as f:
+                    # linecache and compile are necessary only if we want to inspect code later,
+                    # otherwise the following statement is enough:
+                    # exec(f.read(), customizations)
+                    linecache.updatecache(f.name)
+                    code = compile(f.read(), f.name, 'exec')
+                    ext = {}
+                    exec(code, ext)
+                    _extensions_[file] = ext
+                    extensions.append(ext)
+            except Exception as e:
+                log.warning("Could not load extension file %s: %s", file, str(e))
+                _extensions_[file] = {}
+        else:
+            log.warning("No extensions available at %s", file)
+
+    return extensions if name is None else next((ext[name] for ext in extensions if name in ext), default)
+
+
+_data_keys_ = re.compile("^(X|y|data)(_.+)?$")


 def call_run(run_fn):
@@ -53,7 +85,7 @@ def call_run(run_fn):
     params = NS.from_dict(json.loads(sys.stdin.read()))

     def load_data(name, path, **ignored):
-        if isinstance(path, str) and data_keys.match(name):
+        if isinstance(path, str) and _data_keys_.match(name):
             return name, np.load(path, allow_pickle=True)
         return name, path

diff --git a/resources/config.yaml b/resources/config.yaml
index a8ad914d6..8487e7e62 100644
--- a/resources/config.yaml
+++ b/resources/config.yaml
@@ -50,6 +50,9 @@ benchmarks:
   max_mem_size_mb: -1   # default amount of memory assigned to each automl task. If <= 0, then the amount of memory is computed from os available memory.
   min_vol_size_mb: -1   # default minimum amount of free space required on the volume. If <= 0, skips verification.

+extensions_files:
+  - '{user}/extensions.py'
+
 results:
   error_max_length: 200
   save: true   # set by runbenchmark.py
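
For reference, below is a minimal sketch of what a user extensions file resolved through the new extensions_files setting (by default '{user}/extensions.py') could contain. It is an illustration only: the metric name mae_custom is made up, and the assumption that the benchmark's result object exposes truth and predictions arrays is not established by this diff. The framework-facing definition is the one looked up via get_extension(config.extensions, config.metric); the trailing-underscore variant is the amlb-compatible one that results.py calls with the result object.

# extensions.py -- hypothetical user extension file; names are illustrative only.
from sklearn.metrics import make_scorer, mean_absolute_error

# Framework-facing metric, resolved when config.metric == 'mae_custom' (assumed name).
# It must be whatever the target framework accepts, e.g. a scikit-learn scorer for TPOT.
mae_custom = make_scorer(mean_absolute_error, greater_is_better=False)

# amlb-compatible variant, marked by the trailing underscore and resolved in results.py
# via get_extension(rconfig().extensions_files, f"{metric}_"). It receives the result
# object, assumed here to expose `truth` and `predictions` arrays.
def mae_custom_(result):
    return mean_absolute_error(result.truth, result.predictions)

A benchmark run would then reference the custom metric by name, presumably by setting the task's metric field to 'mae_custom' in the benchmark definition.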