diff --git a/bin/check_testing_tool.py b/bin/check_testing_tool.py
new file mode 100644
index 00000000..e0da3cf3
--- /dev/null
+++ b/bin/check_testing_tool.py
@@ -0,0 +1,241 @@
+import shutil
+import sys
+from pathlib import Path
+from typing import Optional, Sequence
+
+import config
+import parallel
+from program import Program
+from run import Submission
+from util import *
+
+if TYPE_CHECKING:  # Prevent circular import: https://stackoverflow.com/a/39757388
+    from problem import Problem
+
+"""DISCLAIMER:
+
+    This tool only exists to make checking testing tools faster.
+    You should still carefully review the code of the testing tool.
+
+    For this tool to work, the following must hold:
+    - the testing tool must be found under `attachments/testing_tool.*` (or an `attachments/testing_tool/` directory)
+    - the testing tool must be callable as `{program} -f {in_path} {submission program}`
+    - the testing tool must accept the downloadable samples as well as those found under
+      `data/testing_tool_test/` as input files
+    - the testing tool must exit with a non-zero exit code if something goes wrong
+    - the testing tool must not change the working directory
+"""
+
+
+class TestInput:
+    def __init__(self, problem: "Problem", in_path: Path, short_path: Path):
+        assert in_path.suffix in [".in", ".download", ".statement"]
+        self.problem = problem
+        self.in_path = in_path
+        self.short_path = short_path
+        if self.short_path.suffix in [".download", ".statement"]:
+            ext = self.short_path.suffix
+            name = self.short_path.with_suffix("")
+            assert name.suffix in [".in"]
+            self.name = str(name.with_suffix(ext))
+        else:
+            self.name = str(self.short_path.with_suffix(""))
+
+
+class WrappedSubmission:
+    def __init__(self, problem: "Problem", submission: Submission):
+        self.problem = problem
+        self.submission = submission
+        self.name = submission.name
+        self.tmpdir = (
+            problem.tmpdir / "testing_tool" / submission.tmpdir.relative_to(problem.tmpdir)
+        )
+        self.tmpdir.mkdir(parents=True, exist_ok=True)
+        self.run_command: Optional[list[Path | str]] = None
+
+    def supports_memory_limit(self) -> bool:
+        assert self.run_command is not None
+        assert self.submission.run_command is not None
+        return command_supports_memory_limit(self.run_command) and command_supports_memory_limit(
+            self.submission.run_command
+        )
+
+    def _wrapper_script(self) -> str:
+        assert self.submission.run_command is not None
+        args = ", ".join(map(repr, self.submission.run_command))
+        # The script assumes that the working directory is not changed.
+        script = """#!/usr/bin/env python3
+import subprocess
+import sys
+from pathlib import Path
+
+result = subprocess.run(
+    [{args}],
+    stdout=sys.stdout,
+    stderr=sys.stderr,
+    stdin=sys.stdin,
+)
+returncode_file = Path(".returncode")
+# For multipass we store the first non-zero return code.
+write_returncode = True
+if returncode_file.is_file():
+    raw = returncode_file.read_text()
+    try:
+        if int(raw) != 0:
+            write_returncode = False
+    except ValueError:
+        pass
+if write_returncode:
+    returncode_file.write_text(f"{result.returncode}\\n")
+sys.exit(result.returncode)
+"""
+        return script.replace("{args}", args)
+
+    def build(self) -> None:
+        wrapper_file = self.tmpdir / "wrapper.py"
+        wrapper_file.write_text(self._wrapper_script())
+        self.run_command = [sys.executable, wrapper_file]
+
+    def run(self, bar: ProgressBar, testing_tool: "TestingTool", testinput: TestInput) -> bool:
+        assert self.run_command is not None
+        rundir = self.tmpdir / testinput.short_path
+        if rundir.is_file():
+            rundir.unlink()
+        elif rundir.exists():
+            shutil.rmtree(rundir)
rundir.mkdir(exist_ok=True, parents=True) + + returncode_file = rundir / ".returncode" + in_path = rundir / "testcase.in" + ensure_symlink(in_path, testinput.in_path) + + localbar = bar.start(testinput) + + result = testing_tool.run(in_path, self) + submission_returncode = None + submission_status = None + if returncode_file.is_file(): + raw = returncode_file.read_text() + try: + submission_returncode = int(raw) + submission_status = default_exec_code_map(submission_returncode) + except ValueError: + pass + ok = bool(result.status) and bool(submission_status) + + message = [] + if result.status == ExecStatus.TIMEOUT: + message.append("TIMEOUT") + elif not result.status: + message.append(f"Testing Tool exit code: {result.returncode}") + if ( + submission_status is not None + and not submission_status + and submission_status != ExecStatus.TIMEOUT + ): + message.append(f"Submission exit code: {submission_returncode}") + if not message: + message.append("OK") + + data = "" + if result.out and result.err: + data = ( + "TESTING TOOL STDERR:" + + localbar._format_data(result.err) + + "\nTESTING TOOL STDOUT:" + + localbar._format_data(result.out) + + "\n" + ) + elif result.err: + data = result.err + elif result.out: + data = result.out + + localbar.done(ok, ", ".join(message), data) + return ok + + +class TestingTool(Program): + def __init__(self, problem: "Problem", path: Path): + super().__init__( + problem, + path, + "testing_tool", + limits={ + "timeout": problem.limits.timeout, + "memory": problem.limits.memory, + }, + ) + + def run(self, in_path: Path, submission: WrappedSubmission) -> ExecResult: + assert self.run_command is not None + assert submission.run_command is not None + exec_res = self._exec_command( + [*self.run_command, "-f", in_path, *submission.run_command], + cwd=in_path.parent, + crop=True, + memory=self.limits["memory"] if submission.supports_memory_limit() else None, + ) + return exec_res + + +def run( + problem: "Problem", testinputs: Sequence[TestInput], submissions: Sequence[Submission] +) -> bool: + wrapped_submissions = [WrappedSubmission(problem, submission) for submission in submissions] + for submission in wrapped_submissions: + submission.build() + + tool_dir = problem.path / "attachments" / "testing_tool" + tool_files = list((problem.path / "attachments").glob("testing_tool.*")) + if (tool_dir.is_dir() and tool_files) or len(tool_files) > 1: + error("Multiple testing tools found!") + return False + elif not tool_dir.is_dir() and not tool_files: + error("No testing tool found!") + return False + + if tool_dir.is_dir(): + testing_tool = TestingTool(problem, tool_dir) + else: + testing_tool = TestingTool(problem, tool_files[0]) + + bar = ProgressBar("Building testing tool", items=[testing_tool]) + localbar = bar.start(testing_tool) + if not testing_tool.build(bar): + localbar.done(False) + return False + localbar.done() + bar.finalize(print_done=False) + + ok = True + + max_submission_len = max([len(x.name) for x in wrapped_submissions]) + max_testinput_len = max(len(x.name) for x in testinputs) + + # When True, the ProgressBar will print a newline before the first error log. 
+ needs_leading_newline = False if config.args.verbose else True + for submission in wrapped_submissions: + bar = ProgressBar( + submission.name, + count=len(testinputs), + max_len=max_testinput_len + max_submission_len - len(submission.name), + needs_leading_newline=needs_leading_newline, + ) + cur_ok = True + + def run_submission(testinput: TestInput) -> None: + nonlocal cur_ok + # skip after first error + if not cur_ok and not config.args.all: + bar.skip() + return + if not submission.run(bar, testing_tool, testinput): + # just writing False is thread safe + cur_ok = False + + parallel.run_tasks(run_submission, testinputs, pin=True) + ok &= cur_ok + needs_leading_newline = bar.finalize() + + return ok diff --git a/bin/constraints.py b/bin/constraints.py index 73145358..40ad81bc 100644 --- a/bin/constraints.py +++ b/bin/constraints.py @@ -1,4 +1,5 @@ import re +import sys from collections import defaultdict from typing import Optional @@ -29,7 +30,7 @@ def check_validators( problem.validate_data(validate.Mode.ANSWER, constraints=ans_constraints) if not problem.settings.ans_is_output and not ans_constraints: log("No constraint validation of answer values found in answer or output validators.") - print() + print(file=sys.stderr) validator_values: set[int | float] = set() validator_defs: list[str | tuple[int | float, str, int | float]] = [] @@ -275,6 +276,7 @@ def check_constraints(problem: Problem) -> bool: print( "{:^{width}}|{:^40}".format("VALIDATORS", "PROBLEM STATEMENT", width=left_width), sep="", + file=sys.stderr, ) while statement_defs or validator_defs: @@ -292,7 +294,7 @@ def check_constraints(problem: Problem) -> bool: if val is not None: validator_defs.remove(val) if isinstance(val, str): - print("{:^{width}}".format(val, width=left_width), sep="", end="") + print("{:^{width}}".format(val, width=left_width), sep="", end="", file=sys.stderr) else: print( "{:>{value_len}_} <= {:^{name_len}} <= {:<{value_len}_}".format( @@ -300,20 +302,21 @@ def check_constraints(problem: Problem) -> bool: ), sep="", end="", + file=sys.stderr, ) else: - print("{:^{width}}".format("", width=left_width), sep="", end="") - print("|", end="") + print("{:^{width}}".format("", width=left_width), sep="", end="", file=sys.stderr) + print("|", end="", file=sys.stderr) if st is not None: languages = ",".join(statement_defs[st]) - print("{:^40} {}".format(st, languages), sep="", end="") + print("{:^40} {}".format(st, languages), sep="", end="", file=sys.stderr) else: - print("{:^40}".format(""), sep="", end="") - print() + print("{:^40}".format(""), sep="", end="", file=sys.stderr) + print(file=sys.stderr) if st is not None: statement_defs.pop(st) - print() + print(file=sys.stderr) warned = False for value in validator_values: @@ -323,7 +326,11 @@ def check_constraints(problem: Problem) -> bool: if not warned: warned = True warn("Values in validators but missing in some statement:") - print(f"{Fore.YELLOW}{value}{Style.RESET_ALL} missing in", ",".join(missing)) + print( + f"{Fore.YELLOW}{value}{Style.RESET_ALL} missing in", + ",".join(missing), + file=sys.stderr, + ) extra_in_statement = set(statement_values.keys()).difference(validator_values) if extra_in_statement: @@ -332,6 +339,7 @@ def check_constraints(problem: Problem) -> bool: print( f"{Fore.YELLOW}{value}{Style.RESET_ALL} in", ",".join(sorted(statement_values[value])), + file=sys.stderr, ) return True diff --git a/bin/generate.py b/bin/generate.py index 7dec303a..8279e09f 100644 --- a/bin/generate.py +++ b/bin/generate.py @@ -661,6 +661,9 @@ def 
validate_in(t, problem: Problem, testcase: Testcase, meta_yaml: dict, bar: P infile = problem.tmpdir / "data" / t.hash / "testcase.in" assert infile.is_file() + if testcase.root == "testing_tool_test": + return True + input_validator_hashes = testcase.validator_hashes(validate.InputValidator, bar) if all(h in meta_yaml["input_validator_hashes"] for h in input_validator_hashes): return True @@ -705,7 +708,7 @@ def validate_ans_and_out( infile = problem.tmpdir / "data" / t.hash / "testcase.in" assert infile.is_file() - if testcase.root == "invalid_input": + if testcase.root in ["invalid_input", "testing_tool_test"]: return True ansfile = infile.with_suffix(".ans") @@ -939,7 +942,11 @@ def generate_from_rule(): def generate_from_solution(testcase: Testcase, bar: ProgressBar): nonlocal meta_yaml - if testcase.root in [*config.INVALID_CASE_DIRECTORIES, "valid_output"]: + if testcase.root in [ + *config.INVALID_CASE_DIRECTORIES, + "valid_output", + "testing_tool_test", + ]: return True if config.args.no_solution: return True @@ -1021,6 +1028,8 @@ def generate_visualization(testcase: Testcase, bar: ProgressBar): if testcase.root in config.INVALID_CASE_DIRECTORIES: return True + if testcase.root == "testing_tool_test": + return True if config.args.no_visualizer: return True @@ -1182,6 +1191,7 @@ def add_test_case_to_cache(): # consider specific files for the uniqueness of this testcase relevant_files = { + "testing_tool_test": [".in"], "invalid_input": [".in"], "invalid_answer": [".in", ".ans"], "invalid_output": [".in", ".ans", ".out"], @@ -2155,6 +2165,8 @@ def reorder(self): warn(f"{d} is used for invalid test data. Skipping.") elif parts[0] == "valid_output": warn(f"{d} is used for valid test data. Skipping.") + elif parts[0] == "testing_tool_test": + warn(f"{d} is used to test the testing tool. Skipping.") elif path not in self.known_directories: warn(f"{d} is not a generated directory. Skipping.") elif not self.known_directories[path].numbered: diff --git a/bin/problem.py b/bin/problem.py index f0ef73f2..19fe087b 100644 --- a/bin/problem.py +++ b/bin/problem.py @@ -17,6 +17,7 @@ import parallel import run import testcase +import check_testing_tool import validate import validator_tests import verdicts @@ -214,7 +215,7 @@ def __init__( if config.args.timeout: self.validation_time = self.generator_time = self.visualizer_time = config.args.timeout if config.args.memory: - self.memory = self.validation_memory = config.args.memory + self.memory = self.compilation_memory = self.validation_memory = config.args.memory class ProblemSettings: @@ -351,7 +352,7 @@ def __init__(self, path: Path, tmpdir: Path, label: Optional[str] = None): # Some caches. self._testcases = dict[ - tuple[Optional[validate.Mode], bool, bool], list[testcase.Testcase] + tuple[Optional[validate.Mode], bool, bool, bool], list[testcase.Testcase] ]() self._submissions: Optional[list[run.Submission] | Literal[False]] = None self._validators_cache = dict[ # The "bool" is for "check_constraints" @@ -629,17 +630,17 @@ def testcases( mode: Optional[validate.Mode] = None, needans=True, only_samples=False, + testing_tool_test=False, ) -> Sequence[testcase.Testcase]: only_samples = config.args.samples or only_samples - key = (mode, needans, only_samples) + key = (mode, needans, only_samples, testing_tool_test) if key in p._testcases is not None: return p._testcases[key] in_paths = None if config.args.testcases: - if only_samples: - assert False + assert not only_samples # Deduplicate testcases with both .in and .ans. 
in_paths = [] for t in config.args.testcases: @@ -654,6 +655,8 @@ def testcases( in_paths = list(set(in_paths)) elif mode is not None: + assert not only_samples + assert not testing_tool_test assert needans in_paths = [] for prefix in { @@ -663,6 +666,8 @@ def testcases( validate.Mode.VALID_OUTPUT: ["secret", "sample", "valid_output"], }[mode]: in_paths += glob(p.path, f"data/{prefix}/**/*.in") + elif testing_tool_test: + in_paths = list(glob(p.path, "data/testing_tool_test/**/*.in")) else: in_paths = list(glob(p.path, "data/sample/**/*.in")) if not only_samples: @@ -702,7 +707,7 @@ def testcases( testcases.append(t) testcases.sort(key=lambda t: t.name) - if len(testcases) == 0: + if len(testcases) == 0 and not testing_tool_test: ans = ( " with answer" if needans and mode not in [validate.Mode.INVALID, validate.Mode.VALID_OUTPUT] @@ -1010,7 +1015,7 @@ def _validators( paths = list(glob(problem.path / cls.source_dir, "*")) # TODO: Instead of checking file contents, maybe specify this in generators.yaml? - def has_constraints_checking(f): + def has_constraints_checking(f: Path) -> bool: if not f.is_file(): return False try: @@ -1042,7 +1047,7 @@ def has_constraints_checking(f): ] bar = ProgressBar(f"Building {cls.validator_type} validator", items=validators) - def build_program(p): + def build_program(p: "Program") -> None: localbar = bar.start(p) p.build(localbar) localbar.done() @@ -1054,7 +1059,9 @@ def build_program(p): return validators # get all testcases and submissions and prepare the output validator and visualizer - def prepare_run(problem): + def prepare_run( + problem, + ) -> Literal[False] | tuple[Sequence[testcase.Testcase], Sequence[run.Submission]]: testcases = problem.testcases() if not testcases: return False @@ -1074,7 +1081,9 @@ def prepare_run(problem): return testcases, submissions @staticmethod - def run_some(testcases, submissions): + def run_some( + testcases: Sequence[testcase.Testcase], submissions: Sequence[run.Submission] + ) -> tuple[bool, verdicts.VerdictTable]: max_submission_len = max([len(x.name) for x in submissions]) ok = True @@ -1093,7 +1102,7 @@ def run_some(testcases, submissions): return ok, verdict_table # called by bt run - def run_submissions(problem): + def run_submissions(problem) -> bool: ts_pair = problem.prepare_run() if not ts_pair: return False @@ -1119,7 +1128,7 @@ def run_submissions(problem): # Instead of validating the output, this function just prints all output to the # terminal. # Note: The CLI only accepts one submission. - def test_submissions(problem): + def test_submissions(problem) -> bool: submissions = problem.submissions() if submissions is False: return False @@ -1132,16 +1141,18 @@ def test_submissions(problem): return True @staticmethod - def _print_table(verdict_table, testcases): + def _print_table( + verdict_table: Sequence[verdicts.Verdicts], testcases: Sequence[testcase.Testcase] + ) -> None: # Begin by aggregating bitstrings for all testcases, and find bitstrings occurring often (>=config.TABLE_THRESHOLD). 
- def single_verdict(row, testcase): + def single_verdict(row: verdicts.Verdicts, testcase: testcase.Testcase) -> str: assert row[testcase.name] is not None if row[testcase.name] is not False: return verdicts.to_char(row[testcase.name]) else: return f"{Style.DIM}-{Style.RESET_ALL}" - def make_verdict(tc): + def make_verdict(tc: testcase.Testcase) -> str: return "".join(map(lambda row: single_verdict(row, tc), verdict_table)) resultant_count, resultant_id = dict[str, int](), dict[str, int]() @@ -1214,11 +1225,36 @@ def make_verdict(tc): print(str.format("(Type {})", resultant_id[resultant]), end="", file=sys.stderr) print(end="\n", file=sys.stderr) - def reset_testcase_hashes(self): - self._testcase_hashes = {} + # called by bt check_testing_tool + def check_testing_tool(problem) -> bool: + testcases = problem.testcases(needans=False, testing_tool_test=True) + testinputs = [ + check_testing_tool.TestInput(problem, t.in_path, t.short_path) for t in testcases + ] + if not config.args.testcases: + sampleinputs = [] + for in_path, _ in problem.download_samples(): + sample = check_testing_tool.TestInput( + problem, in_path, in_path.relative_to(problem.path / "data") + ) + if sample not in testinputs: + sampleinputs.append(sample) + testinputs = sampleinputs + testinputs + if not testinputs: + warn( + f"Didn't find any testcases to run the testing tool in problem {problem.name}. Skipping." + ) + return False + submissions = problem.selected_or_accepted_submissions() + if not submissions: + return False + return check_testing_tool.run(problem, testinputs, submissions) + + def reset_testcase_hashes(self) -> None: + self._testcase_hashes: dict[str, testcase.Testcase] = {} # Returns None for new testcases or the Testcase object it equals. - def matches_existing_testcase(self, t): + def matches_existing_testcase(self, t: testcase.Testcase) -> Optional[testcase.Testcase]: hashes = {} relevant_files = { "invalid_input": ["in"], @@ -1547,7 +1583,7 @@ def get_slowest(result): limits["time_limit"] = problem.limits.time_limit write_yaml(problem_yaml, problem.path / "problem.yaml") - print() + print(file=sys.stderr) message(f"{duration:.3f}s @ {testcase} ({submission})", "slowest AC") message( f"{problem.limits.time_limit}s >= {duration:.3f}s * {problem.limits.ac_to_time_limit}", @@ -1561,13 +1597,13 @@ def get_slowest(result): f"{problem.limits.timeout}s >= {problem.limits.time_limit}s * {problem.limits.time_limit_to_tle}²", "timeout", ) - print() + print(file=sys.stderr) submission, testcase, duration = run_all( lambda vs: vs == [verdicts.Verdict.TIME_LIMIT_EXCEEDED], min ) if submission is not None: - print() + print(file=sys.stderr) message(f"{duration:.3f}s @ {testcase} ({submission})", "fastest TLE") if duration <= problem.limits.time_limit: error("TLE submission runs within time limit") @@ -1575,7 +1611,7 @@ def get_slowest(result): warn("TLE submission runs within safety margin") elif duration >= problem.limits.timeout: log(f"No TLE submission finished within {problem.limits.timeout}s") - print() + print(file=sys.stderr) else: log("No TLE submissions found") diff --git a/bin/program.py b/bin/program.py index cde1633d..7786a3c1 100644 --- a/bin/program.py +++ b/bin/program.py @@ -106,7 +106,7 @@ def sanitizer(): # After build() has been called, the following are available: # - run_command: command to be executed. E.g. ['/path/to/run'] or ['python3', '/path/to/main.py']. `None` if something failed. # -# build() will return the (run_command, message) pair. 
+# build() will return True if building was successful.
 class Program:
     input_files: list[Path]  # Populated in Program.build
 
@@ -181,18 +181,18 @@ def __init__(
 
     # is file at path executable
     @staticmethod
-    def _is_executable(path):
-        return path.is_file() and (
-            path.stat().st_mode & (stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
+    def _is_executable(path: Path) -> bool:
+        return bool(
+            path.is_file() and (path.stat().st_mode & (stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH))
         )
 
     # Returns true when file f matches the given shebang regex.
     @staticmethod
-    def _matches_shebang(f, shebang):
+    def _matches_shebang(f: Path, shebang: Optional[re.Pattern]) -> bool:
         if shebang is None:
             return True
         with f.open() as o:
-            return shebang.search(o.readline())
+            return shebang.search(o.readline()) is not None
 
     # Do not warn for the same fallback language multiple times.
     warn_cache: set[str] = set()
@@ -200,7 +200,7 @@ def _matches_shebang(f, shebang):
     language: Optional[str]
 
     # Sets self.language and self.env['mainfile']
-    def _get_language(self, bar: ProgressBar):
+    def _get_language(self, bar: ProgressBar) -> bool:
         fallback = False
         candidates = []
         for lang in languages():
@@ -300,7 +300,7 @@ def _get_language(self, bar: ProgressBar):
             bar.error(f"No language detected for {self.path}.")
             return False
 
-    def _checks(self, bar: ProgressBar):
+    def _checks(self, bar: ProgressBar) -> None:
         for f in self.source_files:
             if f.stat().st_size >= config.ICPC_FILE_LIMIT * 1024**2:
                 bar.warn(
@@ -367,7 +367,7 @@ def _checks(self, bar: ProgressBar):
             pass
 
     # Return True on success.
-    def _compile(self, bar: ProgressBar):
+    def _compile(self, bar: ProgressBar) -> bool:
         meta_path = self.tmpdir / "meta_.yaml"
 
         # Remove all non-source files.
@@ -415,7 +415,7 @@ def _compile(self, bar: ProgressBar):
         return True
 
     # Return True on success, False on failure.
-    def build(self, bar: ProgressBar):
+    def build(self, bar: ProgressBar) -> bool:
         assert not self.built
         self.built = True
 
@@ -527,7 +527,7 @@ def _exec_command(self, *args, **kwargs) -> ExecResult:
         return exec_command(*args, **kwargs)
 
     @staticmethod
-    def add_callback(problem, path, c):
+    def add_callback(problem: "Problem", path: Path, c: Callable[["Program"], Any]):
         if path not in problem._program_callbacks:
             problem._program_callbacks[path] = []
         problem._program_callbacks[path].append(c)
@@ -547,7 +547,7 @@ def __init__(self, problem: "Problem", path: Path, **kwargs):
     # Run the generator in the given working directory.
     # May write files in |cwd| and stdout is piped to {name}.in if it's not written already.
     # Returns ExecResult. Success when result.status == ExecStatus.ACCEPTED.
- def run(self, bar, cwd, name, args=[]): + def run(self, bar: ProgressBar, cwd: Path, name: str, args: list[str] = []) -> ExecResult: assert self.run_command is not None in_path = cwd / (name + ".in") diff --git a/bin/run.py b/bin/run.py index c8d27039..4707fdd4 100644 --- a/bin/run.py +++ b/bin/run.py @@ -483,10 +483,7 @@ def process_run(run: Run): localbar.item_width = padding_len localbar.done(got_expected, message, data, print_item=False) - p = parallel.new_queue(process_run, pin=True) - for run in runs: - p.put(run) - p.done() + parallel.run_tasks(process_run, runs, pin=True) self.verdict = verdicts["."] assert isinstance(self.verdict, Verdict), "Verdict of root must not be empty" diff --git a/bin/stats.py b/bin/stats.py index 9f0740d7..ba357d12 100644 --- a/bin/stats.py +++ b/bin/stats.py @@ -278,7 +278,7 @@ def loc(file: Path) -> Optional[int]: content = file.read_text() lexer = lexers.guess_lexer_for_filename(file, content) assert isinstance(lexer, pygments.lexer.Lexer) - language = lexer.name.lower() + language = getattr(lexer, "name").lower() tokens = lexer.get_tokens(content) count = 0 diff --git a/bin/tools.py b/bin/tools.py index b1e5d0b9..fbf5dd96 100755 --- a/bin/tools.py +++ b/bin/tools.py @@ -288,7 +288,7 @@ def split_submissions_and_testcases(s: list[Path]) -> tuple[list[Path], list[Pat submissions = [] testcases = [] for p in s: - testcase_dirs = ["data", "sample", "secret", "fuzz"] + testcase_dirs = ["data", "sample", "secret", "fuzz", "testing_tool_cases"] if ( any(part in testcase_dirs for part in p.parts) or p.suffix in config.KNOWN_DATA_EXTENSIONS @@ -817,6 +817,35 @@ def build_parser() -> SuppressingParser: help="Override the default timeout. Default: 1.5 * time_limit + 1.", ) + checktestingtool = subparsers.add_parser( + "check_testing_tool", + parents=[global_parser], + help="Run testing_tool against some or all accepted submissions.", + ) + checktestingtool.add_argument( + "submissions", + nargs="*", + type=Path, + help="optionally supply a list of programs and testcases to run", + ) + checktestingtool.add_argument( + "--no-generate", + "-G", + action="store_true", + help="Do not run `generate` before running submissions.", + ) + checktestingtool.add_argument( + "--timeout", + type=int, + help="Override the default timeout. Default: 1.5 * time_limit + 1.", + ) + checktestingtool.add_argument( + "--all", + "-a", + action="store_true", + help="Run all testcases and don't stop on error.", + ) + # Sort subparsers.add_parser( "sort", parents=[global_parser], help="sort the problems for a contest by name" @@ -1087,7 +1116,7 @@ def run_parsed_arguments(args: argparse.Namespace, personal_config: bool = True) problems, tmpdir = get_problems(problem_dir) # Split submissions and testcases when needed. - if action in ["run", "fuzz", "time_limit"]: + if action in ["run", "fuzz", "time_limit", "check_testing_tool"]: if config.args.submissions: config.args.submissions, config.args.testcases = split_submissions_and_testcases( config.args.submissions @@ -1234,7 +1263,10 @@ def run_parsed_arguments(args: argparse.Namespace, personal_config: bool = True) if action in ["generate"]: success &= generate.generate(problem) - if action in ["all", "constraints", "run", "time_limit"] and not config.args.no_generate: + if ( + action in ["all", "constraints", "run", "time_limit", "check_testing_tool"] + and not config.args.no_generate + ): # Call `generate` with modified arguments. 
old_args = argparse.Namespace(**vars(config.args)) config.args.jobs = (os.cpu_count() or 1) // 2 @@ -1295,6 +1327,8 @@ def run_parsed_arguments(args: argparse.Namespace, personal_config: bool = True) success &= problem.test_submissions() if action in ["constraints"]: success &= constraints.check_constraints(problem) + if action in ["check_testing_tool"]: + problem.check_testing_tool() if action in ["time_limit"]: success &= problem.determine_time_limit() if action in ["zip"]: diff --git a/bin/util.py b/bin/util.py index eb584f6c..991b8f7c 100644 --- a/bin/util.py +++ b/bin/util.py @@ -1235,6 +1235,11 @@ def __init__( self.pass_id = pass_id +def command_supports_memory_limit(command: Sequence[str | Path]) -> bool: + # https://bugs.openjdk.org/browse/JDK-8071445 + return Path(command[0]).name not in ["java", "javac", "kotlin", "kotlinc", "sbcl"] + + def limit_setter( command: Optional[Sequence[str | Path]], timeout: Optional[int], @@ -1242,35 +1247,45 @@ def limit_setter( group: Optional[int] = None, cores: Literal[False] | list[int] = False, ) -> Callable[[], None]: + # perform all syscalls / things that could fail in the current context, i.e., outside of the preexec_fn + disable_stack_limit = not is_bsd() + + if config.args.memory: + memory_limit = config.args.memory if memory_limit: + memory_limit *= 1024**2 assert command is not None - jvm = Path(command[0]).name in ["java", "javac", "kotlin", "kotlinc"] + if not command_supports_memory_limit(command): + memory_limit = None + if config.args.sanitizer or is_bsd() or is_windows(): + memory_limit = None if group is not None: assert not is_windows() assert not is_mac() + if not is_windows() and not is_bsd(): + cores = False + + # actual preexec_fn called in the context of the new process + # this should only do resource and os calls to stay safe def setlimits() -> None: - if timeout: + if timeout is not None: resource.setrlimit(resource.RLIMIT_CPU, (timeout + 1, timeout + 1)) # Increase the max stack size from default to the max available. - if not is_bsd(): + if disable_stack_limit: resource.setrlimit( resource.RLIMIT_STACK, (resource.RLIM_INFINITY, resource.RLIM_INFINITY) ) - if memory_limit and not jvm and not is_bsd(): - resource.setrlimit( - resource.RLIMIT_AS, - (memory_limit * 1024**2, memory_limit * 1024**2), - ) + if memory_limit is not None: + resource.setrlimit(resource.RLIMIT_AS, (memory_limit, memory_limit)) - # TODO: with python 3.11 it is better to use Popen(process_group=group) if group is not None: os.setpgid(0, group) - if cores is not False and not is_windows() and not is_bsd(): + if cores is not False: os.sched_setaffinity(0, cores) # Disable coredumps. 
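
As an aside on the `limit_setter` refactor above: the point of the change is that the `preexec_fn` body is reduced to plain `resource`/`os` calls, with config lookups and platform checks resolved before the fork. The following standalone sketch illustrates that pattern only; it is not code from this PR, and the 256 MiB figure is an arbitrary example value.

```python
import resource
import subprocess
import sys

MEMORY_BYTES = 256 * 1024**2  # hypothetical limit, chosen only for illustration


def set_limits() -> None:
    # Runs in the forked child between fork() and exec();
    # keep it to simple resource/os calls, as limit_setter does.
    resource.setrlimit(resource.RLIMIT_AS, (MEMORY_BYTES, MEMORY_BYTES))


# POSIX only: preexec_fn is not available on Windows.
subprocess.run([sys.executable, "-c", "print('hello')"], preexec_fn=set_limits, check=True)
```
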
@@ -1344,7 +1359,7 @@ def exec_command(
     command: Sequence[str | Path],
     exec_code_map: Callable[[int], ExecStatus] = default_exec_code_map,
     crop: bool = True,
-    preexec_fn: bool | Callable[[], None] = True,
+    preexec_fn: bool = True,
     **kwargs: Any,
 ) -> ExecResult:
     # By default: discard stdout, return stderr
@@ -1368,21 +1383,13 @@ def exec_command(
 
     timeout: Optional[int] = None
     if "timeout" in kwargs:
-        if kwargs["timeout"] is None:
-            timeout = None
-        elif kwargs["timeout"]:
-            timeout = kwargs["timeout"]
+        timeout = kwargs["timeout"]
         kwargs.pop("timeout")
 
     memory: Optional[int] = None
     if "memory" in kwargs:
-        if kwargs["memory"] is not None:
-            memory = kwargs["memory"]
+        memory = kwargs["memory"]
         kwargs.pop("memory")
-    if config.args.memory:
-        memory = config.args.memory
-    if is_windows() or config.args.sanitizer:
-        memory = None
 
     process: Optional[ResourcePopen] = None
     old_handler = None
@@ -1401,7 +1408,7 @@ def interrupt_handler(sig: Any, frame: Any) -> None:
 
     tstart = time.monotonic()
     try:
-        if not is_windows() and preexec_fn not in [False, None]:
+        if not is_windows() and preexec_fn is not False:
             process = ResourcePopen(
                 command,
                 preexec_fn=limit_setter(command, timeout, memory),
diff --git a/bin/verdicts.py b/bin/verdicts.py
index c4996d2c..6a4a5f92 100644
--- a/bin/verdicts.py
+++ b/bin/verdicts.py
@@ -4,7 +4,7 @@
 import threading
 from enum import Enum
 from pathlib import Path
-from typing import Literal, TYPE_CHECKING
+from typing import Literal, Sequence, TYPE_CHECKING
 
 from colorama import Fore, Style
 
@@ -168,7 +168,7 @@ class Verdicts:
 
     def __init__(
         self,
-        test_cases_list: list[testcase.Testcase],
+        test_cases_list: Sequence[testcase.Testcase],
         timeout: int,
         run_until: RunUntil = RunUntil.FIRST_ERROR,
     ):
@@ -380,7 +380,7 @@ def __iter__(self):
     def __init__(
         self,
         submissions,
-        test_cases: list[testcase.Testcase],
+        test_cases: Sequence[testcase.Testcase],
         width: int = ProgressBar.columns,
         height: int = shutil.get_terminal_size().lines,
         max_name_width: int = 50,
diff --git a/doc/commands.md b/doc/commands.md
index a547fc78..6bff0491 100644
--- a/doc/commands.md
+++ b/doc/commands.md
@@ -26,6 +26,7 @@ This lists all subcommands and their most important options.
   - [`bt output [-v] [testcases [testcases ...]]`](#output)
   - [`bt validate [-v] [--input | --answer | --invalid | --valid-output | --generic [TYPE]] [--remove | --move-to DIR] [testcases [testcases ...]]`](#validate)
   - [`bt constraints [-v]`](#constraints)
+  - [`bt check_testing_tool [submissions [submissions ...]] [testcases [testcases ...]]`](#check_testing_tool)
 - Creating new contest/problems
   - [`bt new_contest [contestname]`](#new_contest)
   - [`bt new_problem [problemname] [--author AUTHOR] [--type {pass-fail,float,custom,interactive,...}] [--defaults] [--skel SKEL]`](#new_problem)
@@ -334,6 +335,24 @@ This output will look like:
 |               a_i              1
 ```
+
+## `check_testing_tool`
+
+`bt check_testing_tool` runs the testing tool against some or all accepted submissions to check that it works properly.
+However, this check has many caveats and should never replace a careful manual review of the testing tool.
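
As an illustration only (this skeleton is not part of this PR or of any problem package), a tool satisfying the documented contract accepts `-f <input file>` followed by the submission invocation, feeds the input to the submission, and exits non-zero when it detects a problem:

```python
#!/usr/bin/env python3
"""Minimal sketch of the interface `bt check_testing_tool` expects (illustrative only)."""
import argparse
import subprocess
import sys

parser = argparse.ArgumentParser()
parser.add_argument("-f", dest="inputfile", required=True, type=argparse.FileType("r"))
parser.add_argument("program", nargs="+", help="invocation of the submission to test")
args = parser.parse_args()

with args.inputfile as f:
    testdata = f.read()

# Feed the test input to the submission; a real tool would interact with it and verify its output.
result = subprocess.run(args.program, input=testdata, capture_output=True, text=True)

if result.returncode != 0 or not result.stdout.strip():
    print("submission failed or produced no output", file=sys.stderr)
    sys.exit(1)  # a non-zero exit code signals the failure to `bt check_testing_tool`
```
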
+
+**Caveats**
+- the testing tool must be found under `attachments/testing_tool.*` (or an `attachments/testing_tool/` directory)
+- the testing tool must be callable as `{program} -f {in_path} {submission program}`
+- the testing tool must accept the downloadable samples as well as files matching `data/testing_tool_test/*.in` as input files
+- the testing tool must exit with a non-zero exit code if something goes wrong
+- the testing tool must not change the working directory
+
+**Flags**
+
+- `--timeout <timeout>`: Override the default timeout.
+- `--all`/`-a`: Run all testcases and don't stop after the first error.
+- `--no-generate`/`-G`: Do not generate testcases before running. This usually won't be needed since checking that generated testcases are up to date is fast.
 
 # Creating a new contest/problem
 
 ## `new_contest`
diff --git a/test/problems/alternativeencryption/attachments/testing_tool.py b/test/problems/alternativeencryption/attachments/testing_tool.py
new file mode 100755
index 00000000..caaf4297
--- /dev/null
+++ b/test/problems/alternativeencryption/attachments/testing_tool.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+#
+# Testing tool for the Alternative Encryption problem
+#
+# Usage:
+#
+# python3 testing_tool.py -f inputfile <program>
+#
+#
+# Use the -f parameter to specify the input file, e.g. 1.in.
+# The input file should contain the following:
+# - The first line contains "encrypt".
+# - The second line contains an integer n, the number of strings.
+# - The following n lines each contain one string to encrypt.
+
+# You can compile and run your solution as follows:
+
+# C++:
+# g++ solution.cpp
+# python3 testing_tool.py -f 1.in ./a.out
+
+# Python:
+# python3 testing_tool.py -f 1.in python3 ./solution.py
+
+# Java:
+# javac solution.java
+# python3 testing_tool.py -f 1.in java solution
+
+# Kotlin:
+# kotlinc solution.kt
+# python3 testing_tool.py -f 1.in kotlin solutionKt
+
+
+# The tool is provided as-is, and you should feel free to make
+# whatever alterations or augmentations you like to it.
+#
+# The tool attempts to detect and report common errors, but it is not an exhaustive test.
+# It is not guaranteed that a program that passes this testing tool will be accepted.
+
+
+import argparse
+import subprocess
+import traceback
+
+parser = argparse.ArgumentParser(description="Testing tool for problem Alternative Encryption.")
+parser.add_argument(
+    "-f",
+    dest="inputfile",
+    metavar="inputfile",
+    default=None,
+    type=argparse.FileType("r"),
+    required=True,
+    help="The input file to use.",
+)
+parser.add_argument("program", nargs="+", help="Invocation of your solution")
+
+args = parser.parse_args()
+
+
+def single_pass(action: str, words: list[str]) -> list[str]:
+    with (
+        subprocess.Popen(
+            " ".join(args.program),
+            shell=True,
+            stdout=subprocess.PIPE,
+            stdin=subprocess.PIPE,
+            universal_newlines=True,
+        ) as p,
+    ):
+        assert p.stdin is not None and p.stdout is not None
+
+        raw = "\n".join([action, str(len(words)), *words])
+        (stdout, stderr) = p.communicate(input=raw)
+        output = [line.strip() for line in stdout.strip().split("\n") if line.strip()]
+
+        assert len(output) == len(words), (
+            f"Your submission printed {len(output)} words, expected {len(words)} words."
+ ) + print(f"{action} exit code: {p.returncode}") + print(f"{action} output:") + print() + print(stdout, flush=True) + + for word_a, word_b in zip(words, output): + assert len(word_a) == len(word_b), ( + f"Your submission changed the length of '{word_a}', you printed '{word_b}'" + ) + + for i, (char_a, char_b) in enumerate(zip(word_a, word_b), start=1): + assert char_a != char_b, ( + f"Letter at position {i} ({char_a}) is the same: '{word_a}' => '{word_b}'" + ) + + return output + + +try: + with args.inputfile as f: + # Parse input + lines = [line.strip() for line in f.readlines()] + action = lines[0] + n = int(lines[1]) + words = lines[2:] + + assert action == "encrypt", f"Initial action must be 'encrypt', but got {action}" + + encrypted = single_pass("encrypt", words) + decrypted = single_pass("decrypt", encrypted) + + for expected, got in zip(words, decrypted): + assert expected == got, f"Got decrypted word '{got}', expected '{expected}'" + + print("Success.") + +except AssertionError as e: + print() + print(f"Error: {e}") + print() + exit(1) + +except Exception: + print() + print("Unexpected error:") + traceback.print_exc() + print() + exit(1) diff --git a/test/problems/alternativeencryption/data/sample/001.ans b/test/problems/alternativeencryption/data/sample/01.ans similarity index 100% rename from test/problems/alternativeencryption/data/sample/001.ans rename to test/problems/alternativeencryption/data/sample/01.ans diff --git a/test/problems/alternativeencryption/data/sample/001.in b/test/problems/alternativeencryption/data/sample/01.in similarity index 100% rename from test/problems/alternativeencryption/data/sample/001.in rename to test/problems/alternativeencryption/data/sample/01.in diff --git a/test/problems/alternativeencryption/data/sample/001.interaction b/test/problems/alternativeencryption/data/sample/01.interaction similarity index 100% rename from test/problems/alternativeencryption/data/sample/001.interaction rename to test/problems/alternativeencryption/data/sample/01.interaction diff --git a/test/problems/alternativeencryption/generators/generators.yaml b/test/problems/alternativeencryption/generators/generators.yaml index 2d4f34a5..c63dc032 100644 --- a/test/problems/alternativeencryption/generators/generators.yaml +++ b/test/problems/alternativeencryption/generators/generators.yaml @@ -31,4 +31,4 @@ data: - random: generate: eval.py {seed} 1000 randstr(randrange(1, 101)) - count: 100 + count: 10 diff --git a/test/problems/alternativeencryption/submissions/accepted/Paul.kt b/test/problems/alternativeencryption/submissions/accepted/Paul.kt deleted file mode 100644 index bd8e05c3..00000000 --- a/test/problems/alternativeencryption/submissions/accepted/Paul.kt +++ /dev/null @@ -1,8 +0,0 @@ -import java.util.* - -fun main() { - readln() - for (i in 1..readln().toInt()) { - println(readln().trim().map { (((it.code - 1) xor 1) + 1).toChar() }.joinToString("")) - } -} diff --git a/test/test_problems.py b/test/test_problems.py index 4f0b7eee..1a372384 100644 --- a/test/test_problems.py +++ b/test/test_problems.py @@ -21,6 +21,7 @@ "interactivemultipass", "multipass", "constants", + "alternativeencryption", ] + ["hellounix" if not util.is_mac() and not util.is_windows() else []] RUN_DIR = Path.cwd().absolute() @@ -42,6 +43,28 @@ def test_problem(self): tools.test(["run"]) +@pytest.fixture(scope="class") +def setup_alternativeencryption_problem(request): + problem_dir = RUN_DIR / "test/problems/alternativeencryption" + os.chdir(problem_dir) + try: + tools.test(["tmp", 
"--clean"]) + yield + finally: + tools.test(["tmp", "--clean"]) + os.chdir(RUN_DIR) + + +@pytest.mark.usefixtures("setup_alternativeencryption_problem") +class TestAlternativeencryptionProblem: + def test_check_testing_tool(self): + tools.test(["check_testing_tool"]) + + def test_bad_check_testing_tool(self): + with pytest.raises(SystemExit): + tools.test(["check_testing_tool", "submissions/wrong_answer/no-change.py"]) + + @pytest.fixture(scope="class") def setup_constants_problem(request): problem_dir = RUN_DIR / "test/problems/constants"