diff --git a/bin/expectations.py b/bin/expectations.py
new file mode 100644
index 000000000..7337f5515
--- /dev/null
+++ b/bin/expectations.py
@@ -0,0 +1,409 @@
+"""Expectations for a submission
+
+Here is a sample expectations.yaml file:
+
+    accepted/: accepted          # Every submission in accepted/* should be accepted
+    wrong_answer/th.py:          # This particular submission ...
+      sample: accepted           # ... should be accepted on sample
+      secret: wrong answer       # ... but fail with WA on some test case in secret
+    mixed/failing.java:          # For this particular submission, ...
+      secret/huge/graph07:       # ... on this particular test case ...
+        permitted: [TLE, RTE]    # ... only TLE and RTE are permitted
+
+A yaml parser will turn this into a dict that can be fed to the Registry class:
+
+>>> exp_dict = {
+...     "accepted/": "accepted",
+...     "wrong_answer/th.py": {"sample": "accepted", "secret": "wrong answer"},
+...     "mixed/failing.java": {"secret/huge/graph07": {"permitted": ["TLE", "RTE"]}},
+...     "mixed/": {"sample": "accepted"}
+... }
+
+
+>>> registry = Registry.from_dict(exp_dict)
+>>> registry['mixed/']
+'sample': {permitted: {AC}, required: None}
+
+Expectations for a single submission can now be extracted from
+the registry. Here, the submission `mixed/failing.java` matches two patterns,
+so those will be the expectations that apply to that submission.
+
+>>> sub_registry = registry.for_path(Path("mixed/failing.java"))
+>>> sorted(sub_registry.keys())
+['mixed/', 'mixed/failing.java']
+
+Expectations for a submission can be compared with actual validation
+results. This runs all (in this case, both) sets of expectations
+against the results.
+
+>>> results_ac = { "sample/1": "AC", "secret/1": "AC", "secret/2": "AC" }
+>>> results_wa = { "sample/1": "WA", "secret/1": "AC", "secret/2": "WA" }
+>>> sub_registry.check(results_ac)
+True
+>>> sub_registry.check(results_wa)
+False
+
+
+Alternatively, supply a submission path to check the submission and results
+directly against the expectations registry.
+
+>>> registry.for_path(Path("accepted/ragnar.cpp")).check(results_ac)
+True
+>>> registry.for_path(Path("accepted/ragnar.cpp")).check(results_wa)
+False
+>>> registry.for_path(Path("wrong_answer/th.py")).check(results_wa)
+False
+>>> results_wa_secret = { "sample/1": "AC", "secret/1": "AC", "secret/2": "WA" }
+>>> registry.for_path(Path("wrong_answer/th.py")).check(results_wa_secret)
+True
+
+Expectations whose test case patterns match none of the results are vacuously satisfied:
+>>> registry.for_path(Path("mixed/failing.java")).check(results_wa_secret)
+True
+
+Terminology
+-----------
+
+verdict
+    A testcase can have a verdict, which is any of 'AC', 'WA', 'RTE', 'TLE'.
+    (Note that the verdict 'JE' is never expected.)
+
+result
+    a verdict for a path representing a testcase, like "TLE" for "secret/huge/random-01"
+
+score
+    A finite number, often just an integer in the range {0, ..., 100}, but can be a float.
+    NOT IMPLEMENTED
+
+range
+    A string of two space-separated numbers, like '0 30' or '-inf 43' or '3.14 3.14';
+    a one-value range can be abbreviated: '5' is the range '5 5'.
+ NOT IMPLEMENTED +""" + +from pathlib import Path +import re + + +class TestCasePattern(str): + """A pattern that matches against testgroups and -cases.""" + + def __new__(cls, content): + if content != "" and not content.startswith("sample") and not content.startswith("secret"): + raise ValueError(f"Unexpected test case pattern {content}") + return super().__new__(cls, content) + + +class BaseExpectations: + """Base expectations.""" + + def __init__(self, expectations: str | list[int | float] | dict): + self._permitted_verdicts: set[str] | None = None + self._required_verdicts: set[str] | None = None + + if isinstance(expectations, str): + self._set_common(expectations) + elif isinstance(expectations, list): + raise ValueError("Range expecations not implemented") + elif isinstance(expectations, dict): + for k, val in expectations.items(): + if k == "permitted": + self._permitted_verdicts = val if isinstance(val, set) else set(val) + elif k == "required": + self._required_verdicts = val if isinstance(val, set) else set(val) + elif k in ["judge_message", "score", "fractional_score"]: + raise ValueError(f"Key {k} not implemented") + else: + raise ValueError(f"Unrecognised key {k}") + + def permitted_verdicts(self) -> set[str]: + """Returns a set of verdicts.""" + return self._permitted_verdicts or set(["AC", "WA", "TLE", "RTE"]) + + def required_verdicts(self) -> set[str]: + """Returns a set of verdicts.""" + return self._required_verdicts or set() + + def _set_common(self, abbreviation): + permissions = None + requirements = None + if abbreviation == "accepted": + permissions = set(["AC"]) + elif abbreviation == "wrong answer": + permissions = set(["AC", "WA"]) + requirements = set(["WA"]) + elif abbreviation == "time limit exceeded": + permissions = set(["AC", "TLE"]) + requirements = set(["TLE"]) + elif abbreviation == "runtime exception": + permissions = set(["AC", "RTE"]) + requirements = set(["RTE"]) + elif abbreviation == "does not terminate": + permissions = set(["AC", "RTE", "TLE"]) + requirements = set(["RTE", "TLE"]) + elif abbreviation == "not accepted": + requirements = set(["RTE", "TLE", "WA"]) + else: + assert False, f"unknown abbreviation {abbreviation}" + if permissions is not None: + self._permitted_verdicts = permissions + if requirements is not None: + self._required_verdicts = requirements + + def __repr__(self): + + def sorted_set_str(verdicts: set|None) -> str: + if verdicts is None: + return "None" + else: + return "{" + ", ".join(sorted(verdicts)) + "}" + + + return (f"permitted: {sorted_set_str(self._permitted_verdicts)}, " + + f"required: {sorted_set_str(self._required_verdicts)}") + + +class Expectations(dict[TestCasePattern, BaseExpectations]): + """The expectations for a submission pattern; it maps testcase patterns + to BaseExpectations. 
+ + >>> e = Expectations("accepted") + >>> e + '': {permitted: {AC}, required: None} + >>> e.permitted_verdicts_for_testcase(Path("sample/1")) + {'AC'} + + Specify expectations by testgroup: + + >>> f = Expectations({'': 'wrong answer', 'sample': 'accepted', 'secret': 'wrong answer'}) + >>> f['sample'] + permitted: {AC}, required: None + + Or by testcase + >>> list(sorted(f.for_testcase('sample/1').keys())) + ['', 'sample'] + """ + + def __init__(self, expectations: str | list[int | float] | dict): + """ + Arguments + --------- + + expectations + list of common expectations, or range, or map + """ + + self.data: dict[str, BaseExpectations] = dict() + + if not isinstance(expectations, dict): + expectations = {"": expectations} + for k, val in expectations.items(): + if not (k == "" or k.startswith("sample") or k.startswith("secret")): + raise ValueError(f"Unexpected test data pattern: {k}") + self[TestCasePattern(k)] = BaseExpectations(val) + + def for_testcase(self, path: Path) -> dict[TestCasePattern, BaseExpectations]: + """Returns a dictionary over the patterns that apply for the given test case path. + + >>> e = Expectations( {'secret': { 'permitted': ['AC', 'TLE', 'WA']}, + ... 'secret/(tc)?[0-9]+-huge': { 'permitted': ['TLE'] }, + ... 'secret/[0-9]+-disconnected': { 'permitted': ['WA'] }}) + >>> list(sorted(e.for_testcase("secret/tc05-huge").keys())) + ['secret', 'secret/(tc)?[0-9]+-huge'] + >>> list(sorted(e.for_testcase("secret/05-disconnected").keys())) + ['secret', 'secret/[0-9]+-disconnected'] + >>> list(sorted(e.for_testcase("secret/abc-disconnected").keys())) + ['secret'] + """ + + return { + pattern: expectations + for pattern, expectations in self.items() + if re.match(pattern, str(path)) + } + + def permitted_verdicts_for_testcase(self, path: Path) -> set[str]: + """Returns a set of verdicts that is permitted at the given test case path. + + Permissions are restrictions, so that if several permissions apply, + their *intersection* is permitted + + >>> e = Expectations( {'secret': { 'permitted': ['AC', 'TLE']}, + ... 'secret/foo': { 'permitted': ['RTE', 'TLE'] }}) + >>> e.permitted_verdicts_for_testcase("secret/foo") + {'TLE'} + """ + permitted_verdicts = set(["AC", "TLE", "WA", "RTE"]) + for exp in self.for_testcase(path).values(): + permitted_verdicts &= exp.permitted_verdicts() + return permitted_verdicts + + def is_permitted(self, verdict: str, path: Path): + """Is the result permitted for the testcase at the given path? + + Accepts verdicts in long form. (Maybe it shouldn't.) + """ + return verdict in self.permitted_verdicts_for_testcase(path) + + def missing_required_verdicts( + self, verdict_for_testcase: dict[Path, str] + ) -> dict[TestCasePattern, set[str]]: + """Which verdicts are missing? + + Returns a map of expectation patterns to sets of verdicts. 
+
+        >>> e = Expectations("does not terminate")
+        >>> results = {"sample/1": "AC", "secret/1": "AC", "secret/2": "WA"}
+        >>> e.missing_required_verdicts(results) == {'': {'RTE', 'TLE'}}
+        True
+        >>> results = {"sample/1": "AC", "secret/1": "TLE", "secret/2": "WA"}
+        >>> e.missing_required_verdicts(results)
+        {}
+        """
+
+        missing = dict()
+        for tcpattern, exp in self.items():
+            if not exp.required_verdicts():
+                continue
+            for testcase, verdict in verdict_for_testcase.items():
+                if re.match(tcpattern, str(testcase)) and verdict in exp.required_verdicts():
+                    break
+            else:
+                missing[tcpattern] = exp.required_verdicts()
+        return missing
+
+    def is_satisfied_by(self, results: dict[Path, str]) -> bool:
+        """Are all requirements satisfied?"""
+        missing = self.missing_required_verdicts(results)
+        return all(self.is_permitted(results[path], path) for path in results) and all(
+            not missing_verdict for missing_verdict in missing.values()
+        )
+
+    def __repr__(self):
+        return ', '.join(f"'{k}': {{{repr(v)}}}" for k, v in self.items())
+
+
+class Registry(dict[str, Expectations]):
+    """A dictionary-like class that maps submission patterns to expectations."""
+
+    @staticmethod
+    def from_dict(dictionary):
+        """Factory method."""
+        return Registry({k: Expectations(v) for k, v in dictionary.items()})
+
+    def for_path(self, path: Path):
+        """Return a restricted Registry where all patterns
+        match the given path.
+
+        >>> registry = Registry({
+        ...     'accepted': Expectations('accepted'),
+        ...     'accepted/th': Expectations({'sample': 'accepted'}),
+        ...     'wrong_answer': Expectations('wrong answer')
+        ...     })
+        >>> for k, v in registry.for_path(Path('accepted/th.py')).items():
+        ...     print(k, ":", v)
+        accepted : '': {permitted: {AC}, required: None}
+        accepted/th : 'sample': {permitted: {AC}, required: None}
+
+
+        A registry is just a dict; you can add more expectations to it
+        with the normal syntax:
+
+        >>> registry['wrong_answer/greedy.py'] = Expectations({'sample': 'accepted'})
+        >>> for k, v in registry.for_path(Path('wrong_answer/greedy.py')).items():
+        ...     print(k, ":", v)
+        wrong_answer : '': {permitted: {AC, WA}, required: {WA}}
+        wrong_answer/greedy.py : 'sample': {permitted: {AC}, required: None}
+
+        path:
+            a pathlib.Path to a submission
+        """
+        return Registry(
+            {
+                pattern: expectation
+                for pattern, expectation in self.items()
+                if re.match(pattern, str(path))
+            }
+        )
+
+    def is_permitted(self, verdict, testcase: Path) -> bool:
+        """Is the verdict permitted at the given testcase path
+        by every expectation in this registry?"""
+
+        return all(e.is_permitted(verdict, testcase) for e in self.values())
+
+    def violated_permissions(
+        self, verdict, testcase: Path
+    ) -> list[tuple[str, TestCasePattern, set[str]]]:
+        """Which permissions are violated by the given verdict for the given testcase?
+
+        Return:
+            A list of tuples; each tuple consists of
+            - the submission pattern
+            - the test case pattern
+            - the set of verdicts that was permitted
+            The list is sorted; in the typical case this means that less
+            specific rules come first.
+        """
+        violations = []
+        for prefix, expectation in self.items():
+            for pattern, base in expectation.for_testcase(testcase).items():
+                permitted_verdicts = base.permitted_verdicts()
+                if verdict in permitted_verdicts:
+                    continue
+                violations.append((prefix, pattern, permitted_verdicts))
+        return list(sorted(violations))
+
+    def unsatisfied_requirements(
+        self, verdict_for_testcase: dict[Path, str]
+    ) -> list[tuple[str, TestCasePattern, set[str]]]:
+        """Which requirements are not satisfied by the given results?
+ + Paramters: + verdict_for_testcase: + a mapping of testcase path to verdict + + Return: + A list of tuples; each tuple consists of + - the submissions pattern + - the test case pattern + - the set of verdicts that was required + The list is sorted; in the typical case this means that less + specific rules come first. + """ + missing = [] + for prefix, expectations in self.items(): + missing_verdicts = expectations.missing_required_verdicts(verdict_for_testcase) + for pattern, verdicts in missing_verdicts.items(): + missing.append((prefix, pattern, verdicts)) + + return missing + + def check(self, results) -> bool: + """Do the results satisfy all the expectations? + + Note that expectations compose in different ways; + permissions are subtractive, requirements additive. + + >>> registry = Registry( + ... a= Expectations({"sample": { 'permitted': ['AC', 'WA']}}), + ... b= Expectations({"sample": { 'permitted': ['AC', 'TLE']}}) + ... ) + >>> for v in ['AC', 'TLE', 'WA']: + ... result = {'sample': v } + ... print(f"{v}:", registry.check(result)) + AC: True + TLE: False + WA: False + + Typically, the expectations registered for a submission have + patterns like `secret` and `secret/huge` rather than mutually + exclusive `a` and `b`, and then this mechanism allows increasingly + fine-grained specification. + """ + return all(e.is_satisfied_by(results) for e in self.values()) + + +if __name__ == "__main__": + import doctest + + doctest.testmod() diff --git a/bin/problem.py b/bin/problem.py index 9dd398695..46daf4b8d 100644 --- a/bin/problem.py +++ b/bin/problem.py @@ -5,8 +5,10 @@ import sys from pathlib import Path +from functools import lru_cache import config +import expectations import parallel import program import run @@ -37,6 +39,7 @@ def __init__(self, path, tmpdir, label=None): self._program_callbacks = dict() # Dictionary from path to parsed file contents. self._testdata_yamls = dict() + self._expectations_registry = None # The label for the problem: A, B, A1, A2, X, ... self.label = label @@ -438,6 +441,23 @@ def build_program(p): problem._validators[key] = validators return validators + + # TODO Q from Thore: ok to use self here instead of problem? + def get_expectations_registry(self): + """ Parse yaml file (if any) describing the expectations for this problem. + """ + if self._expectations_registry is None: + path = self.path / 'submissions' / 'expectations.yaml' + if has_ryaml: + try: + yamldata = read_yaml_settings(path) + except ruamel.yaml.scanner.ScannerError: + fatal('Make sure problem.yaml does not contain any more {% ... %}.') + else: + yamldata = read_yaml_settings(path) + self._expectations_registry = expectations.Registry.from_dict(yamldata) + return self._expectations_registry + def run_submissions(problem): needans = False if problem.interactive else True diff --git a/bin/run.py b/bin/run.py index cb98e7384..7caba61c4 100644 --- a/bin/run.py +++ b/bin/run.py @@ -328,7 +328,7 @@ def __init__(self, problem, path, skip_double_build_warning=False): # The first element will match the directory the file is in, if possible. self.expected_verdicts = self._get_expected_verdicts() - + self.expectations = self.problem.get_expectations_registry().for_path(self.short_path) # NOTE: Judging of interactive problems on systems without `os.wait4` is # suboptimal because we cannot determine which of the submission and # interactor exits first. 
Thus, we don't distinguish the different non-AC @@ -400,9 +400,10 @@ def _get_expected_verdicts(self): verdicts = [subdir] else: if len(verdicts) == 0: - error( - f'Submission {self.short_path} must have @EXPECTED_RESULTS@. Defaulting to ACCEPTED.' - ) + pass # TODO (Thore): made this shut up! + #error( + # f'Submission {self.short_path} must have @EXPECTED_RESULTS@. Defaulting to ACCEPTED.' + #) if len(verdicts) == 0: verdicts = ['ACCEPTED'] @@ -452,9 +453,10 @@ def run_all_testcases( verdict = (-100, 'ACCEPTED', 'ACCEPTED', 0) # priority, verdict, print_verdict, duration verdict_run = None + verdict_for_testcase = dict() def process_run(run, p): - nonlocal max_duration, verdict, verdict_run + nonlocal max_duration, verdict, verdict_run, verdict_for_testcase localbar = bar.start(run) result = run.run() @@ -476,7 +478,10 @@ def process_run(run, p): if table_dict is not None: table_dict[run.name] = result.verdict == 'ACCEPTED' - got_expected = result.verdict in ['ACCEPTED'] + self.expected_verdicts + verdict_short = short_verdict(result.verdict) + verdict_for_testcase[run.name] = verdict_short + #got_expected = result.verdict in ['ACCEPTED'] + self.expected_verdicts + got_expected = self.expectations.is_permitted(verdict_short, run.name) # Print stderr whenever something is printed if result.out and result.err: @@ -514,8 +519,17 @@ def process_run(run, p): data += '\n' data += f'{f.name}:' + localbar._format_data(t) + '\n' + if not got_expected: + localbar.error(f'{result.duration:6.3f}s {result.print_verdict()}', data) + short = short_verdict(result.verdict) + for prefix, pattern, verdicts in self.expectations.violated_permissions(short, run.name): + prefix = (f'{Fore.CYAN}{prefix:>{len(localbar.prefix)}}{Style.RESET_ALL}:' + + f'{pattern:<{localbar.item_width}}') + localbar.warn(f"permits {verbose_verdicts(verdicts)}", prefix=prefix) + localbar.done(got_expected, f'{result.duration:6.3f}s {result.print_verdict()}', data) + # Lazy judging: stop on the first error when not in verbose mode. if ( not config.args.verbose and not config.args.table @@ -534,16 +548,22 @@ def process_run(run, p): self.print_verdict = verdict[2] self.duration = max_duration + # Check presence of required verdicts among testgroups + for prefix, pattern, verdicts in self.expectations.unsatisfied_requirements(verdict_for_testcase): + prefix = (f'{Fore.CYAN}{prefix:>{len(bar.prefix)}}{Style.RESET_ALL}: ' + + f'{pattern:<{bar.item_width}}') + bar.warn(f"no test case got {verbose_verdicts(verdicts)}", prefix=prefix) + # Use a bold summary line if things were printed before. 
         if bar.logged:
             color = (
                 Style.BRIGHT + Fore.GREEN
-                if self.verdict in self.expected_verdicts
+                if self.expectations.is_permitted(short_verdict(self.verdict), Path())
                 else Style.BRIGHT + Fore.RED
             )
             boldcolor = Style.BRIGHT
         else:
-            color = Fore.GREEN if self.verdict in self.expected_verdicts else Fore.RED
+            color = Fore.GREEN if self.expectations.is_permitted(short_verdict(self.verdict), Path()) else Fore.RED
             boldcolor = ''
 
         printed_newline = bar.finalize(
diff --git a/bin/util.py b/bin/util.py
index 29bbaaf63..f6bbd63cb 100644
--- a/bin/util.py
+++ b/bin/util.py
@@ -186,6 +186,7 @@ def clearline(self):
             return
         print(self.carriage_return, end='', flush=True, file=sys.stderr)
 
+    @staticmethod
     def action(prefix, item, width=None, total_width=None):
         if width is not None and total_width is not None and len(prefix) + 2 + width > total_width:
             width = total_width - len(prefix) - 2
@@ -273,7 +274,7 @@ def _format_data(data):
 
     # Log can be called multiple times to make multiple persistent lines.
     # Make sure that the message does not end in a newline.
-    def log(self, message='', data='', color=Fore.GREEN, *, resume=True):
+    def log(self, message='', data='', color=Fore.GREEN, *, resume=True, prefix=None):
         with self.lock:
             if message is None:
                 message = ''
@@ -292,7 +293,7 @@ def log(self, message='', data='', color=Fore.GREEN, *, resume=True):
 
                 self.needs_leading_newline = False
             print(
-                self.get_prefix(),
+                self.get_prefix() if prefix is None else prefix,
                 color,
                 message,
                 ProgressBar._format_data(data),
@@ -313,9 +314,9 @@ def debug(self, message, data=''):
         if config.args.verbose:
             self.log(message, data)
 
-    def warn(self, message='', data=''):
+    def warn(self, message='', data='', prefix=None):
         config.n_warn += 1
-        self.log(message, data, Fore.YELLOW)
+        self.log(message, data, Fore.YELLOW, prefix=prefix)
 
     # Error removes the current item from the in_progress set.
     def error(self, message='', data=''):
@@ -981,3 +982,33 @@ def combine_hashes_dict(d):
         if d[key] is not None:
             hasher.update(d[key].encode())
     return hasher.hexdigest()
+
+
+def short_verdict(verdict):
+    return {
+        'ACCEPTED': 'AC',
+        'WRONG_ANSWER': 'WA',
+        'RUN_TIME_ERROR': 'RTE',
+        'TLE (aborted)': 'TLE',
+        'TIME_LIMIT_EXCEEDED': 'TLE'
+    }[verdict]
+
+def verbose_verdicts(verdicts: str | set[str], oxford_comma=True) -> str:
+    long_form = {
+        'AC': 'ACCEPTED',
+        'WA': 'WRONG_ANSWER',
+        'RTE': 'RUN_TIME_ERROR',
+        'TLE': 'TLE (aborted)'
+    }
+    if isinstance(verdicts, str):
+        verdicts = {verdicts}
+    verdicts = list(sorted(verdicts))
+    if len(verdicts) == 1:
+        return long_form[verdicts[0]]
+    elif len(verdicts) == 2:
+        return f"{long_form[verdicts[0]]} or {long_form[verdicts[1]]}"
+    else:
+        assert len(verdicts) == 3, len(verdicts)
+        return f"{long_form[verdicts[0]]}, {long_form[verdicts[1]]}, or {long_form[verdicts[2]]}"
+
+
diff --git a/doc/expectations.md b/doc/expectations.md
new file mode 100644
index 000000000..9aef737b6
--- /dev/null
+++ b/doc/expectations.md
@@ -0,0 +1,279 @@
+# Expectations
+
+This framework allows problem authors to express their expectations for the behaviour of a submission on the test data.
+
+## Test Case Verdict
+
+The behaviour of a submission on a _single_ test case is summarised in a *verdict*.
+
+The verdicts are:
+
+* AC: Accepted. The submission terminates successfully within the time limit and the output validator accepts the submission output.
+* WA: The submission terminates successfully within the time limit, but the output validator rejects the submission output on this test case.
+* TLE: The submission does not terminate within the time limit.
+* RTE: The submission aborts within the time limit with a runtime error.
+
+
+## Common Expectations for a Submission
+
+
+The expected behaviour of a submission on the test data often falls into a number of common classes:
+
+* accepted: Every test case is `AC`.
+* wrong answer: Every test case receives `AC` or `WA`; _some_ test case receives `WA`.
+* time limit exceeded: Every test case receives `AC` or `TLE`; _some_ test case receives `TLE`.
+* runtime exception: Every test case receives `AC` or `RTE`; _some_ test case receives `RTE`.
+* does not terminate: Every test case receives `AC`, `RTE`, or `TLE` (but not `WA`); _some_ test case receives `RTE` or `TLE`.
+* not accepted: Not every test case receives `AC`. This is the complement of _accepted_.
+
+In general, an expectation consists of a set of _permitted_ verdicts and a set of _required_ verdicts.
+
+* Every test case must receive one of the permitted verdicts. If no permitted verdicts are specified, _all_ verdicts are permitted.
+* Some test case must receive one of the required verdicts. If no required verdicts are specified, _no_ verdict is required.
+
+Thus, the common expectations above can be spelt out in terms of lists of verdicts.
+For instance, for the submission `mysubmission.py`:
+
+```yaml
+mysubmission.py: accepted
+```
+
+is the same as
+
+```yaml
+mysubmission.py:
+  permitted: [AC]
+```
+Similarly,
+
+```yaml
+mysubmission.py: time limit exceeded
+```
+
+is the same as
+
+```yaml
+mysubmission.py:
+  permitted: [AC, TLE]
+  required: [TLE]
+```
+
+## Specifying Expectations
+
+Expectations can be provided for a group of submissions or for a single submission in
+the file `/submissions/expectations.yaml`.
+Submission patterns match by prefix, so it is easy to specify the expected behaviour of submissions by placing them into various subdirectories of `/submissions`.
+The conventional directory layout is specified like this:
+
+```yaml
+accepted: accepted
+wrong_answer: wrong answer
+time_limit_exceeded: time limit exceeded
+runtime_exception: runtime exception
+```
+This would associate the expectation “accepted” with the submission `/submissions/accepted/mysubmission.cpp`.
+The flexibility of the expectations framework is that it is agnostic about directory names; for instance, you can put your crashing submissions in `/submissions/run_time_error/` and put other requirements on the submissions in `/submissions/mixed/`:
+
+```yaml
+run_time_error: runtime exception
+mixed:
+  permitted: [AC, WA, TLE]
+  required: [WA]
+```
+
+## Specification per Submission
+
+Submission patterns are matched by prefix, so instead of directories you can specify individual submissions:
+
+```yaml
+mixed/alice.py:
+  permitted: [AC, WA, TLE]
+  required: [WA]
+mixed/bob.py:
+  permitted: [AC, WA, TLE]
+  required: [TLE]
+```
+
+## Specification per Test Data
+
+Top-level expectations apply to all test data, but you can be more fine-grained and specify expectations for subdirectories of `/data`.
+For instance, if you want all submissions in `wrong_answer` to pass the sample inputs, you’d write:
+
+```yaml
+wrong_answer:
+  sample: accepted
+  secret: wrong answer
+```
+
+# Schema
+
+Here is the specification (in CUE syntax):
+
+```cue
+#registry
+
+#registry: close({ [string]: #root })
+
+#verdict: "AC" | "WA" | "RTE" | "TLE"
+#verdicts: [...#verdict]
+
+#root: {
+    #expectations
+    [=~"^(sample|secret)"]: #expectations
+}
+
+#expectations: {
+    #common
+    #range
+    permitted?: #verdicts  // only these verdicts may appear
+    required?: #verdicts   // at least one of these verdicts must appear
+    judge_message?: string // this judge message must appear
+    score?: #range
+    fractional_score?: #fractional_range
+}
+
+#common: "accepted" |            // { permitted: [AC]; required: [AC] }
+    "wrong answer" |             // { permitted: [AC, WA]; required: [WA] }
+    "time limit exceeded" |      // { permitted: [AC, TLE]; required: [TLE] }
+    "runtime exception" |        // { permitted: [AC, RTE]; required: [RTE] }
+    "does not terminate" |       // { permitted: [AC, TLE, RTE]; required: [RTE, TLE] }
+    "not accepted" |             // { required: [RTE, TLE, WA] }
+    "full score"                 // { fractional_score: 1.0 }
+
+#range: number | [number, number]
+#fractional_range: #fraction | [#fraction, #fraction]
+#fraction: float & >=0.0 & <=1.0
+```
+
+# Examples
+
+```yaml
+# Simple examples for some common cases
+
+a.py: accepted             # AC on all cases
+b.py: wrong answer         # at least one WA, otherwise AC
+c.py: time limit exceeded  # at least one TLE, otherwise AC
+d.py: runtime exception    # at least one RTE, otherwise AC
+e.py: does not terminate   # at least one RTE or TLE, but no WA
+f.py: not accepted         # at least one RTE, TLE, or WA
+g.py: full score           # gets max_score
+
+# Submissions are identified by prefix:
+
+wrong_answer/: wrong answer  # expectations "wrong answer" apply to "wrong_answer/th.py" etc.
+
+# Abbreviations are just shorthands for richer maps
+# of "required" and "permitted" keys.
+#
+# For instance, these are the same:
+
+th.py: accepted
+---
+th.py:
+  permitted: [AC]
+  required: [AC]
+---
+
+# A submission failed by the output validator on some testcase.
+# These are the same:
+
+wrong.py: wrong answer
+---
+wrong.py:
+  permitted: [WA, AC]
+  required: [WA]
+---
+wrong.py:
+  permitted:      # alternative yaml syntax for a list of strings
+    - WA
+    - AC
+  required: [WA]
+---
+
+# Specify that the submission fails, but passes the samples.
+# These are the same, using the same abbreviations as
+# above for "accepted" and "wrong answer"
+
+wrong.py:
+  sample: accepted
+  secret: wrong answer
+---
+wrong.py:
+  sample:
+    permitted: [AC]
+    required: [AC]
+  secret:
+    permitted: [AC, WA]
+    required: [WA]
+
+# Constraints apply to testcases in the entire subtree of cases that match the string:
+funky.cpp:
+  permitted: [AC, WA, RTE]
+  secret:
+    permitted: [AC, RTE, TLE]  # TLE is forbidden at the ancestor, so this makes no sense
+  secret/small: accepted       # more restrictive than the ancestor: this is fine
+
+# Specification for subgroups works "all the way down to the testcase",
+# though it's seldom needed:
+funky.cpp:
+  secret/huge_instances/disconnected_graph:
+    permitted: [RTE, TLE]
+
+# Can also specify a required judge message, not only verdicts
+linear_search.py:
+  judge_message: "too many rounds"  # matches judgemessage.txt as substring, case-insensitive
+
+# Digit regexes like `\d+` are allowed, which helps with auto-numbered groups:
+
+submission.py:
+  secret/\d+-group/: accepted  # matches 02-group
+
+#########
+# Scoring
+#########
+
+# simplest case:
+th.py: full score
+
+# Partial solutions can be given in various ways:
+partial.py: [50, 60]
+---
+partial.py: 60
+---
+partial.py:
+  score: 60
+---
+partial.py:
+  score: [50, 60]
+---
+partial.py:
+  fractional_score: [.5, .6]  # fraction of full score
+---
+# For subtasks, you probably want to specify the
+# outcome per subgroup. You need to be more verbose:
+partial.py:
+  secret/subtask1: full score
+  secret/subtask2: 0
+  secret/subtask3: 0
+---
+# Can be even more verbose about scores
+partial.py:
+  secret/subtask1: full score
+  secret/subtask2:
+    score: 13                   # absolute score on this group is exactly 13
+  secret/subtask3:              # between 10% and 40% of (full score for subtask 3)
+    fractional_score: [.1, .4]
+---
+# Can still specify testcases
+bruteforce.py:
+  secret/subtask1: full score   # subtask 1 has small instances
+  secret/subtask2:
+    score: 0                    # No points for this subtask
+    required: [TLE]             # ... because some testcase timed out
+    permitted: [AC, TLE]        # ... rather than any WAs
+---
+# The common abbreviations work here as well; you probably want to write this instead:
+bruteforce.py:
+  secret/subtask1: full score           # could write "accepted" as well in this case
+  secret/subtask2: time limit exceeded  # this is more informative than "0"
+```
diff --git a/readme.md b/readme.md
index feceb035b..dd42baa44 100644
--- a/readme.md
+++ b/readme.md
@@ -1,203 +1,3 @@
-# BAPCtools
+# Expectation Proposal Implemented in BAPCtools
-
-BAPCtools is a tool for creating and developing problems following the
-CLICS (DOMjudge/Kattis) problem format specified [here](https://icpc.io/problem-package-format/spec/problem_package_format).
-
-The aim of this tool is to run all necessary compilation, validation, and
-testing commands while working on an ICPC-style problem.
-Ideally one should never have to manually run any compilation or testing command themselves.
-
-I'm interested to know who's using this, so feel free to inform me (e.g. via an issue) if so ;)
-The current state is relatively stable, but things do change from time to
-time since I'm not aware of usage outside of BAPC yet.
-
-## Installation
-
-You can install the [bapctools-git AUR
-package](https://aur.archlinux.org/packages/bapctools-git/), mirrored
-[here](https://github.com/RagnarGrootKoerkamp/bapctools-git), or use the [Docker
-image](#Docker).
-
-Otherwise, clone this repository and install the required dependencies manually.
-(If you know how to make a Debian package, feel free to help out.) - -- Python 3 (>= 3.6). -- The [yaml library](https://pyyaml.org/wiki/PyYAMLDocumentation) via `pip install pyyaml` or the `python[3]-yaml` Arch Linux package. -- The [colorama library](https://pypi.org/project/colorama/) via `pip install colorama` or the `python[3]-colorama` Arch Linux package. -- The `argcomplete` library for command line argument completion. Install via - `python[3]-argcomplete`. - - - Note that actually using `argcomplete` is optional, but recommended. - Detailed instructions are [here](https://argcomplete.readthedocs.io/en/latest/). - - TL;DR: Put `eval "$(register-python-argcomplete[3] tools.py)"` in your `.bashrc` or `.zshrc`. - -Optional dependencies, required for some subcommands: - -- The [ruamel.yaml library](https://pypi.org/project/ruamel.yaml/) via `pip install ruamel.yaml` or the `python-ruamel-yaml` Arch Linux package (`python3-ruamel.yaml` on Debian derivatives). - - This is only needed for commands that update `generators.yaml`. -- The `latexmk` and `pdflatex` commands, provided by `texlive-bin` on Arch Linux and - potentially some specific LaTeX packages (like tikz) provided by - `texlive-extra`. - These are only needed for building `pdf` files, not for `run` and `validate` and such. -- The [matplotlib library](https://pypi.org/project/matplotlib/) via `pip install matplotlib` or the `python[3]-matplotlib` Linux package. - - This is optional and only used by the `solve_stats` command. -- The [requests library](https://pypi.org/project/requests/) via `pip install requests` or the `python[3]-requests` Linux package. - - This is optional and only used by the commands that call the DOMjudge API (`export`, `solutions --order-from-css`, and `solve_stats`) or the Slack API (`create_slack_channels` command). -- The [questionary library](https://pypi.org/project/questionary/) via `pip install questionary`. - - This is optional and only used by the `new_contest` and `new_problem` commands. - -After cloning the repository, symlink [bin/tools.py](bin/tools.py) to somewhere in your `$PATH`. E.g., if `~/bin/` is in your `$PATH`, you can do: - -``` -% ln -s ~/git/BAPCtools/bin/tools.py ~/bin/bt -``` - -### Windows - -For Windows, you'll need the following in your -`path`: - -- `Python` for Python 3 -- `g++` to compile C++ -- `javac` and `java` to compile and run `java`. - -Resource limits (memory limit/hard cpu time limit) are also not supported. - -BAPCtools makes use of symlinks for building programs. By default users are not allowed to create symlinks on Windows. -This can be fixed by enabling Developer Mode on Windows (Only since Windows 10, version 1703 or newer). - -### Docker - -A docker image containing this git repo and dependencies, together with commonly -used languages, is provided at -[ragnargrootkoerkamp/bacptools](https://hub.docker.com/r/ragnargrootkoerkamp/bapctools). -This version may be somewhat outdated. Ping me if you'd like it to be updated. - -This image can be used for e.g.: - -- running CI on your repo. Also see `bt gitlabci` which generates a - `.gitlab-ci.yaml` file. Make sure to clear the entrypoint, e.g. `entrypoint: [""]`. -- running `bt` on your local problems. Use this command to mount your local - directory into the docker image and run a command on it: - ``` - docker run -v $PWD:/data --rm -it ragnargrootkoerkamp/bapctools - ``` - -To update the image: - -``` -$ sudo systemctl start docker -$ docker pull archlinux:latest -$ docker login -$ docker build . 
-t ragnargrootkoerkamp/bapctools -$ docker push ragnargrootkoerkamp/bapctools -$ ssh sudo docker pull ragnargrootkoerkamp/bapctools -``` - -The last step is needed when your CI server is not automatically pulling the latest version. - -## Usage - -BAPCtools can be run either from a problem directory or a contest directory. This -is automatically detected by searching for the `problem.yaml` file. - -The most common commands and options to use on an existing repository are: - -- [`bt run [-v] [submissions [submissions ...]] [testcases [testcases ...]]`](#run) -- [`bt test [--interactive | --samples | [testcases [testcases ...]]]`](#test) -- [`bt generate [-v] [--jobs JOBS]`](#generate) -- [`bt validate [-v] [--remove | --move-to DIR] [testcases [testcases ...]]`](#validate) -- [`bt pdf [-v]`](#pdf) - -The list of all available commands and options is at [doc/commands.md#synopsis](doc/commands.md#synopsis), -and more information regarding the implementation is at [doc/implementation_notes.md](doc/implementation_notes.md). - -### Run - -- `bt run [-v] [submissions [submissions ...]] [testcases [testcases ...]]` - -Without arguments, the `run` command runs all submissions against all testcases. -Specify one or more submissions and one or more testcases to only run the given submissions against the given testcases. - -Before running the given submissions, this command first makes sure that all generated testcases are up to date (in case `generators/generators.yaml` was found). - -![run](doc/images/run.gif) - -By default, `bt run` only prints one summary line per submission, and one additional line for each testcase with an unexpected result. Use `-v` to print one line per testcase instead. - -![run -v](doc/images/run-v.gif) - -### Test - -- `bt test [--samples | [testcases [testcases ...]]]` - -Use the `test` command to run a single submission on some testcases. The submission `stdout` and `stderr` are printed to the terminal instead of verified as an answer file. -Use `--samples` to run on the samples, or pass a list of testcases or directories containing testcases. Use `--interactive`/`-i` to run in interactive mode, where console input is forwarded to the submission. -This rebuilds and reruns the program until either `control-C` or `control-D` is pressed. It's also possible to supply the test case on the command line directly using e.g. `< /path/to/file.in` or `<<< "10 20"`. - -![test](doc/images/test.png) - -### Generate - -- `bt generate [-v] [--jobs JOBS]` - -Use the `generate` command to generate the testcases specified in `generators/generators.yaml`. See [doc/generators.md](doc/generators.md) for the specification of `generators.yaml` and see [doc/commands.md#generate](doc/commands.md#generate) for the full list of arguents. -Use `-j 0` to disable running multiple jobs in parallel (the default is `4`). - -![generate](./doc/images/generate.gif) - -### Validate - -- `bt validate [-v] [--remove | --move-to DIR] [testcases [testcases ...]]` - -Validate all the `.in` and `.ans` for all (given) testcases. It runs all validators from `input_validators` and `output_validators`. - -Validators can be one of - -- a single-file program. -- a multi-file program with all files in a common directory. -- a .ctd CheckTestData file (this needs the `checktestdata` executable in your `$PATH`). -- a .viva file. - -You can use `--remove` to delete all failing testcases or `--move ` to move -them to a separate directory. 
- -![validator](./doc/images/validate.png) - -### Pdf - -- `bt pdf [-v]` - -Use this command to compile the `problem.en.pdf` from the `problem_statement/problem.en.tex` LaTeX statement. -`problem.en.pdf` is written to the problem directory itself. - -This can also be used to create the contest pdf by running it from the contest directory. - -## Personal configuration file - -For some command-line flags, it is convenient if they are always set to the same value, which differs per user -(e.g., `--username` or `--password` for commands that access a CCS like DOMjudge, -or `--jobs` to limit parallel execution) or per contest (e.g., which statement languages are used). -For this, you can create a configuration YAML file containing key-value pairs -in one of the following locations, from low to high priority: - -- `$XDG_CONFIG_HOME/bapctools/config.yaml` (Unix-ish systems, where `$XDG_CONFIG_HOME` usually is `~/.config`) -- `%AppData%/bapctools/config.yaml` (Windows systems) -- `/.bapctools.yaml` - -The keys in this config file can be any option that can be passed on the command-line. -Note that the keys should be written out in full (e.g., `username: jury` rather than `u: jury`) -and any hyphens should be replaced with an underscore (e.g., `no_bar: True` rather than `no-bar: True`). - -## Contributing / Style guide - -- The python code in the repository is formatted using [black](https://github.com/psf/black). - To enable the pre-commit hook, install [pre-commit](https://pre-commit.com/) - with `pip` or your package manager (Arch: `python-pre-commit`) and run - `pre-commit install` from the repository root. All python code will now automatically be formatted - on each commit. - -- Imports are usually ordered with system libraries first, followed by a - newline, followed by local includes. Both groups are sorted alphabetically, - and `import` comes before `from ... import`. +See [doc/expectations](doc/expectations.md) diff --git a/test/problems/hello/submissions/expectations.yaml b/test/problems/hello/submissions/expectations.yaml new file mode 100644 index 000000000..e25a890a4 --- /dev/null +++ b/test/problems/hello/submissions/expectations.yaml @@ -0,0 +1,7 @@ +accepted/: accepted +wrong_answer/: wrong answer +mixed/: + allowed: ["AC", "WA", "TLE", "RTE"] +rejected/: not accepted +run_time_error/: runtime exception +time_limit_exceeded/: time limit exceeded diff --git a/test/problems/spanishinquisition/generators/generators.yaml b/test/problems/spanishinquisition/generators/generators.yaml new file mode 100644 index 000000000..7aebebda4 --- /dev/null +++ b/test/problems/spanishinquisition/generators/generators.yaml @@ -0,0 +1,36 @@ +solution: /submissions/accepted/th.py +data: + sample: + data: + "2": + in: 1 1 + ans: "2" + secret: + data: + "3": + in: 1 2 + ans: "3" + inverse: + in: -1 1 + ans: "0" + neg: + data: + neg_five: + in: -5 3 + ans: "-2" + thirteen: + in: 10 3 + ans: "13" + fourteen: + in: 10 4 + ans: "14" + 031-numbered: + in: 3 4 + ans: "7" + + + + + + + diff --git a/test/problems/spanishinquisition/input_validators/validate.py b/test/problems/spanishinquisition/input_validators/validate.py new file mode 100644 index 000000000..a452ae032 --- /dev/null +++ b/test/problems/spanishinquisition/input_validators/validate.py @@ -0,0 +1,12 @@ +#! 
/usr/bin/env python3 + +import sys + +line = input().split() + +try: + x = int(line[0]) + y = int(line[1]) +except ValueError: + sys.exit(43) +sys.exit(42) diff --git a/test/problems/spanishinquisition/problem.yaml b/test/problems/spanishinquisition/problem.yaml new file mode 100644 index 000000000..648fefd1e --- /dev/null +++ b/test/problems/spanishinquisition/problem.yaml @@ -0,0 +1 @@ +name: Spanish Inquisition diff --git a/test/problems/spanishinquisition/problem_statement/problem.en.tex b/test/problems/spanishinquisition/problem_statement/problem.en.tex new file mode 100644 index 000000000..53c65a9ef --- /dev/null +++ b/test/problems/spanishinquisition/problem_statement/problem.en.tex @@ -0,0 +1 @@ +I didn't expect this. diff --git a/test/problems/spanishinquisition/submissions/accepted/bad-wa.py b/test/problems/spanishinquisition/submissions/accepted/bad-wa.py new file mode 100644 index 000000000..a3a8bbdc4 --- /dev/null +++ b/test/problems/spanishinquisition/submissions/accepted/bad-wa.py @@ -0,0 +1,3 @@ +#! /usr/bin/env python3 + +print(abs(sum(map(int, input().split())))) diff --git a/test/problems/spanishinquisition/submissions/accepted/th.py b/test/problems/spanishinquisition/submissions/accepted/th.py new file mode 100644 index 000000000..350cab154 --- /dev/null +++ b/test/problems/spanishinquisition/submissions/accepted/th.py @@ -0,0 +1,3 @@ +#! /usr/bin/env python3 + +print(sum(map(int, input().split()))) diff --git a/test/problems/spanishinquisition/submissions/expectations.yaml b/test/problems/spanishinquisition/submissions/expectations.yaml new file mode 100644 index 000000000..8045f98de --- /dev/null +++ b/test/problems/spanishinquisition/submissions/expectations.yaml @@ -0,0 +1,42 @@ +# These are just shorthands and re-implement (correctly) +# the directories of the problem package specification. + +accepted/: accepted +wrong_answer/: wrong answer +time_limit_exceeded: time limit exceeded + +# You can go beyond that and specify expectation +# per subgroup instead: + +wa_sample_ac/: + sample: accepted + secret: wrong answer + +# Use explicit permissions and requirements instead of the shorthands: + +mixed/allverdicts.py: + secret/3: + permitted: [WA] + secret/fourteen: + permitted: [AC] + secret/negative_ans/: + permitted: [TLE] + secret/: + required: [RTE] + +# Patterns match as regular expressions from the start: + +mixed/seven.sh: + secret/\d+-numbered: accepted # matches testcase secret/031-numbered +mixed/bad-eight.sh: + secret/\d+-numbered: accepted # matches testcase secret/031-numbered (and will fail) + +mixed/superstitious.py: # for this submission + secret/thirteen: # ... on testcases that match this pattern + permitted: [WA] # ... permit exactly the verdict WA, nothing else + secret/(?!thirteen): accepted # everywhere else: accept + +mixed/bad-fourteen.py: + secret/thirteen: + permitted: [WA] + secret/(?!thirteen): accepted diff --git a/test/problems/spanishinquisition/submissions/mixed/allverdicts.py b/test/problems/spanishinquisition/submissions/mixed/allverdicts.py new file mode 100644 index 000000000..bd956024b --- /dev/null +++ b/test/problems/spanishinquisition/submissions/mixed/allverdicts.py @@ -0,0 +1,10 @@ +#! 
/usr/bin/env python3 +""" This submission produces get *all* verdicts on various inputs.""" + +x, y = map(int, input().split()) +result = 0 +while (x + y) // 2 != result // 2: # WA if result is odd + result += 1 +# TLE if result is negative +assert result != 0 # RTEs for result 0 +print(result) diff --git a/test/problems/spanishinquisition/submissions/mixed/bad-eight.sh b/test/problems/spanishinquisition/submissions/mixed/bad-eight.sh new file mode 100644 index 000000000..93f77f6a8 --- /dev/null +++ b/test/problems/spanishinquisition/submissions/mixed/bad-eight.sh @@ -0,0 +1 @@ +echo 8 diff --git a/test/problems/spanishinquisition/submissions/mixed/bad-fourteen.py b/test/problems/spanishinquisition/submissions/mixed/bad-fourteen.py new file mode 100644 index 000000000..68952e4a0 --- /dev/null +++ b/test/problems/spanishinquisition/submissions/mixed/bad-fourteen.py @@ -0,0 +1,12 @@ +#! /usr/bin/env python3 +""" Irrationally afraid of 13, and (as expected) + fails on corresponding inputs. However, + also (unexpectedly) fails on 14. +""" + +x, y = map(int, input().split()) +result = x + y +if 12 < result < 15: + print(12) +else: + print(result) diff --git a/test/problems/spanishinquisition/submissions/mixed/seven.sh b/test/problems/spanishinquisition/submissions/mixed/seven.sh new file mode 100644 index 000000000..65c166890 --- /dev/null +++ b/test/problems/spanishinquisition/submissions/mixed/seven.sh @@ -0,0 +1 @@ +echo 7 diff --git a/test/problems/spanishinquisition/submissions/mixed/superstitious.py b/test/problems/spanishinquisition/submissions/mixed/superstitious.py new file mode 100644 index 000000000..70de8bd1b --- /dev/null +++ b/test/problems/spanishinquisition/submissions/mixed/superstitious.py @@ -0,0 +1,11 @@ +#! /usr/bin/env python3 +""" Irrationally afraid of the integer between 12 and 14 and therefore fails on + inputs where that's the correct answer. +""" + +x, y = map(int, input().split()) +result = x + y +if 12 < result < 14: + print(12) +else: + print(result) diff --git a/test/problems/spanishinquisition/submissions/time_limit_exceeded/bad-tle-wa.py b/test/problems/spanishinquisition/submissions/time_limit_exceeded/bad-tle-wa.py new file mode 100644 index 000000000..740c8f132 --- /dev/null +++ b/test/problems/spanishinquisition/submissions/time_limit_exceeded/bad-tle-wa.py @@ -0,0 +1,10 @@ +#! /usr/bin/env python3 +""" This is wrong exactly when the two inputs are different and runs + forever exactly if the sum is negative. +""" + +inputs = set(map(int, input().split())) +result = 0 +while result != sum(inputs): + result += 1 +print(result) diff --git a/test/problems/spanishinquisition/submissions/wa_sample_ac/bad-five.py b/test/problems/spanishinquisition/submissions/wa_sample_ac/bad-five.py new file mode 100644 index 000000000..c28cd73d0 --- /dev/null +++ b/test/problems/spanishinquisition/submissions/wa_sample_ac/bad-five.py @@ -0,0 +1,2 @@ +#! 
/usr/bin/env python3 +print(5) diff --git a/test/problems/spanishinquisition/submissions/wa_sample_ac/two.py b/test/problems/spanishinquisition/submissions/wa_sample_ac/two.py new file mode 100644 index 000000000..d0e0fd661 --- /dev/null +++ b/test/problems/spanishinquisition/submissions/wa_sample_ac/two.py @@ -0,0 +1 @@ +print(2) diff --git a/test/problems/spanishinquisition/submissions/wrong_answer/as_set.py b/test/problems/spanishinquisition/submissions/wrong_answer/as_set.py new file mode 100644 index 000000000..4ea325e21 --- /dev/null +++ b/test/problems/spanishinquisition/submissions/wrong_answer/as_set.py @@ -0,0 +1,5 @@ +#! /usr/bin/env python3 +""" This is wrong exactly when the two inputs are different """ + +inputs = set(map(int, input().split())) +print(sum(inputs)) diff --git a/test/problems/spanishinquisition/submissions/wrong_answer/bad-ac.py b/test/problems/spanishinquisition/submissions/wrong_answer/bad-ac.py new file mode 100644 index 000000000..350cab154 --- /dev/null +++ b/test/problems/spanishinquisition/submissions/wrong_answer/bad-ac.py @@ -0,0 +1,3 @@ +#! /usr/bin/env python3 + +print(sum(map(int, input().split()))) diff --git a/test/problems/spanishinquisition/submissions/wrong_answer/bad-search.py b/test/problems/spanishinquisition/submissions/wrong_answer/bad-search.py new file mode 100644 index 000000000..625c2e91f --- /dev/null +++ b/test/problems/spanishinquisition/submissions/wrong_answer/bad-search.py @@ -0,0 +1,8 @@ +#! /usr/bin/env python3 + +x, y = map(int, input().split()) + +result = 0 +while result != x + y: + result += 1 +print(result) diff --git a/test/problems/spanishinquisition/submissions/wrong_answer/wa.py b/test/problems/spanishinquisition/submissions/wrong_answer/wa.py new file mode 100644 index 000000000..ea1aa6e41 --- /dev/null +++ b/test/problems/spanishinquisition/submissions/wrong_answer/wa.py @@ -0,0 +1,2 @@ +#! /usr/bin/env python3 +print(abs(sum(map(int, input().split()))))
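
As an illustration (not part of the patch above): a minimal sketch of how the new `expectations.Registry` could be exercised by hand against a problem's `submissions/expectations.yaml`. It assumes `bin/` is on `PYTHONPATH` so that the `expectations` module from this diff is importable and that PyYAML is installed; the submission path and verdicts below are made up for the example.

```python
#!/usr/bin/env python3
"""Illustrative only: load an expectations.yaml and check hand-written results against it."""
from pathlib import Path

import yaml  # PyYAML, or any YAML loader that produces plain dicts

import expectations  # bin/expectations.py introduced in this diff

# Parse the problem's expectations file into a Registry.
with open("submissions/expectations.yaml") as f:
    registry = expectations.Registry.from_dict(yaml.safe_load(f))

# Hypothetical verdicts for one submission, keyed by test case path.
results = {"sample/1": "AC", "secret/1": "WA", "secret/2": "AC"}

# Restrict the registry to the patterns that match this submission.
sub = registry.for_path(Path("wrong_answer/th.py"))

# True iff all permissions hold and every requirement is met.
print(sub.check(results))

# List any requirements that are still missing, similar to what run.py reports.
for prefix, pattern, verdicts in sub.unsatisfied_requirements(results):
    print(f"{prefix}: {pattern!r} still needs one of {sorted(verdicts)}")
```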