diff --git a/bin/expectations.py b/bin/expectations.py
new file mode 100644
index 000000000..7337f5515
--- /dev/null
+++ b/bin/expectations.py
@@ -0,0 +1,409 @@
+"""Expectations for a submission
+
+Here is a sample expectations.yaml file:
+
+    accepted/: accepted          # Every submission in accepted/* should be accepted
+    wrong_answer/th.py:          # This particular submission ...
+      sample: accepted           # ... should be accepted on sample
+      secret: wrong answer       # ... but fail with WA on some test case in secret
+    mixed/failing.java:          # For this particular submission, ...
+      secret/huge/graph07:       # ... on this particular test case ...
+        permitted: [TLE, RTE]    # ... only TLE and RTE are permitted
+
+A yaml parser will turn this into a dict that can be fed to the Registry class:
+
+>>> exp_dict = {
+...     "accepted/": "accepted",
+...     "wrong_answer/th.py": {"sample": "accepted", "secret": "wrong answer"},
+...     "mixed/failing.java": {"secret/huge/graph07": {"permitted": ["TLE", "RTE"]}},
+...     "mixed/": {"sample": "accepted"}
+... }
+
+
+>>> registry = Registry.from_dict(exp_dict)
+>>> registry['mixed/']
+'sample': {permitted: {AC}, required: None}
+
+Expectations for a single submission can now be extracted from
+the registry. Here, the submission `mixed/failing.java` matches two patterns,
+so those will be the expectations that apply to that submission.
+
+>>> sub_registry = registry.for_path(Path("mixed/failing.java"))
+>>> sorted(sub_registry.keys())
+['mixed/', 'mixed/failing.java']
+
+Expectations for a submission can be compared with actual validation
+results. This runs all (in this case, both) sets of expectations
+against the results.
+
+>>> results_ac = { "sample/1": "AC", "secret/1": "AC", "secret/2": "AC" }
+>>> results_wa = { "sample/1": "WA", "secret/1": "AC", "secret/2": "WA" }
+>>> sub_registry.check(results_ac)
+True
+>>> sub_registry.check(results_wa)
+False
+
+
+Alternatively, supply a submission path to check the submission and results
+directly against the expectations registry.
+
+>>> registry.for_path(Path("accepted/ragnar.cpp")).check(results_ac)
+True
+>>> registry.for_path(Path("accepted/ragnar.cpp")).check(results_wa)
+False
+>>> registry.for_path(Path("wrong_answer/th.py")).check(results_wa)
+False
+>>> results_wa_secret = { "sample/1": "AC", "secret/1": "AC", "secret/2": "WA" }
+>>> registry.for_path(Path("wrong_answer/th.py")).check(results_wa_secret)
+True
+
+Expectations whose test case patterns match none of the results are vacuously satisfied:
+>>> registry.for_path(Path("mixed/failing.java")).check(results_wa_secret)
+True
+
+Terminology
+-----------
+
+verdict
+    A testcase can have a verdict, which is any of 'AC', 'WA', 'RTE', 'TLE'.
+    (Note that the verdict 'JE' is never expected.)
+
+result
+    a verdict for a path representing a testcase, like "TLE" for "secret/huge/random-01"
+
+score
+    A finite number, often just an integer in the range {0, ..., 100}, but can be a float.
+    NOT IMPLEMENTED
+
+range
+    A string of two space-separated numbers, like '0 30' or '-inf 43' or '3.14 3.14';
+    a one-value range can be abbreviated: '5' is the range '5 5'.
+ NOT IMPLEMENTED +""" + +from pathlib import Path +import re + + +class TestCasePattern(str): + """A pattern that matches against testgroups and -cases.""" + + def __new__(cls, content): + if content != "" and not content.startswith("sample") and not content.startswith("secret"): + raise ValueError(f"Unexpected test case pattern {content}") + return super().__new__(cls, content) + + +class BaseExpectations: + """Base expectations.""" + + def __init__(self, expectations: str | list[int | float] | dict): + self._permitted_verdicts: set[str] | None = None + self._required_verdicts: set[str] | None = None + + if isinstance(expectations, str): + self._set_common(expectations) + elif isinstance(expectations, list): + raise ValueError("Range expecations not implemented") + elif isinstance(expectations, dict): + for k, val in expectations.items(): + if k == "permitted": + self._permitted_verdicts = val if isinstance(val, set) else set(val) + elif k == "required": + self._required_verdicts = val if isinstance(val, set) else set(val) + elif k in ["judge_message", "score", "fractional_score"]: + raise ValueError(f"Key {k} not implemented") + else: + raise ValueError(f"Unrecognised key {k}") + + def permitted_verdicts(self) -> set[str]: + """Returns a set of verdicts.""" + return self._permitted_verdicts or set(["AC", "WA", "TLE", "RTE"]) + + def required_verdicts(self) -> set[str]: + """Returns a set of verdicts.""" + return self._required_verdicts or set() + + def _set_common(self, abbreviation): + permissions = None + requirements = None + if abbreviation == "accepted": + permissions = set(["AC"]) + elif abbreviation == "wrong answer": + permissions = set(["AC", "WA"]) + requirements = set(["WA"]) + elif abbreviation == "time limit exceeded": + permissions = set(["AC", "TLE"]) + requirements = set(["TLE"]) + elif abbreviation == "runtime exception": + permissions = set(["AC", "RTE"]) + requirements = set(["RTE"]) + elif abbreviation == "does not terminate": + permissions = set(["AC", "RTE", "TLE"]) + requirements = set(["RTE", "TLE"]) + elif abbreviation == "not accepted": + requirements = set(["RTE", "TLE", "WA"]) + else: + assert False, f"unknown abbreviation {abbreviation}" + if permissions is not None: + self._permitted_verdicts = permissions + if requirements is not None: + self._required_verdicts = requirements + + def __repr__(self): + + def sorted_set_str(verdicts: set|None) -> str: + if verdicts is None: + return "None" + else: + return "{" + ", ".join(sorted(verdicts)) + "}" + + + return (f"permitted: {sorted_set_str(self._permitted_verdicts)}, " + + f"required: {sorted_set_str(self._required_verdicts)}") + + +class Expectations(dict[TestCasePattern, BaseExpectations]): + """The expectations for a submission pattern; it maps testcase patterns + to BaseExpectations. 
+ + >>> e = Expectations("accepted") + >>> e + '': {permitted: {AC}, required: None} + >>> e.permitted_verdicts_for_testcase(Path("sample/1")) + {'AC'} + + Specify expectations by testgroup: + + >>> f = Expectations({'': 'wrong answer', 'sample': 'accepted', 'secret': 'wrong answer'}) + >>> f['sample'] + permitted: {AC}, required: None + + Or by testcase + >>> list(sorted(f.for_testcase('sample/1').keys())) + ['', 'sample'] + """ + + def __init__(self, expectations: str | list[int | float] | dict): + """ + Arguments + --------- + + expectations + list of common expectations, or range, or map + """ + + self.data: dict[str, BaseExpectations] = dict() + + if not isinstance(expectations, dict): + expectations = {"": expectations} + for k, val in expectations.items(): + if not (k == "" or k.startswith("sample") or k.startswith("secret")): + raise ValueError(f"Unexpected test data pattern: {k}") + self[TestCasePattern(k)] = BaseExpectations(val) + + def for_testcase(self, path: Path) -> dict[TestCasePattern, BaseExpectations]: + """Returns a dictionary over the patterns that apply for the given test case path. + + >>> e = Expectations( {'secret': { 'permitted': ['AC', 'TLE', 'WA']}, + ... 'secret/(tc)?[0-9]+-huge': { 'permitted': ['TLE'] }, + ... 'secret/[0-9]+-disconnected': { 'permitted': ['WA'] }}) + >>> list(sorted(e.for_testcase("secret/tc05-huge").keys())) + ['secret', 'secret/(tc)?[0-9]+-huge'] + >>> list(sorted(e.for_testcase("secret/05-disconnected").keys())) + ['secret', 'secret/[0-9]+-disconnected'] + >>> list(sorted(e.for_testcase("secret/abc-disconnected").keys())) + ['secret'] + """ + + return { + pattern: expectations + for pattern, expectations in self.items() + if re.match(pattern, str(path)) + } + + def permitted_verdicts_for_testcase(self, path: Path) -> set[str]: + """Returns a set of verdicts that is permitted at the given test case path. + + Permissions are restrictions, so that if several permissions apply, + their *intersection* is permitted + + >>> e = Expectations( {'secret': { 'permitted': ['AC', 'TLE']}, + ... 'secret/foo': { 'permitted': ['RTE', 'TLE'] }}) + >>> e.permitted_verdicts_for_testcase("secret/foo") + {'TLE'} + """ + permitted_verdicts = set(["AC", "TLE", "WA", "RTE"]) + for exp in self.for_testcase(path).values(): + permitted_verdicts &= exp.permitted_verdicts() + return permitted_verdicts + + def is_permitted(self, verdict: str, path: Path): + """Is the result permitted for the testcase at the given path? + + Accepts verdicts in long form. (Maybe it shouldn't.) + """ + return verdict in self.permitted_verdicts_for_testcase(path) + + def missing_required_verdicts( + self, verdict_for_testcase: dict[Path, str] + ) -> dict[TestCasePattern, set[str]]: + """Which verdicts are missing? + + Returns a map of expectation patterns to sets of verdicts. 
+
+        >>> e = Expectations("does not terminate")
+        >>> results = {"sample/1": "AC", "secret/1": "AC", "secret/2": "WA"}
+        >>> e.missing_required_verdicts(results) == {'': {'RTE', 'TLE'}}
+        True
+        >>> results = {"sample/1": "AC", "secret/1": "TLE", "secret/2": "WA"}
+        >>> e.missing_required_verdicts(results)
+        {}
+        """
+
+        missing = dict()
+        for tcpattern, exp in self.items():
+            if not exp.required_verdicts():
+                continue
+            for testcase, verdict in verdict_for_testcase.items():
+                if re.match(tcpattern, str(testcase)) and verdict in exp.required_verdicts():
+                    break
+            else:
+                missing[tcpattern] = exp.required_verdicts()
+        return missing
+
+    def is_satisfied_by(self, results: dict[Path, str]) -> bool:
+        """Are all requirements satisfied?"""
+        missing = self.missing_required_verdicts(results)
+        return all(self.is_permitted(results[path], path) for path in results) and all(
+            not missing_verdict for missing_verdict in missing.values()
+        )
+
+    def __repr__(self):
+        return ', '.join(f"'{k}': {{{repr(v)}}}" for k, v in self.items())
+
+
+class Registry(dict[str, Expectations]):
+    """A dictionary-like class that maps submission patterns to expectations."""
+
+    @staticmethod
+    def from_dict(dictionary):
+        """Factory method."""
+        return Registry({k: Expectations(v) for k, v in dictionary.items()})
+
+    def for_path(self, path: Path):
+        """Return a restricted Registry where all patterns
+        match the given path.
+
+        >>> registry = Registry({
+        ...     'accepted': Expectations('accepted'),
+        ...     'accepted/th': Expectations({'sample': 'accepted'}),
+        ...     'wrong_answer': Expectations('wrong answer')
+        ...     })
+        >>> for k, v in registry.for_path(Path('accepted/th.py')).items():
+        ...     print(k, ":", v)
+        accepted : '': {permitted: {AC}, required: None}
+        accepted/th : 'sample': {permitted: {AC}, required: None}
+
+
+        A registry is just a dict; you can add more expectations to it
+        with the normal syntax:
+
+        >>> registry['wrong_answer/greedy.py'] = Expectations({'sample': 'accepted'})
+        >>> for k, v in registry.for_path(Path('wrong_answer/greedy.py')).items():
+        ...     print(k, ":", v)
+        wrong_answer : '': {permitted: {AC, WA}, required: {WA}}
+        wrong_answer/greedy.py : 'sample': {permitted: {AC}, required: None}
+
+        path:
+            a pathlib.Path to a submission
+        """
+        return Registry(
+            {
+                pattern: expectation
+                for pattern, expectation in self.items()
+                if re.match(pattern, str(path))
+            }
+        )
+
+    def is_permitted(self, verdict, testcase: Path) -> bool:
+        """Is the verdict permitted at the given testcase path
+        by every expectation in this registry?"""
+
+        return all(e.is_permitted(verdict, testcase) for e in self.values())
+
+    def violated_permissions(
+        self, verdict, testcase: Path
+    ) -> list[tuple[str, TestCasePattern, set[str]]]:
+        """Which permissions are violated by the given verdict for the given testcase?
+
+        Return:
+            A list of tuples; each tuple consists of
+            - the submission pattern
+            - the test case pattern
+            - the set of verdicts that was permitted
+            The list is sorted; in the typical case this means that less
+            specific rules come first.
+        """
+        violations = []
+        for prefix, expectation in self.items():
+            for pattern, base in expectation.for_testcase(testcase).items():
+                permitted_verdicts = base.permitted_verdicts()
+                if verdict in permitted_verdicts:
+                    continue
+                violations.append((prefix, pattern, permitted_verdicts))
+        return list(sorted(violations))
+
+    def unsatisfied_requirements(
+        self, verdict_for_testcase: dict[Path, str]
+    ) -> list[tuple[str, TestCasePattern, set[str]]]:
+        """Which requirements are not satisfied by the given results?
+ + Paramters: + verdict_for_testcase: + a mapping of testcase path to verdict + + Return: + A list of tuples; each tuple consists of + - the submissions pattern + - the test case pattern + - the set of verdicts that was required + The list is sorted; in the typical case this means that less + specific rules come first. + """ + missing = [] + for prefix, expectations in self.items(): + missing_verdicts = expectations.missing_required_verdicts(verdict_for_testcase) + for pattern, verdicts in missing_verdicts.items(): + missing.append((prefix, pattern, verdicts)) + + return missing + + def check(self, results) -> bool: + """Do the results satisfy all the expectations? + + Note that expectations compose in different ways; + permissions are subtractive, requirements additive. + + >>> registry = Registry( + ... a= Expectations({"sample": { 'permitted': ['AC', 'WA']}}), + ... b= Expectations({"sample": { 'permitted': ['AC', 'TLE']}}) + ... ) + >>> for v in ['AC', 'TLE', 'WA']: + ... result = {'sample': v } + ... print(f"{v}:", registry.check(result)) + AC: True + TLE: False + WA: False + + Typically, the expectations registered for a submission have + patterns like `secret` and `secret/huge` rather than mutually + exclusive `a` and `b`, and then this mechanism allows increasingly + fine-grained specification. + """ + return all(e.is_satisfied_by(results) for e in self.values()) + + +if __name__ == "__main__": + import doctest + + doctest.testmod() diff --git a/bin/problem.py b/bin/problem.py index 9dd398695..46daf4b8d 100644 --- a/bin/problem.py +++ b/bin/problem.py @@ -5,8 +5,10 @@ import sys from pathlib import Path +from functools import lru_cache import config +import expectations import parallel import program import run @@ -37,6 +39,7 @@ def __init__(self, path, tmpdir, label=None): self._program_callbacks = dict() # Dictionary from path to parsed file contents. self._testdata_yamls = dict() + self._expectations_registry = None # The label for the problem: A, B, A1, A2, X, ... self.label = label @@ -438,6 +441,23 @@ def build_program(p): problem._validators[key] = validators return validators + + # TODO Q from Thore: ok to use self here instead of problem? + def get_expectations_registry(self): + """ Parse yaml file (if any) describing the expectations for this problem. + """ + if self._expectations_registry is None: + path = self.path / 'submissions' / 'expectations.yaml' + if has_ryaml: + try: + yamldata = read_yaml_settings(path) + except ruamel.yaml.scanner.ScannerError: + fatal('Make sure problem.yaml does not contain any more {% ... %}.') + else: + yamldata = read_yaml_settings(path) + self._expectations_registry = expectations.Registry.from_dict(yamldata) + return self._expectations_registry + def run_submissions(problem): needans = False if problem.interactive else True diff --git a/bin/run.py b/bin/run.py index cb98e7384..7caba61c4 100644 --- a/bin/run.py +++ b/bin/run.py @@ -328,7 +328,7 @@ def __init__(self, problem, path, skip_double_build_warning=False): # The first element will match the directory the file is in, if possible. self.expected_verdicts = self._get_expected_verdicts() - + self.expectations = self.problem.get_expectations_registry().for_path(self.short_path) # NOTE: Judging of interactive problems on systems without `os.wait4` is # suboptimal because we cannot determine which of the submission and # interactor exits first. 
Thus, we don't distinguish the different non-AC @@ -400,9 +400,10 @@ def _get_expected_verdicts(self): verdicts = [subdir] else: if len(verdicts) == 0: - error( - f'Submission {self.short_path} must have @EXPECTED_RESULTS@. Defaulting to ACCEPTED.' - ) + pass # TODO (Thore): made this shut up! + #error( + # f'Submission {self.short_path} must have @EXPECTED_RESULTS@. Defaulting to ACCEPTED.' + #) if len(verdicts) == 0: verdicts = ['ACCEPTED'] @@ -452,9 +453,10 @@ def run_all_testcases( verdict = (-100, 'ACCEPTED', 'ACCEPTED', 0) # priority, verdict, print_verdict, duration verdict_run = None + verdict_for_testcase = dict() def process_run(run, p): - nonlocal max_duration, verdict, verdict_run + nonlocal max_duration, verdict, verdict_run, verdict_for_testcase localbar = bar.start(run) result = run.run() @@ -476,7 +478,10 @@ def process_run(run, p): if table_dict is not None: table_dict[run.name] = result.verdict == 'ACCEPTED' - got_expected = result.verdict in ['ACCEPTED'] + self.expected_verdicts + verdict_short = short_verdict(result.verdict) + verdict_for_testcase[run.name] = verdict_short + #got_expected = result.verdict in ['ACCEPTED'] + self.expected_verdicts + got_expected = self.expectations.is_permitted(verdict_short, run.name) # Print stderr whenever something is printed if result.out and result.err: @@ -514,8 +519,17 @@ def process_run(run, p): data += '\n' data += f'{f.name}:' + localbar._format_data(t) + '\n' + if not got_expected: + localbar.error(f'{result.duration:6.3f}s {result.print_verdict()}', data) + short = short_verdict(result.verdict) + for prefix, pattern, verdicts in self.expectations.violated_permissions(short, run.name): + prefix = (f'{Fore.CYAN}{prefix:>{len(localbar.prefix)}}{Style.RESET_ALL}:' + + f'{pattern:<{localbar.item_width}}') + localbar.warn(f"permits {verbose_verdicts(verdicts)}", prefix=prefix) + localbar.done(got_expected, f'{result.duration:6.3f}s {result.print_verdict()}', data) + # Lazy judging: stop on the first error when not in verbose mode. if ( not config.args.verbose and not config.args.table @@ -534,16 +548,22 @@ def process_run(run, p): self.print_verdict = verdict[2] self.duration = max_duration + # Check presence of required verdicts among testgroups + for prefix, pattern, verdicts in self.expectations.unsatisfied_requirements(verdict_for_testcase): + prefix = (f'{Fore.CYAN}{prefix:>{len(bar.prefix)}}{Style.RESET_ALL}: ' + + f'{pattern:<{bar.item_width}}') + bar.warn(f"no test case got {verbose_verdicts(verdicts)}", prefix=prefix) + # Use a bold summary line if things were printed before. 
         if bar.logged:
             color = (
                 Style.BRIGHT + Fore.GREEN
-                if self.verdict in self.expected_verdicts
+                if self.expectations.is_permitted(short_verdict(self.verdict), Path())
                 else Style.BRIGHT + Fore.RED
             )
             boldcolor = Style.BRIGHT
         else:
-            color = Fore.GREEN if self.verdict in self.expected_verdicts else Fore.RED
+            color = Fore.GREEN if self.expectations.is_permitted(short_verdict(self.verdict), Path()) else Fore.RED
             boldcolor = ''
 
         printed_newline = bar.finalize(
diff --git a/bin/util.py b/bin/util.py
index 29bbaaf63..f6bbd63cb 100644
--- a/bin/util.py
+++ b/bin/util.py
@@ -186,6 +186,7 @@ def clearline(self):
             return
         print(self.carriage_return, end='', flush=True, file=sys.stderr)
 
+    @staticmethod
     def action(prefix, item, width=None, total_width=None):
         if width is not None and total_width is not None and len(prefix) + 2 + width > total_width:
             width = total_width - len(prefix) - 2
@@ -273,7 +274,7 @@ def _format_data(data):
 
     # Log can be called multiple times to make multiple persistent lines.
     # Make sure that the message does not end in a newline.
-    def log(self, message='', data='', color=Fore.GREEN, *, resume=True):
+    def log(self, message='', data='', color=Fore.GREEN, *, resume=True, prefix=None):
         with self.lock:
             if message is None:
                 message = ''
@@ -292,7 +293,7 @@ def log(self, message='', data='', color=Fore.GREEN, *, resume=True):
 
                 self.needs_leading_newline = False
             print(
-                self.get_prefix(),
+                self.get_prefix() if prefix is None else prefix,
                 color,
                 message,
                 ProgressBar._format_data(data),
@@ -313,9 +314,9 @@ def debug(self, message, data=''):
         if config.args.verbose:
             self.log(message, data)
 
-    def warn(self, message='', data=''):
+    def warn(self, message='', data='', prefix=None):
         config.n_warn += 1
-        self.log(message, data, Fore.YELLOW)
+        self.log(message, data, Fore.YELLOW, prefix=prefix)
 
     # Error removes the current item from the in_progress set.
     def error(self, message='', data=''):
@@ -981,3 +982,33 @@ def combine_hashes_dict(d):
         if d[key] is not None:
             hasher.update(d[key].encode())
     return hasher.hexdigest()
+
+
+def short_verdict(verdict):
+    return {
+        'ACCEPTED': 'AC',
+        'WRONG_ANSWER': 'WA',
+        'RUN_TIME_ERROR': 'RTE',
+        'TLE (aborted)': 'TLE',
+        'TIME_LIMIT_EXCEEDED': 'TLE'
+    }[verdict]
+
+def verbose_verdicts(verdicts: str | set[str], oxford_comma=True) -> str:
+    long_form = {
+        'AC': 'ACCEPTED',
+        'WA': 'WRONG_ANSWER',
+        'RTE': 'RUN_TIME_ERROR',
+        'TLE': 'TLE (aborted)'
+    }
+    if isinstance(verdicts, str):
+        verdicts = {verdicts}
+    verdicts = list(sorted(verdicts))
+    if len(verdicts) == 1:
+        return long_form[verdicts[0]]
+    elif len(verdicts) == 2:
+        return f"{long_form[verdicts[0]]} or {long_form[verdicts[1]]}"
+    else:
+        assert len(verdicts) == 3, len(verdicts)
+        return f"{long_form[verdicts[0]]}, {long_form[verdicts[1]]}, or {long_form[verdicts[2]]}"
+
+
diff --git a/doc/expectations.md b/doc/expectations.md
new file mode 100644
index 000000000..9aef737b6
--- /dev/null
+++ b/doc/expectations.md
@@ -0,0 +1,279 @@
+# Expectations
+
+This framework allows problem authors to express their expectations for the behaviour of a submission on the test data.
+
+## Test Case Verdict
+
+The behaviour of a submission on a _single_ test case is summarised in a *verdict*.
+
+The verdicts are:
+
+* AC: Accepted. The submission terminates successfully within the time limit and the output validator accepts the submission output.
+* WA: The submission terminates successfully within the time limit, but the output validator rejects the submission output on this test case.
+* TLE: The submission does not terminate within the time limit.
+* RTE: The submission aborts within the time limit with a runtime error.
+
+
+## Common Expectations for a Submission
+
+
+The expected behaviour of a submission on the test data often falls into a number of common classes:
+
+* accepted: Every test case is `AC`.
+* wrong answer: Every test case receives `AC` or `WA`; _some_ test case receives `WA`.
+* time limit exceeded: Every test case receives `AC` or `TLE`; _some_ test case receives `TLE`.
+* runtime exception: Every test case receives `AC` or `RTE`; _some_ test case receives `RTE`.
+* does not terminate: Every test case receives `AC`, `RTE`, or `TLE` (but not `WA`); _some_ test case receives `RTE` or `TLE`.
+* not accepted: Not every test case receives `AC`. This is the complement of _accepted_.
+
+In general, an expectation consists of a set of _permitted_ verdicts and a set of _required_ verdicts.
+
+* Every test case must receive one of the permitted verdicts. If no permitted verdicts are specified, _all_ verdicts are permitted.
+* Some test case must receive one of the required verdicts. If no required verdicts are specified, _no_ verdict is required.
+
+Thus, the common expectations above can be spelt out in terms of lists of verdicts.
+For instance, for the submission `mysubmission.py`:
+
+```yaml
+mysubmission.py: accepted
+```
+
+is the same as
+
+```yaml
+mysubmission.py:
+  permitted: [AC]
+```
+Similarly,
+
+```yaml
+mysubmission.py: time limit exceeded
+```
+
+is the same as
+
+```yaml
+mysubmission.py:
+  permitted: [AC, TLE]
+  required: [TLE]
+```
+
+## Specifying Expectations
+
+Expectations can be provided for a group of submissions or for a single submission in
+the file `/submissions/expectations.yaml`.
+Submission patterns match by prefix, so it is easy to specify the expected behaviour of submissions by placing them into various subdirectories of `/submissions`.
+The conventional directory layout is specified like this:
+
+```yaml
+accepted: accepted
+wrong_answer: wrong answer
+time_limit_exceeded: time limit exceeded
+runtime_exception: runtime exception
+```
+This would associate the expectation “accepted” with the submission `/submissions/accepted/mysubmission.cpp`.
+The flexibility of the expectations framework is that it is agnostic about directory names; for instance, you can put your crashing submissions in `/submissions/run_time_error/` and put other requirements on the submissions in `/submissions/mixed/`:
+
+```yaml
+run_time_error: runtime exception
+mixed:
+  permitted: [AC, WA, TLE]
+  required: [WA]
+```
+
+## Specification per Submission
+
+Submission patterns are matched by prefix, so instead of directories you can specify individual submissions:
+
+```yaml
+mixed/alice.py:
+  permitted: [AC, WA, TLE]
+  required: [WA]
+mixed/bob.py:
+  permitted: [AC, WA, TLE]
+  required: [TLE]
+```
+
+## Specification per Test Data
+
+Top-level expectations apply to all test data, but you can be more fine-grained and specify expectations for subdirectories of `/data`.
+For instance, if you want all submissions in `wrong_answer` to pass the sample inputs, you’d write:
+
+```yaml
+wrong_answer:
+  sample: accepted
+  secret: wrong answer
+```
+
+# Schema
+
+Here is the specification (in CUE syntax):
+
+```cue
+#registry
+
+#registry: close({ [string]: #root })
+
+#verdict: "AC" | "WA" | "RTE" | "TLE"
+#verdicts: [...#verdict]
+
+#root: {
+    #expectations
+    [=~"^(sample|secret)"]: #expectations
+}
+
+#expectations: {
+    #common
+    #range
+    permitted?: #verdicts  // only these verdicts may appear
+    required?: #verdicts   // at least one of these verdicts must appear
+    judge_message?: string // this judge message must appear
+    score?: #range
+    fractional_score?: #fractional_range
+}
+
+#common: "accepted" |            // { permitted: [AC]; required: [AC] }
+    "wrong answer" |             // { permitted: [AC, WA]; required: [WA] }
+    "time limit exceeded" |      // { permitted: [AC, TLE]; required: [TLE] }
+    "runtime exception" |        // { permitted: [AC, RTE]; required: [RTE] }
+    "does not terminate" |       // { permitted: [AC, TLE, RTE]; required: [RTE, TLE] }
+    "not accepted" |             // { required: [RTE, TLE, WA] }
+    "full score"                 // { fractional_score: 1.0 }
+
+#range: number | [number, number]
+#fractional_range: #fraction | [#fraction, #fraction]
+#fraction: float & >=0.0 & <=1.0
+```
+
+# Examples
+
+```yaml
+# Simple examples for some common cases
+
+a.py: accepted             # AC on all cases
+b.py: wrong answer         # at least one WA, otherwise AC
+c.py: time limit exceeded  # at least one TLE, otherwise AC
+d.py: runtime exception    # at least one RTE, otherwise AC
+e.py: does not terminate   # at least one RTE or TLE, but no WA
+f.py: not accepted         # at least one RTE, TLE, or WA
+g.py: full score           # gets max_score
+
+# Submissions are identified by prefix:
+
+wrong_answer/: wrong answer  # expectations "wrong answer" apply to "wrong_answer/th.py" etc.
+
+# Abbreviations are just shorthands for richer maps
+# of "required" and "permitted" keys.
+#
+# For instance, these are the same:
+
+th.py: accepted
+---
+th.py:
+  permitted: [AC]
+  required: [AC]
+---
+
+# A submission failed by the output validator on some testcase.
+# These are the same:
+
+wrong.py: wrong answer
+---
+wrong.py:
+  permitted: [WA, AC]
+  required: [WA]
+---
+wrong.py:
+  permitted:      # alternative yaml syntax for a list of strings
+    - WA
+    - AC
+  required: [WA]
+---
+
+# Specify that the submission fails, but passes the samples.
+# These are the same, using the same abbreviations as
+# above for "accepted" and "wrong answer"
+
+wrong.py:
+  sample: accepted
+  secret: wrong answer
+---
+wrong.py:
+  sample:
+    permitted: [AC]
+    required: [AC]
+  secret:
+    permitted: [AC, WA]
+    required: [WA]
+
+# Constraints apply to testcases in the entire subtree of cases that match the string:
+funky.cpp:
+  permitted: [AC, WA, RTE]
+  secret:
+    permitted: [AC, RTE, TLE]  # TLE is forbidden at the ancestor, so this makes no sense
+  secret/small: accepted       # more restrictive than the ancestor: this is fine
+
+# Specification for subgroups works "all the way down to the testcase",
+# though it's seldom needed:
+funky.cpp:
+  secret/huge_instances/disconnected_graph:
+    permitted: [RTE, TLE]
+
+# Can also specify a required judge message, not only verdicts
+linear_search.py:
+  judge_message: "too many rounds"  # matches judgemessage.txt as substring, case-insensitive
+
+# Digit regexes like `\d+` are allowed, which helps with auto-numbered groups:
+
+submission.py:
+  secret/\d+-group/: accepted  # matches 02-group
+
+#########
+# Scoring
+#########
+
+# simplest case:
+th.py: full score
+
+# Partial solutions can be given in various ways:
+partial.py: [50, 60]
+---
+partial.py: 60
+---
+partial.py:
+  score: 60
+---
+partial.py:
+  score: [50, 60]
+---
+partial.py:
+  fractional_score: [.5, .6]  # fraction of full score
+---
+# For subtasks, you probably want to specify the
+# outcome per subgroup. You need to be more verbose:
+partial.py:
+  secret/subtask1: full score
+  secret/subtask2: 0
+  secret/subtask3: 0
+---
+# Can be even more verbose about scores
+partial.py:
+  secret/subtask1: full score
+  secret/subtask2:
+    score: 13                   # absolute score on this group is exactly 13
+  secret/subtask3:              # between 10% and 40% of (full score for subtask 3)
+    fractional_score: [.1, .4]
+---
+# Can still specify testcases
+bruteforce.py:
+  secret/subtask1: full score   # subtask 1 has small instances
+  secret/subtask2:
+    score: 0                    # No points for this subtask
+    required: [TLE]             # ... because some testcase timed out
+    permitted: [AC, TLE]        # ... rather than any WAs
+---
+# The common abbreviations work here as well; you probably want to write this instead:
+bruteforce.py:
+  secret/subtask1: full score           # could write "accepted" as well in this case
+  secret/subtask2: time limit exceeded  # this is more informative than "0"
+```
diff --git a/readme.md b/readme.md
index feceb035b..dd42baa44 100644
--- a/readme.md
+++ b/readme.md
@@ -1,203 +1,3 @@
-# BAPCtools
+# Expectation Proposal Implemented in BAPCtools
-
-BAPCtools is a tool for creating and developing problems following the
-CLICS (DOMjudge/Kattis) problem format specified [here](https://icpc.io/problem-package-format/spec/problem_package_format).
-
-The aim of this tool is to run all necessary compilation, validation, and
-testing commands while working on an ICPC-style problem.
-Ideally one should never have to manually run any compilation or testing command themselves.
-
-I'm interested to know who's using this, so feel free to inform me (e.g. via an issue) if so ;)
-The current state is relatively stable, but things do change from time to
-time since I'm not aware of usage outside of BAPC yet.
-
-## Installation
-
-You can install the [bapctools-git AUR
-package](https://aur.archlinux.org/packages/bapctools-git/), mirrored
-[here](https://github.com/RagnarGrootKoerkamp/bapctools-git), or use the [Docker
-image](#Docker).
-
-Otherwise, clone this repository and install the required dependencies manually.
-(If you know how to make a Debian package, feel free to help out.) - -- Python 3 (>= 3.6). -- The [yaml library](https://pyyaml.org/wiki/PyYAMLDocumentation) via `pip install pyyaml` or the `python[3]-yaml` Arch Linux package. -- The [colorama library](https://pypi.org/project/colorama/) via `pip install colorama` or the `python[3]-colorama` Arch Linux package. -- The `argcomplete` library for command line argument completion. Install via - `python[3]-argcomplete`. - - - Note that actually using `argcomplete` is optional, but recommended. - Detailed instructions are [here](https://argcomplete.readthedocs.io/en/latest/). - - TL;DR: Put `eval "$(register-python-argcomplete[3] tools.py)"` in your `.bashrc` or `.zshrc`. - -Optional dependencies, required for some subcommands: - -- The [ruamel.yaml library](https://pypi.org/project/ruamel.yaml/) via `pip install ruamel.yaml` or the `python-ruamel-yaml` Arch Linux package (`python3-ruamel.yaml` on Debian derivatives). - - This is only needed for commands that update `generators.yaml`. -- The `latexmk` and `pdflatex` commands, provided by `texlive-bin` on Arch Linux and - potentially some specific LaTeX packages (like tikz) provided by - `texlive-extra`. - These are only needed for building `pdf` files, not for `run` and `validate` and such. -- The [matplotlib library](https://pypi.org/project/matplotlib/) via `pip install matplotlib` or the `python[3]-matplotlib` Linux package. - - This is optional and only used by the `solve_stats` command. -- The [requests library](https://pypi.org/project/requests/) via `pip install requests` or the `python[3]-requests` Linux package. - - This is optional and only used by the commands that call the DOMjudge API (`export`, `solutions --order-from-css`, and `solve_stats`) or the Slack API (`create_slack_channels` command). -- The [questionary library](https://pypi.org/project/questionary/) via `pip install questionary`. - - This is optional and only used by the `new_contest` and `new_problem` commands. - -After cloning the repository, symlink [bin/tools.py](bin/tools.py) to somewhere in your `$PATH`. E.g., if `~/bin/` is in your `$PATH`, you can do: - -``` -% ln -s ~/git/BAPCtools/bin/tools.py ~/bin/bt -``` - -### Windows - -For Windows, you'll need the following in your -`path`: - -- `Python` for Python 3 -- `g++` to compile C++ -- `javac` and `java` to compile and run `java`. - -Resource limits (memory limit/hard cpu time limit) are also not supported. - -BAPCtools makes use of symlinks for building programs. By default users are not allowed to create symlinks on Windows. -This can be fixed by enabling Developer Mode on Windows (Only since Windows 10, version 1703 or newer). - -### Docker - -A docker image containing this git repo and dependencies, together with commonly -used languages, is provided at -[ragnargrootkoerkamp/bacptools](https://hub.docker.com/r/ragnargrootkoerkamp/bapctools). -This version may be somewhat outdated. Ping me if you'd like it to be updated. - -This image can be used for e.g.: - -- running CI on your repo. Also see `bt gitlabci` which generates a - `.gitlab-ci.yaml` file. Make sure to clear the entrypoint, e.g. `entrypoint: [""]`. -- running `bt` on your local problems. Use this command to mount your local - directory into the docker image and run a command on it: - ``` - docker run -v $PWD:/data --rm -it ragnargrootkoerkamp/bapctools - ``` - -To update the image: - -``` -$ sudo systemctl start docker -$ docker pull archlinux:latest -$ docker login -$ docker build . 
-t ragnargrootkoerkamp/bapctools -$ docker push ragnargrootkoerkamp/bapctools -$ ssh sudo docker pull ragnargrootkoerkamp/bapctools -``` - -The last step is needed when your CI server is not automatically pulling the latest version. - -## Usage - -BAPCtools can be run either from a problem directory or a contest directory. This -is automatically detected by searching for the `problem.yaml` file. - -The most common commands and options to use on an existing repository are: - -- [`bt run [-v] [submissions [submissions ...]] [testcases [testcases ...]]`](#run) -- [`bt test [--interactive | --samples | [testcases [testcases ...]]]`](#test) -- [`bt generate [-v] [--jobs JOBS]`](#generate) -- [`bt validate [-v] [--remove | --move-to DIR] [testcases [testcases ...]]`](#validate) -- [`bt pdf [-v]`](#pdf) - -The list of all available commands and options is at [doc/commands.md#synopsis](doc/commands.md#synopsis), -and more information regarding the implementation is at [doc/implementation_notes.md](doc/implementation_notes.md). - -### Run - -- `bt run [-v] [submissions [submissions ...]] [testcases [testcases ...]]` - -Without arguments, the `run` command runs all submissions against all testcases. -Specify one or more submissions and one or more testcases to only run the given submissions against the given testcases. - -Before running the given submissions, this command first makes sure that all generated testcases are up to date (in case `generators/generators.yaml` was found). - -![run](doc/images/run.gif) - -By default, `bt run` only prints one summary line per submission, and one additional line for each testcase with an unexpected result. Use `-v` to print one line per testcase instead. - -![run -v](doc/images/run-v.gif) - -### Test - -- `bt test [--samples | [testcases [testcases ...]]]` - -Use the `test` command to run a single submission on some testcases. The submission `stdout` and `stderr` are printed to the terminal instead of verified as an answer file. -Use `--samples` to run on the samples, or pass a list of testcases or directories containing testcases. Use `--interactive`/`-i` to run in interactive mode, where console input is forwarded to the submission. -This rebuilds and reruns the program until either `control-C` or `control-D` is pressed. It's also possible to supply the test case on the command line directly using e.g. `< /path/to/file.in` or `<<< "10 20"`. - -![test](doc/images/test.png) - -### Generate - -- `bt generate [-v] [--jobs JOBS]` - -Use the `generate` command to generate the testcases specified in `generators/generators.yaml`. See [doc/generators.md](doc/generators.md) for the specification of `generators.yaml` and see [doc/commands.md#generate](doc/commands.md#generate) for the full list of arguents. -Use `-j 0` to disable running multiple jobs in parallel (the default is `4`). - -![generate](./doc/images/generate.gif) - -### Validate - -- `bt validate [-v] [--remove | --move-to DIR] [testcases [testcases ...]]` - -Validate all the `.in` and `.ans` for all (given) testcases. It runs all validators from `input_validators` and `output_validators`. - -Validators can be one of - -- a single-file program. -- a multi-file program with all files in a common directory. -- a .ctd CheckTestData file (this needs the `checktestdata` executable in your `$PATH`). -- a .viva file. - -You can use `--remove` to delete all failing testcases or `--move ` to move -them to a separate directory. 
- -![validator](./doc/images/validate.png) - -### Pdf - -- `bt pdf [-v]` - -Use this command to compile the `problem.en.pdf` from the `problem_statement/problem.en.tex` LaTeX statement. -`problem.en.pdf` is written to the problem directory itself. - -This can also be used to create the contest pdf by running it from the contest directory. - -## Personal configuration file - -For some command-line flags, it is convenient if they are always set to the same value, which differs per user -(e.g., `--username` or `--password` for commands that access a CCS like DOMjudge, -or `--jobs` to limit parallel execution) or per contest (e.g., which statement languages are used). -For this, you can create a configuration YAML file containing key-value pairs -in one of the following locations, from low to high priority: - -- `$XDG_CONFIG_HOME/bapctools/config.yaml` (Unix-ish systems, where `$XDG_CONFIG_HOME` usually is `~/.config`) -- `%AppData%/bapctools/config.yaml` (Windows systems) -- `/.bapctools.yaml` - -The keys in this config file can be any option that can be passed on the command-line. -Note that the keys should be written out in full (e.g., `username: jury` rather than `u: jury`) -and any hyphens should be replaced with an underscore (e.g., `no_bar: True` rather than `no-bar: True`). - -## Contributing / Style guide - -- The python code in the repository is formatted using [black](https://github.com/psf/black). - To enable the pre-commit hook, install [pre-commit](https://pre-commit.com/) - with `pip` or your package manager (Arch: `python-pre-commit`) and run - `pre-commit install` from the repository root. All python code will now automatically be formatted - on each commit. - -- Imports are usually ordered with system libraries first, followed by a - newline, followed by local includes. Both groups are sorted alphabetically, - and `import` comes before `from ... import`. +See [doc/expectations](doc/expectations.md) diff --git a/test/problems/hello/submissions/expectations.yaml b/test/problems/hello/submissions/expectations.yaml new file mode 100644 index 000000000..e25a890a4 --- /dev/null +++ b/test/problems/hello/submissions/expectations.yaml @@ -0,0 +1,7 @@ +accepted/: accepted +wrong_answer/: wrong answer +mixed/: + allowed: ["AC", "WA", "TLE", "RTE"] +rejected/: not accepted +run_time_error/: runtime exception +time_limit_exceeded/: time limit exceeded diff --git a/test/problems/spanishinquisition/generators/generators.yaml b/test/problems/spanishinquisition/generators/generators.yaml new file mode 100644 index 000000000..7aebebda4 --- /dev/null +++ b/test/problems/spanishinquisition/generators/generators.yaml @@ -0,0 +1,36 @@ +solution: /submissions/accepted/th.py +data: + sample: + data: + "2": + in: 1 1 + ans: "2" + secret: + data: + "3": + in: 1 2 + ans: "3" + inverse: + in: -1 1 + ans: "0" + neg: + data: + neg_five: + in: -5 3 + ans: "-2" + thirteen: + in: 10 3 + ans: "13" + fourteen: + in: 10 4 + ans: "14" + 031-numbered: + in: 3 4 + ans: "7" + + + + + + + diff --git a/test/problems/spanishinquisition/input_validators/validate.py b/test/problems/spanishinquisition/input_validators/validate.py new file mode 100644 index 000000000..a452ae032 --- /dev/null +++ b/test/problems/spanishinquisition/input_validators/validate.py @@ -0,0 +1,12 @@ +#! 
/usr/bin/env python3 + +import sys + +line = input().split() + +try: + x = int(line[0]) + y = int(line[1]) +except ValueError: + sys.exit(43) +sys.exit(42) diff --git a/test/problems/spanishinquisition/problem.yaml b/test/problems/spanishinquisition/problem.yaml new file mode 100644 index 000000000..648fefd1e --- /dev/null +++ b/test/problems/spanishinquisition/problem.yaml @@ -0,0 +1 @@ +name: Spanish Inquisition diff --git a/test/problems/spanishinquisition/problem_statement/problem.en.tex b/test/problems/spanishinquisition/problem_statement/problem.en.tex new file mode 100644 index 000000000..53c65a9ef --- /dev/null +++ b/test/problems/spanishinquisition/problem_statement/problem.en.tex @@ -0,0 +1 @@ +I didn't expect this. diff --git a/test/problems/spanishinquisition/submissions/accepted/bad-wa.py b/test/problems/spanishinquisition/submissions/accepted/bad-wa.py new file mode 100644 index 000000000..a3a8bbdc4 --- /dev/null +++ b/test/problems/spanishinquisition/submissions/accepted/bad-wa.py @@ -0,0 +1,3 @@ +#! /usr/bin/env python3 + +print(abs(sum(map(int, input().split())))) diff --git a/test/problems/spanishinquisition/submissions/accepted/th.py b/test/problems/spanishinquisition/submissions/accepted/th.py new file mode 100644 index 000000000..350cab154 --- /dev/null +++ b/test/problems/spanishinquisition/submissions/accepted/th.py @@ -0,0 +1,3 @@ +#! /usr/bin/env python3 + +print(sum(map(int, input().split()))) diff --git a/test/problems/spanishinquisition/submissions/expectations.yaml b/test/problems/spanishinquisition/submissions/expectations.yaml new file mode 100644 index 000000000..8045f98de --- /dev/null +++ b/test/problems/spanishinquisition/submissions/expectations.yaml @@ -0,0 +1,42 @@ +# These are just shorthands and re-implement (correctly) +# the directories of the problem package specification. + +accepted/: accepted +wrong_answer/: wrong answer +time_limit_exceeded: time limit exceeded + +# You can go beyond that and specify expectation +# per subgroup instead: + +wa_sample_ac/: + sample: accepted + secret: wrong answer + +# Use explicit permissions and requirements instead of the shorthands: + +mixed/allverdicts.py: + secret/3: + permitted: [WA] + secret/fourteen: + permitted: [AC] + secret/negative_ans/: + permitted: [TLE] + secret/: + required: [RTE] + +# Patterns match as regular expressions from the start: + +mixed/seven.sh: + secret/\d+-numbered: accepted # matches testcase secret/031-numbered +mixed/bad-eight.sh: + secret/\d+-numbered: accepted # matches testcase secret/031-numbered (and will fail) + +mixed/superstitious.py: # for this submission + secret/thirteen: # ... on testcases that match this pattern + permitted: [WA] # ... permit exactly the verdict WA, nothing else + secret/(?!thirteen): accepted # everywhere else: accept + +mixed/bad-fourteen.py: + secret/thirteen: + permitted: [WA] + secret/(?!thirteen): accepted diff --git a/test/problems/spanishinquisition/submissions/mixed/allverdicts.py b/test/problems/spanishinquisition/submissions/mixed/allverdicts.py new file mode 100644 index 000000000..bd956024b --- /dev/null +++ b/test/problems/spanishinquisition/submissions/mixed/allverdicts.py @@ -0,0 +1,10 @@ +#! 
/usr/bin/env python3 +""" This submission produces get *all* verdicts on various inputs.""" + +x, y = map(int, input().split()) +result = 0 +while (x + y) // 2 != result // 2: # WA if result is odd + result += 1 +# TLE if result is negative +assert result != 0 # RTEs for result 0 +print(result) diff --git a/test/problems/spanishinquisition/submissions/mixed/bad-eight.sh b/test/problems/spanishinquisition/submissions/mixed/bad-eight.sh new file mode 100644 index 000000000..93f77f6a8 --- /dev/null +++ b/test/problems/spanishinquisition/submissions/mixed/bad-eight.sh @@ -0,0 +1 @@ +echo 8 diff --git a/test/problems/spanishinquisition/submissions/mixed/bad-fourteen.py b/test/problems/spanishinquisition/submissions/mixed/bad-fourteen.py new file mode 100644 index 000000000..68952e4a0 --- /dev/null +++ b/test/problems/spanishinquisition/submissions/mixed/bad-fourteen.py @@ -0,0 +1,12 @@ +#! /usr/bin/env python3 +""" Irrationally afraid of 13, and (as expected) + fails on corresponding inputs. However, + also (unexpectedly) fails on 14. +""" + +x, y = map(int, input().split()) +result = x + y +if 12 < result < 15: + print(12) +else: + print(result) diff --git a/test/problems/spanishinquisition/submissions/mixed/seven.sh b/test/problems/spanishinquisition/submissions/mixed/seven.sh new file mode 100644 index 000000000..65c166890 --- /dev/null +++ b/test/problems/spanishinquisition/submissions/mixed/seven.sh @@ -0,0 +1 @@ +echo 7 diff --git a/test/problems/spanishinquisition/submissions/mixed/superstitious.py b/test/problems/spanishinquisition/submissions/mixed/superstitious.py new file mode 100644 index 000000000..70de8bd1b --- /dev/null +++ b/test/problems/spanishinquisition/submissions/mixed/superstitious.py @@ -0,0 +1,11 @@ +#! /usr/bin/env python3 +""" Irrationally afraid of the integer between 12 and 14 and therefore fails on + inputs where that's the correct answer. +""" + +x, y = map(int, input().split()) +result = x + y +if 12 < result < 14: + print(12) +else: + print(result) diff --git a/test/problems/spanishinquisition/submissions/time_limit_exceeded/bad-tle-wa.py b/test/problems/spanishinquisition/submissions/time_limit_exceeded/bad-tle-wa.py new file mode 100644 index 000000000..740c8f132 --- /dev/null +++ b/test/problems/spanishinquisition/submissions/time_limit_exceeded/bad-tle-wa.py @@ -0,0 +1,10 @@ +#! /usr/bin/env python3 +""" This is wrong exactly when the two inputs are different and runs + forever exactly if the sum is negative. +""" + +inputs = set(map(int, input().split())) +result = 0 +while result != sum(inputs): + result += 1 +print(result) diff --git a/test/problems/spanishinquisition/submissions/wa_sample_ac/bad-five.py b/test/problems/spanishinquisition/submissions/wa_sample_ac/bad-five.py new file mode 100644 index 000000000..c28cd73d0 --- /dev/null +++ b/test/problems/spanishinquisition/submissions/wa_sample_ac/bad-five.py @@ -0,0 +1,2 @@ +#! 
/usr/bin/env python3 +print(5) diff --git a/test/problems/spanishinquisition/submissions/wa_sample_ac/two.py b/test/problems/spanishinquisition/submissions/wa_sample_ac/two.py new file mode 100644 index 000000000..d0e0fd661 --- /dev/null +++ b/test/problems/spanishinquisition/submissions/wa_sample_ac/two.py @@ -0,0 +1 @@ +print(2) diff --git a/test/problems/spanishinquisition/submissions/wrong_answer/as_set.py b/test/problems/spanishinquisition/submissions/wrong_answer/as_set.py new file mode 100644 index 000000000..4ea325e21 --- /dev/null +++ b/test/problems/spanishinquisition/submissions/wrong_answer/as_set.py @@ -0,0 +1,5 @@ +#! /usr/bin/env python3 +""" This is wrong exactly when the two inputs are different """ + +inputs = set(map(int, input().split())) +print(sum(inputs)) diff --git a/test/problems/spanishinquisition/submissions/wrong_answer/bad-ac.py b/test/problems/spanishinquisition/submissions/wrong_answer/bad-ac.py new file mode 100644 index 000000000..350cab154 --- /dev/null +++ b/test/problems/spanishinquisition/submissions/wrong_answer/bad-ac.py @@ -0,0 +1,3 @@ +#! /usr/bin/env python3 + +print(sum(map(int, input().split()))) diff --git a/test/problems/spanishinquisition/submissions/wrong_answer/bad-search.py b/test/problems/spanishinquisition/submissions/wrong_answer/bad-search.py new file mode 100644 index 000000000..625c2e91f --- /dev/null +++ b/test/problems/spanishinquisition/submissions/wrong_answer/bad-search.py @@ -0,0 +1,8 @@ +#! /usr/bin/env python3 + +x, y = map(int, input().split()) + +result = 0 +while result != x + y: + result += 1 +print(result) diff --git a/test/problems/spanishinquisition/submissions/wrong_answer/wa.py b/test/problems/spanishinquisition/submissions/wrong_answer/wa.py new file mode 100644 index 000000000..ea1aa6e41 --- /dev/null +++ b/test/problems/spanishinquisition/submissions/wrong_answer/wa.py @@ -0,0 +1,2 @@ +#! /usr/bin/env python3 +print(abs(sum(map(int, input().split()))))
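
As an illustration (not part of the patch above): a minimal sketch of how the new `expectations.Registry` could be exercised by hand against a problem's `submissions/expectations.yaml`. It assumes `bin/` is on `PYTHONPATH` so that the `expectations` module from this diff is importable and that PyYAML is installed; the submission path and verdicts below are made up for the example.

```python
#!/usr/bin/env python3
"""Illustrative only: load an expectations.yaml and check hand-written results against it."""
from pathlib import Path

import yaml  # PyYAML, or any YAML loader that produces plain dicts

import expectations  # bin/expectations.py introduced in this diff

# Parse the problem's expectations file into a Registry.
with open("submissions/expectations.yaml") as f:
    registry = expectations.Registry.from_dict(yaml.safe_load(f))

# Hypothetical verdicts for one submission, keyed by test case path.
results = {"sample/1": "AC", "secret/1": "WA", "secret/2": "AC"}

# Restrict the registry to the patterns that match this submission.
sub = registry.for_path(Path("wrong_answer/th.py"))

# True iff all permissions hold and every requirement is met.
print(sub.check(results))

# List any requirements that are still missing, similar to what run.py reports.
for prefix, pattern, verdicts in sub.unsatisfied_requirements(results):
    print(f"{prefix}: {pattern!r} still needs one of {sorted(verdicts)}")
```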