|  | 
|  | 1 | +#!/usr/bin/env python | 
|  | 2 | + | 
|  | 3 | +"""Check if license fields are valid in all records.""" | 
|  | 4 | + | 
|  | 5 | +import asyncio | 
|  | 6 | +import dataclasses | 
|  | 7 | +import itertools | 
|  | 8 | +import json | 
|  | 9 | +import logging | 
|  | 10 | +import os | 
|  | 11 | +import time | 
|  | 12 | +import typing | 
|  | 13 | +from glob import glob | 
|  | 14 | +from pathlib import Path | 
|  | 15 | + | 
|  | 16 | +import click | 
|  | 17 | +import yaml | 
|  | 18 | + | 
# Event loop shared by the CLI entry point and the coroutines it drives.
# A loop is created explicitly: calling ``asyncio.get_event_loop()`` while no
# loop is running is deprecated since Python 3.12 and raises ``RuntimeError``
# on Python 3.14.
LOOP = asyncio.new_event_loop()
# Default locations, resolved against the directory the script is started from.
MAPPING = Path(os.getcwd()) / "scripts" / "record_mapping.yaml"
# Glob pattern (not a real path) used when no files are given on the command line.
FILES = Path(os.getcwd()) / "data" / "records" / "*"
|  | 22 | + | 
|  | 23 | + | 
@dataclasses.dataclass
class InvalidRecord:
    """One validation failure found for a record inside a file.

    A record that violates the mapping in several places produces several
    instances, one per message.
    """

    # Value of the record's "recid" key, or None when the record has none.
    recid: typing.Optional[str]
    # File the offending record was read from.
    path: Path
    # Human-readable description of the failure.
    msg: str
|  | 31 | + | 
|  | 32 | + | 
@click.command()
@click.option(
    "-m",
    "--mapping",
    default=MAPPING,
    type=click.Path(readable=True, path_type=Path, dir_okay=False),
    help="Path to check records against.",
)
@click.option(
    "-v", "--verbose", default=False, is_flag=True, help="Print verbose output."
)
@click.argument("files", type=click.Path(readable=True, path_type=Path), nargs=-1)
def command(**kwargs):
    """Validate the files at the supplied paths. Arguments support unix-like patterns."""
    # NOTE: the docstring above is what click prints for ``--help``, so it is
    # kept to a single user-facing sentence; implementation notes live here.
    #
    # ``kwargs`` holds the parsed ``mapping``, ``verbose`` and ``files`` values
    # and is forwarded unchanged to ``main``. ``main`` terminates the process
    # itself by raising ``SystemExit``; the ``finally`` clause makes sure the
    # shared loop is closed on that path as well.
    try:
        LOOP.run_until_complete(main(**kwargs))
    finally:
        LOOP.close()
|  | 51 | + | 
|  | 52 | + | 
async def main(mapping, verbose, files) -> None:
    """Validate record files against the mapping and exit with a status code.

    Args:
        mapping: Path of the YAML file describing the expected record fields.
        verbose: When true, log at DEBUG level instead of INFO.
        files: Paths or glob patterns of the files to validate. When empty,
            the module-level ``FILES`` pattern is used.

    Raises:
        SystemExit: Always. The code is 0 when no errors were collected and 1
            otherwise.
    """
    start_time = time.perf_counter()
    files = files or (FILES,)

    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format="[%(levelname)s] %(message)s",
    )

    logging.info("Loading mapping file...")

    def load_mapping():
        # The context manager closes the handle even when parsing fails.
        with open(mapping, "r", encoding="utf-8") as handle:
            return yaml.safe_load(handle)

    # Blocking file I/O runs in the default executor. The result gets its own
    # name so the closure above never observes a rebound ``mapping``.
    schema = await asyncio.get_running_loop().run_in_executor(None, load_mapping)

    # Expand every argument as a glob pattern and flatten the matches.
    paths = [Path(match) for pattern in files for match in glob(str(pattern))]
    logging.info("Found %d files. Validating...", len(paths))

    # ``gather`` schedules each coroutine as a task and keeps the input order.
    results = await asyncio.gather(
        *(validate_single(path, schema) for path in paths)
    )

    finish = f"within {time.perf_counter() - start_time:.2f} seconds"
    logging.info(
        "Validated %d files (%d records) %s.",
        len(paths),
        sum(count for count, _, _ in results),
        finish,
    )

    # Keep only the files that produced at least one error.
    errors = {path: found for _, found, path in results if found}
    if not errors:
        logging.info("All files validated successfully. No errors found.")
        # ``SystemExit`` is raised directly: the ``exit`` builtin is installed
        # by the ``site`` module and is not guaranteed to exist.
        raise SystemExit(0)

    logging.error(
        "Found %d errors in %d files.",
        sum(len(found) for found in errors.values()),
        len(errors),
    )

    for path, found in errors.items():
        logging.error("File %s has %d errors:", path.name, len(found))

        for error in found:
            logging.error(" - %s: %s", error.recid or "UNSET", error.msg)

    raise SystemExit(1)
|  | 99 | + | 
|  | 100 | + | 
def _check_document(doc, validation, stack=()) -> typing.Iterator[str]:
    """Recursively yield one message per place where ``doc`` violates ``validation``.

    ``validation`` is interpreted as follows:

    * a dict lists the keys expected in ``doc``; a key prefixed with ``?`` is
      optional. When the value found in ``doc`` is a list, every element is
      checked against the nested validation.
    * ``None`` accepts any value.
    * a list gives the allowed values; any other value is the single allowed
      value.

    ``stack`` holds the keys followed so far and is only used in messages.
    """
    location = "][".join(stack)

    if isinstance(validation, dict):
        if not isinstance(doc, dict):
            # Without this guard ``key not in doc`` raises TypeError for
            # None/numbers and silently does a substring test for strings.
            yield (
                f"Value of [{location}]: `{doc}` is not an object, "
                f"but the mapping expects keys {list(validation)}"
            )
            return

        for raw_key, sub_validation in validation.items():
            key = raw_key.removeprefix("?")

            if key not in doc:
                if not raw_key.startswith("?"):
                    yield f"Missing required key [{location}]->{key}"
                continue

            value = doc[key]
            for sub_doc in value if isinstance(value, list) else [value]:
                yield from _check_document(sub_doc, sub_validation, (*stack, key))
        return

    if validation is None:
        return

    allowed = validation if isinstance(validation, list) else [validation]
    if doc not in allowed:
        yield f"Value of [{location}]: `{doc}` does not match any valid patterns: {allowed}"


async def validate_single(
    path: Path, mapping: dict
) -> tuple[int, list[InvalidRecord], Path]:
    """Validate a single file against the mapping schema.

    Args:
        path: JSON file to read; it is expected to contain a list of records.
        mapping: Validation schema, see ``_check_document`` for its format.

    Returns:
        A tuple ``(record_count, errors, path)``. A file that cannot be read
        or parsed, or whose top level is not a list, yields a record count of
        0 and a single error describing the problem.
    """
    errors = []

    def load_json():
        # The context manager closes the handle even when parsing fails.
        with open(path, "rb") as handle:
            return json.load(handle)

    try:
        # Reading and parsing are blocking, so they run in the default executor.
        records = await asyncio.get_running_loop().run_in_executor(None, load_json)
    except (OSError, ValueError) as e:
        # OSError covers unreadable paths (including directories matched by a
        # glob); ValueError covers JSON and Unicode decoding failures.
        logging.error("Failed to load json file %s: %s", path.name, e)
        # Record the failure so the caller does not report the file as valid.
        errors.append(
            InvalidRecord(recid=None, path=path, msg=f"Failed to load json file: {e}")
        )
        records = []

    if not isinstance(records, list):
        # Iterating a dict or string here would validate keys/characters.
        errors.append(
            InvalidRecord(
                recid=None,
                path=path,
                msg=f"Expected a list of records, found {type(records).__name__}",
            )
        )
        records = []

    for record in records:
        # A malformed (non-dict) record has no id to report.
        recid = record.get("recid") if isinstance(record, dict) else None
        errors.extend(
            InvalidRecord(recid=recid, path=path, msg=message)
            for message in _check_document(record, mapping)
        )

    logging.debug("Validated file %s with %d records.", path.name, len(records))
    return len(records), errors, path
|  | 150 | + | 
|  | 151 | + | 
# Run the click command only when executed as a script, not when imported.
if __name__ == "__main__":
    command()
0 commit comments