diff --git a/doc/sphinx/source/utils/RTW/user_guide/workflow.rst b/doc/sphinx/source/utils/RTW/user_guide/workflow.rst
index 790860ecaa..9b125749bc 100644
--- a/doc/sphinx/source/utils/RTW/user_guide/workflow.rst
+++ b/doc/sphinx/source/utils/RTW/user_guide/workflow.rst
@@ -65,6 +65,19 @@ The |RTW| performs the following steps:
         Runs each cycle for every recipe defined in the |RTW| after
         ``process`` has completed
 
+``generate_report``
+    :Description:
+        Generates a per-recipe HTML summary of the results of the
+        ``process`` and ``compare`` jobs.
+    :Runs on:
+        Localhost
+    :Executes:
+        The ``generate_html_report.py`` script from the |Rose| app, and other
+        helper scripts depending on ``SITE``.
+    :Details:
+        Runs for every cycle. The report is output to the |Cylc| share/cycle
+        directory.
+
 ``housekeeping``
     :Description:
         Removes the logs and data (including recipe outputs)
diff --git a/esmvaltool/utils/recipe_test_workflow/app/generate_report/bin/fetch_commit_info.py b/esmvaltool/utils/recipe_test_workflow/app/generate_report/bin/fetch_commit_info.py
new file mode 100644
index 0000000000..2dbe7a8fae
--- /dev/null
+++ b/esmvaltool/utils/recipe_test_workflow/app/generate_report/bin/fetch_commit_info.py
@@ -0,0 +1,242 @@
+"""Fetch commit details from the GitHub API."""
+
+import os
+
+import requests
+
+GITHUB_API_URL = "https://api.github.com"
+GITHUB_API_PERSONAL_ACCESS_TOKEN = os.environ.get(
+    "GITHUB_API_PERSONAL_ACCESS_TOKEN"
+)
+HEADERS = {
+    "authorization": f"token {GITHUB_API_PERSONAL_ACCESS_TOKEN}",
+    # Suggested here:
+    # https://docs.github.com/en/rest/commits/commits?apiVersion=2022-11-28#get-a-commit--parameters
+    # Explanation here:
+    # https://docs.github.com/en/rest/overview/resources-in-the-rest-api#http-HEADERS
+    "accept": "application/vnd.github+json",
+}
+
+
+def fetch_commit_details_from_github_api(
+    shas_by_package_and_day, headers=HEADERS
+):
+    """
+    Fetch commit details from the GitHub API for the given SHAs.
+
+    Parameters
+    ----------
+    shas_by_package_and_day : dict[str, dict[str, str]]
+        A dictionary where keys are the package names and values are
+        dictionaries with days as keys and SHAs as values. E.g.
+        {"ESMValCore": {"today": "abcd123", "yesterday": "efgh456"}, ...}.
+    headers : dict, optional
+        Headers to include in the request.
+
+    Returns
+    -------
+    dict[str, list[dict]]
+        A dictionary where keys are the package names and values are lists of
+        commit details for each day. E.g.
+        {"ESMValCore": [{"sha": "abcd123", ...}, ...], "ESMValTool": [...]}
+    """
+    commit_details_by_package = {}
+    for package, shas_by_day in shas_by_package_and_day.items():
+        # Fall back to a single-commit fetch when there is no SHA for
+        # yesterday, or when the SHA has not changed since yesterday.
+        if shas_by_day.get("yesterday") is None or shas_by_day.get(
+            "today"
+        ) == shas_by_day.get("yesterday"):
+            raw_commits = fetch_single_commit(
+                package, "ESMValGroup", headers, shas_by_day["today"]
+            )
+        else:
+            raw_commits = fetch_range_of_commits(
+                package,
+                "ESMValGroup",
+                headers,
+                newer_sha=shas_by_day["today"],
+                older_sha=shas_by_day["yesterday"],
+            )
+        commit_info = process_commit_info(raw_commits)
+        commit_details_by_package[package] = commit_info
+    return commit_details_by_package
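+
+
+# A minimal usage sketch, not part of the module's API: the SHAs below are
+# hypothetical placeholders that show the expected input shape.
+def _example_fetch_commit_details():
+    shas_by_package_and_day = {
+        "ESMValCore": {"today": "abcd123", "yesterday": "efgh456"},
+        "ESMValTool": {"today": "ijkl789", "yesterday": None},
+    }
+    # "ESMValTool" falls back to a single-commit fetch because its
+    # "yesterday" SHA is None; "ESMValCore" fetches the range
+    # efgh456..abcd123.
+    return fetch_commit_details_from_github_api(shas_by_package_and_day)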
+
+
+def make_api_call(url, headers=None, params=None):
+    """
+    Make a GET request to a given API URL.
+
+    Parameters
+    ----------
+    url : str
+        The URL to make the request to.
+    headers : dict, optional
+        Headers to include in the request.
+    params : dict, optional
+        Query parameters to include in the request.
+
+    Raises
+    ------
+    HTTPError
+        If the request fails or returns a status code other than 200,
+        including when the API rate limit is exceeded.
+    Timeout
+        If the request times out.
+    ConnectionError
+        If there is a connection error.
+
+    Returns
+    -------
+    Response
+        The raw response from the API call.
+    """
+    try:
+        response = requests.get(
+            url, headers=headers, params=params, timeout=10
+        )
+        if response.status_code != 200:
+            raise requests.exceptions.HTTPError(
+                f"Unexpected status code for url={url} headers={headers} "
+                f"params={params} - {response.status_code}: {response.text}"
+            )
+    except requests.exceptions.HTTPError as http_err:
+        raise requests.exceptions.HTTPError(
+            f"HTTP error occurred: {http_err}"
+        ) from http_err
+    except requests.exceptions.Timeout as timeout_err:
+        raise requests.exceptions.Timeout(
+            f"Request timed out: {timeout_err}"
+        ) from timeout_err
+    except requests.exceptions.ConnectionError as conn_err:
+        raise requests.exceptions.ConnectionError(
+            f"Connection error occurred: {conn_err}"
+        ) from conn_err
+    return response
+
+
+def fetch_single_commit(repo, owner, headers, sha):
+    """
+    Fetch details of a single commit from the GitHub API.
+
+    Parameters
+    ----------
+    repo : str
+        The name of the repository. E.g. "ESMValTool"
+    owner : str
+        The owner of the repository. E.g. "ESMValGroup"
+    headers : dict
+        Headers to include in the request.
+    sha : str
+        The SHA of the commit to fetch details for.
+
+    Raises
+    ------
+    HTTPError
+        If the commit is not found or if the request fails.
+
+    Returns
+    -------
+    dict
+        The raw commit data if found.
+    """
+    url = f"{GITHUB_API_URL}/repos/{owner}/{repo}/commits/{sha}"
+    response = make_api_call(url, headers=headers)
+    raw_commit = response.json()
+    return raw_commit
+
+
+def fetch_range_of_commits(repo, owner, headers, newer_sha, older_sha):
+    """
+    Fetch details for a range of commits from the GitHub API.
+
+    The endpoint returns commits in reverse chronological order, from the
+    newer SHA back to the older SHA. The function fetches batches of 10
+    commits to avoid hitting the API rate limits. NOTE: The GitHub API will
+    raise an HTTPError if the newer SHA is not found.
+
+    Parameters
+    ----------
+    repo : str
+        The name of the repository. E.g. "ESMValTool"
+    owner : str
+        The owner of the repository. E.g. "ESMValGroup"
+    headers : dict
+        Headers to include in the request.
+    newer_sha : str
+        The SHA of the first commit to start fetching details for.
+    older_sha : str
+        The SHA of the commit to stop fetching at.
+
+    Raises
+    ------
+    HTTPError
+        If the newer SHA is not found or if the request fails.
+    ValueError
+        If too many pages are fetched, indicating a potential infinite loop.
+
+    Returns
+    -------
+    list[dict]
+        A list of raw commit data for the range of commits from newer_sha
+        back to older_sha, newest first.
+    """
+    url = f"{GITHUB_API_URL}/repos/{owner}/{repo}/commits"
+    params = {
+        "per_page": 10,
+        "sha": newer_sha,
+    }
+    page = 1
+    range_raw_commits = []
+
+    fetched_end_sha = False
+
+    while not fetched_end_sha:
+        params["page"] = page
+        response = make_api_call(url, headers=headers, params=params)
+
+        page_raw_commits = response.json()
+        for raw_commit in page_raw_commits:
+            range_raw_commits.append(raw_commit)
+            if raw_commit["sha"].startswith(older_sha):
+                fetched_end_sha = True
+                break
+
+        # With "per_page" set to 10 and a five-page cap, at most 50 commits
+        # can be fetched before a ValueError is raised.
+        page += 1
+        if page > 5:
+            raise ValueError(
+                "Too many pages fetched, likely an infinite loop. Check the "
+                "newer and older SHAs."
+            )
+
+    return range_raw_commits
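+
+
+# A minimal sketch, not part of the module's API: make_api_call can also be
+# pointed at the /rate_limit endpoint (which does not count against the
+# quota) to check the remaining request allowance before a large fetch.
+def _example_check_rate_limit():
+    response = make_api_call(f"{GITHUB_API_URL}/rate_limit", headers=HEADERS)
+    return response.json()["resources"]["core"]["remaining"]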
+
+
+def process_commit_info(raw_commit_info):
+    """
+    Extract the required commit details.
+
+    Parameters
+    ----------
+    raw_commit_info : dict | list[dict]
+        Raw commit information from the GitHub API. Either a single commit
+        or a list of commits.
+
+    Returns
+    -------
+    list[dict]
+        A list of dictionaries containing processed commit information.
+    """
+    if not isinstance(raw_commit_info, list):
+        raw_commit_info = [raw_commit_info]
+
+    processed_commit_info = []
+    for raw_commit in raw_commit_info:
+        processed_commit = {
+            "sha": raw_commit["sha"][:7],
+            "author": raw_commit["commit"]["author"]["name"],
+            "message": raw_commit["commit"]["message"],
+            "date": raw_commit["commit"]["author"]["date"],
+            "url": raw_commit["html_url"],
+            "author_avatar": raw_commit["author"]["avatar_url"],
+        }
+        processed_commit_info.append(processed_commit)
+    return processed_commit_info
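+
+
+# A minimal sketch, not part of the module's API: a fabricated raw commit in
+# the GitHub API shape, showing what process_commit_info extracts. All
+# values are hypothetical placeholders.
+if __name__ == "__main__":
+    example_raw_commit = {
+        "sha": "abcd1234567",
+        "html_url": "https://github.com/ESMValGroup/ESMValTool/commit/abcd123",
+        "commit": {
+            "author": {"name": "A. Author", "date": "2024-01-01T00:00:00Z"},
+            "message": "Fix a bug",
+        },
+        "author": {"avatar_url": "https://example.com/avatar.png"},
+    }
+    print(process_commit_info(example_raw_commit))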
diff --git a/esmvaltool/utils/recipe_test_workflow/app/generate_report/bin/generate_html_report.py b/esmvaltool/utils/recipe_test_workflow/app/generate_report/bin/generate_html_report.py
index 0498aff7fc..df68bde847 100755
--- a/esmvaltool/utils/recipe_test_workflow/app/generate_report/bin/generate_html_report.py
+++ b/esmvaltool/utils/recipe_test_workflow/app/generate_report/bin/generate_html_report.py
@@ -1,17 +1,76 @@
 #!/usr/bin/env python
+"""Generate an HTML summary report from a Cylc SQLite database."""
+
 import os
+import shutil
 import sqlite3
+import subprocess
+import traceback
 from datetime import datetime
+from pathlib import Path
 
 from jinja2 import Environment, FileSystemLoader, select_autoescape
+from requests.exceptions import ConnectionError, HTTPError, Timeout
+
+# Import from the ESMValTool package for testing.
+try:
+    from esmvaltool.utils.recipe_test_workflow.app.generate_report.bin.fetch_commit_info import (
+        fetch_commit_details_from_github_api,
+    )
+    from esmvaltool.utils.recipe_test_workflow.app.generate_report.bin.shas_via_git import (
+        get_shas_from_git,
+    )
+    from esmvaltool.utils.recipe_test_workflow.app.generate_report.bin.shas_via_singularity import (
+        get_shas_from_singularity,
+    )
+# Import locally for running in Cylc.
+except ImportError:
+    from fetch_commit_info import fetch_commit_details_from_github_api
+    from shas_via_git import get_shas_from_git
+    from shas_via_singularity import get_shas_from_singularity
+
+# Load environment variables required at all sites.
 CYLC_DB_PATH = os.environ.get("CYLC_DB_PATH")
 CYLC_TASK_CYCLE_POINT = os.environ.get("CYLC_TASK_CYCLE_POINT")
-CYLC_WORKFLOW_SHARE_DIR = os.environ.get("CYLC_WORKFLOW_SHARE_DIR")
+CYLC_TASK_CYCLE_YESTERDAY = os.environ.get("CYLC_TASK_CYCLE_YESTERDAY")
+CYLC_WORKFLOW_RUN_DIR = os.environ.get("CYLC_WORKFLOW_RUN_DIR")
+OUTPUT_DIR = os.environ.get("OUTPUT_DIR")
 REPORT_PATH = os.environ.get("REPORT_PATH")
-
-
-def main(db_file_path=CYLC_DB_PATH):
+SITE = os.environ.get("SITE")
+
+ESMVAL_VERSIONS_TODAY = None
+ESMVAL_VERSIONS_YESTERDAY = None
+REPOS = None
+# Initialise the DKRZ-only variables so references to them do not raise a
+# NameError at other sites.
+PRODUCTION = None
+VM_DEBUG_LOG_DIR = None
+
+if SITE == "dkrz":
+    ESMVAL_VERSIONS_TODAY = os.environ.get("ESMVAL_VERSIONS_CURRENT")
+    ESMVAL_VERSIONS_YESTERDAY = os.environ.get("ESMVAL_VERSIONS_PREVIOUS")
+    PRODUCTION = os.environ.get("PRODUCTION")
+    VM_PATH = os.environ.get("VM_PATH")
+    if VM_PATH:
+        VM_DEBUG_LOG_DIR = Path(VM_PATH) / "debug_logs"
+
+if SITE == "metoffice":
+    REPOS = {
+        "ESMValCore_today": os.environ.get("ESMVALCORE_DIR"),
+        "ESMValTool_today": os.environ.get("ESMVALTOOL_DIR"),
+    }
+    if CYLC_TASK_CYCLE_YESTERDAY:
+        path_to_yesterdays_cycle = Path(CYLC_TASK_CYCLE_YESTERDAY)
+        REPOS["ESMValCore_yesterday"] = path_to_yesterdays_cycle / "ESMValCore"
+        REPOS["ESMValTool_yesterday"] = path_to_yesterdays_cycle / "ESMValTool"
+
+
+def main(
+    db_file_path=CYLC_DB_PATH,
+    site=SITE,
+    report_path=REPORT_PATH,
+    cylc_task_cycle_point=CYLC_TASK_CYCLE_POINT,
+    esmval_versions_today=ESMVAL_VERSIONS_TODAY,
+    esmval_versions_yesterday=ESMVAL_VERSIONS_YESTERDAY,
+    repos=REPOS,
+):
     """
     Main function to generate the HTML report.
@@ -19,18 +78,98 @@ def main(db_file_path=CYLC_DB_PATH):
     ----------
-    db_file_path : str, default CYLC_DB_FILE_PATH
+    db_file_path : str, default CYLC_DB_PATH
         The path to the SQLite database file.
+    site : str
+        The site the Recipe Test Workflow is being run at.
+    report_path : str
+        The path to output the HTML report to.
+    cylc_task_cycle_point : str
+        The cycle point of the task as a string in ISO 8601 format.
+    esmval_versions_today : str | None
+        The path to today's Singularity container, if the site uses a
+        Singularity container, or None.
+    esmval_versions_yesterday : str | None
+        The path to yesterday's Singularity container, if the site uses a
+        Singularity container and it exists, or None.
+    repos : dict[str, str] | None
+        A dictionary of git repos if the site uses git repos, or None.
     """
-    raw_db_data = fetch_report_data(db_file_path)
+    commit_info = None
+    sha_info = None
+    raw_db_data = fetch_report_data(db_file_path, cylc_task_cycle_point)
     processed_db_data = process_db_output(raw_db_data)
-    subheader = create_subheader()
+
+    # Commits/SHAs will only be included for these sites. The report will run
+    # at other sites without commit/SHA information.
+    try:
+        if site == "dkrz":
+            sha_info = get_shas_from_singularity(
+                esmval_versions_today, esmval_versions_yesterday
+            )
+            # Debug logs will only be added to the report at DKRZ.
+            if PRODUCTION == "True" and VM_DEBUG_LOG_DIR:
+                if VM_DEBUG_LOG_DIR.exists():
+                    shutil.rmtree(VM_DEBUG_LOG_DIR)
+                debug_log_processor(processed_db_data)
+        elif site == "metoffice":
+            sha_info = get_shas_from_git(repos)
+    # Catch the following errors so the report is generated even if fetching
+    # the commit/SHA data or processing the debug logs fails. These errors
+    # are either propagated on purpose or indicate a probable minor issue.
+    except (
+        ValueError,
+        KeyError,
+        IndexError,
+        HTTPError,
+        Timeout,
+        ConnectionError,
+    ):
+        print(
+            "Generating the report with results only. An error occurred "
+            "while fetching commit data and/or debug logs. See the std.err "
+            "log for details."
+        )
+        traceback.print_exc()
+
+    if sha_info:
+        commit_info = fetch_commit_details_from_github_api(sha_info)
+        add_report_messages_to_commits(commit_info)
+
+    subheader = create_subheader(cylc_task_cycle_point)
     rendered_html = render_html_report(
         subheader=subheader,
         report_data=processed_db_data,
+        commit_info=commit_info,
     )
-    write_report_to_file(rendered_html)
+    write_report_to_file(rendered_html, report_path)
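+
+
+# A minimal sketch, not part of the PR: because main() takes its
+# configuration as keyword arguments rather than reading the environment
+# directly, it can be exercised outside Cylc, e.g. in a test. All paths
+# below are hypothetical.
+def _example_run_main_locally():
+    main(
+        db_file_path="/tmp/cylc-run/rtw/log/db",
+        site="metoffice",
+        report_path="/tmp/report.html",
+        cylc_task_cycle_point="20240101T0000Z",
+        repos={"ESMValCore_today": "/tmp/ESMValCore"},
+    )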
+
+
+def add_report_messages_to_commits(commit_info):
+    """
+    Add report messages to a commit info dictionary in-place.
+
+    Parameters
+    ----------
+    commit_info : dict[str, list[dict]]
+        A dictionary where keys are package names and values are lists of
+        commit details. E.g.
+        {
+            "ESMValCore": [
+                {"sha": "abcd123", "message": "Fix bug", ...},
+                {"sha": "efgh456", "message": "Add feature", ...}
+            ],
+            "ESMValTool": [
+                {"sha": "ijkl789", "message": "Update docs", ...}
+            ]
+        }
+    """
+    if commit_info:
+        for package_commits in commit_info.values():
+            package_commits[0]["report_flag"] = "Commit tested this cycle >>>"
+            if len(package_commits) > 1:
+                package_commits[-1]["report_flag"] = (
+                    "Commit tested last cycle >>>"
+                )
 
 
-def fetch_report_data(db_file_path, target_cycle_point=CYLC_TASK_CYCLE_POINT):
+def fetch_report_data(db_file_path, target_cycle_point):
     """
     Fetch report data for a single cycle from the Cylc SQLite database.
@@ -38,9 +177,8 @@
     ----------
     db_file_path : str
         The path to the SQLite database file.
-    target_cycle_point : str, default CYLC_TASK_CYCLE_POINT
-        The cycle point to collect data for. Defaults to the current cylc
-        cycle.
+    target_cycle_point : str
+        The cycle point to collect data for.
 
     Returns
     -------
@@ -63,14 +201,6 @@
     """
     Process db output data for a single task.
 
-    Create a dictionary in the format:
-    ```
-    "process_task": {
-        "status": "succeeded",
-        "style": "color: green"
-    },
-    ```
-
     Parameters
     ----------
     task_name: str
@@ -80,9 +210,9 @@
 
     Returns
     -------
-    tuple[str, dict]
-        A tuple containing the name of the recipe as a string and the task data
-        as a dictionary.
+    tuple[str, str, dict]
+        A tuple containing the name of the recipe, the display name of the
+        recipe, and the task data as a dictionary.
     """
     styles = {
         "succeeded": "color: green",
         "failed": "color: red",
     }
     task_name_parts = task_name.split("_", 1)
     recipe_name = task_name_parts[1]
     processed_task_name = task_name_parts[0] + "_task"
-    # Restore directories to a "/"
-    recipe_name = task_name_parts[1].replace("--", "/")
+    # Display recipes in directories with a "/"
+    recipe_display_name = task_name_parts[1].replace("--", "/")
     style = styles.get(status, "color: black")
     task_data = (
         recipe_name,
+        recipe_display_name,
         {processed_task_name: {"status": status, "style": style}},
     )
     return task_data
@@ -110,6 +241,7 @@
     ```
     {
         "recipe_name": {
+            "recipe_display_name": "recipe_name",
             "process_task": {
                 "status": "succeeded",
                 "style": "color: green"
@@ -131,23 +263,162 @@
     Returns
     -------
     dict
-        A dictionary with recipe names as keys and tasks/task data as values.
+        A dictionary with recipe names as keys and tasks/task data and other
+        metadata as values.
     """
     processed_db_data = {}
     # A tuple is required for the `startswith` func.
     tasks_to_include_in_report = ("process", "compare")
     for task_name, status in report_data:
         if task_name.startswith(tasks_to_include_in_report):
-            recipe, task_data = process_db_task(task_name, status)
+            recipe, recipe_display_name, task_data = process_db_task(
+                task_name, status
+            )
             if not processed_db_data.get(recipe):
                 processed_db_data[recipe] = task_data
+                processed_db_data[recipe]["recipe_display_name"] = (
+                    recipe_display_name
+                )
             else:
                 processed_db_data[recipe].update(task_data)
     sorted_processed_db_data = dict(sorted(processed_db_data.items()))
     return sorted_processed_db_data
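+
+
+# A minimal sketch, not part of the PR: fabricated rows in the shape
+# returned by fetch_report_data, showing the grouping performed by
+# process_db_output. The recipe name is a hypothetical placeholder.
+def _example_process_db_output():
+    rows = [
+        ("process_examples--recipe_python", "succeeded"),
+        ("compare_examples--recipe_python", "failed"),
+    ]
+    # Returns {"examples--recipe_python": {"process_task": {...},
+    # "recipe_display_name": "examples/recipe_python",
+    # "compare_task": {...}}}.
+    return process_db_output(rows)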
 
 
-def create_subheader(cylc_task_cycle_point=CYLC_TASK_CYCLE_POINT):
+def copy_a_debug_log_to_vm(target_debug_log, recipe, task):
+    """
+    Copy a debug log file to a VM directory ``debug_logs/<recipe>/<task>``.
+
+    Parameters
+    ----------
+    target_debug_log : Path
+        The target log file, e.g. from the Cylc run directory.
+    recipe : str
+        The name of the recipe.
+    task : str
+        The name of the task, e.g. "process_task", "compare_task".
+
+    Returns
+    -------
+    Path | None
+        The path to the debug log on the VM, or None.
+    """
+    if not VM_DEBUG_LOG_DIR or PRODUCTION != "True":
+        return None
+    file_name = target_debug_log.name
+    vm_debug_log_dir_for_recipe_task = VM_DEBUG_LOG_DIR / recipe / task
+
+    vm_debug_log_dir_for_recipe_task.mkdir(parents=True, exist_ok=True)
+
+    # Use a list of arguments rather than a shell string so the paths are
+    # passed to rsync verbatim.
+    command = [
+        "rsync",
+        "-a",
+        str(target_debug_log),
+        str(vm_debug_log_dir_for_recipe_task),
+    ]
+    subprocess.run(command, check=False)
+    return vm_debug_log_dir_for_recipe_task / file_name
+
+
+def esmvaltool_debug_log_processor(recipe):
+    """
+    Copy the ESMValTool ``main_log_debug.txt`` to the VM, if it exists.
+
+    Parameters
+    ----------
+    recipe : str
+        The name of the recipe.
+
+    Returns
+    -------
+    dict
+        A dict containing the path to the debug log file on the VM, or an
+        empty dict.
+    """
+    if OUTPUT_DIR:
+        output_dir = Path(OUTPUT_DIR)
+        if output_dir.is_dir():
+            # ESMValTool only uses the last part of a recipe name when
+            # creating its directory. E.g. ``examples--recipe_python`` will
+            # be in ``recipe_python__