From 028127c0390ddea5fadc5746dc3267b0e1723ad8 Mon Sep 17 00:00:00 2001 From: Rob Taylor Date: Mon, 19 May 2025 19:26:03 +0100 Subject: [PATCH 1/4] Strengthen timeout handling in api call --- chipflow_lib/steps/silicon.py | 147 +++++++++++++++++++--------------- 1 file changed, 84 insertions(+), 63 deletions(-) diff --git a/chipflow_lib/steps/silicon.py b/chipflow_lib/steps/silicon.py index 357dcdbd..d1a545b9 100644 --- a/chipflow_lib/steps/silicon.py +++ b/chipflow_lib/steps/silicon.py @@ -21,7 +21,6 @@ logger = logging.getLogger(__name__) - class SiliconTop(StepBase, Elaboratable): def __init__(self, config={}): self._config = config @@ -191,68 +190,7 @@ def submit(self, rtlil_path, *, dry_run=False, wait=False): resp_data = resp.text # Handle response based on status code - if resp.status_code == 200: - logger.info(f"Submitted design: {resp_data}") - build_url = f"{chipflow_api_origin}/build/{resp_data['build_id']}" - build_status_url = f"{chipflow_api_origin}/build/{resp_data['build_id']}/status" - log_stream_url = f"{chipflow_api_origin}/build/{resp_data['build_id']}/logs?follow=true" - - print(f"Design submitted successfully! Build URL: {build_url}") - - # Poll the status API until the build is completed or failed - stream_event_counter = 0 - fail_counter = 0 - if wait: - while True: - logger.info("Polling build status...") - status_resp = requests.get( - build_status_url, - auth=(None, chipflow_api_key) - ) - if status_resp.status_code != 200: - fail_counter += 1 - logger.error(f"Failed to fetch build status {fail_counter} times: {status_resp.text}") - if fail_counter > 5: - logger.error(f"Failed to fetch build status {fail_counter} times. Exiting.") - raise ChipFlowError("Error while checking build status.") - - status_data = status_resp.json() - build_status = status_data.get("status") - logger.info(f"Build status: {build_status}") - - if build_status == "completed": - print("Build completed successfully!") - exit(0) - elif build_status == "failed": - print("Build failed.") - exit(1) - elif build_status == "running": - print("Build running.") - # Wait before polling again - # time.sleep(10) - # Attempt to stream logs rather than time.sleep - try: - if stream_event_counter > 1: - logger.warning("Log streaming may have been interrupted. Some logs may be missing.") - logger.warning(f"Check {build_url}") - stream_event_counter += 1 - with requests.get( - log_stream_url, - auth=(None, chipflow_api_key), - stream=True - ) as log_resp: - if log_resp.status_code == 200: - for line in log_resp.iter_lines(): - if line: - print(line.decode("utf-8")) # Print logs in real-time - sys.stdout.flush() - else: - logger.warning(f"Failed to stream logs: {log_resp.text}") - except requests.RequestException as e: - logger.error(f"Error while streaming logs: {e}") - pass - time.sleep(10) # Wait before polling again - else: + if resp.status_code != 200: # Log detailed information about the failed request logger.error(f"Request failed with status code {resp.status_code}") logger.error(f"Request URL: {resp.request.url}") @@ -268,3 +206,86 @@ def submit(self, rtlil_path, *, dry_run=False, wait=False): logger.error(f"Response body: {resp_data}") raise ChipFlowError(f"Failed to submit design: {resp_data}") + + logger.info(f"Submitted design: {resp_data}") + build_url = f"{chipflow_api_origin}/build/{resp_data['build_id']}" + build_status_url = f"{chipflow_api_origin}/build/{resp_data['build_id']}/status" + log_stream_url = f"{chipflow_api_origin}/build/{resp_data['build_id']}/logs?follow=true" + + print(f"Design submitted successfully! Build URL: {build_url}") + + # Poll the status API until the build is completed or failed + timeout = 10.0 + + def stream_until_fail_or_done(): + nonlocal timeout + fail_count = 0 + print_log_warning = False + while fail_count < (2*60//timeout): + try: + if fail_count > 1: + print_log_warning = True + with requests.get( + log_stream_url, + auth=(None, chipflow_api_key), + stream=True, timeout=timeout + ) as log_resp: + if log_resp.status_code == 200: + for line in log_resp.iter_lines(): + if line: + print(line.decode("utf-8")) # Print logs in real-time + sys.stdout.flush() + else: + logger.warning(f"Failed to stream logs: {log_resp.text}") + fail_count += 1 + except requests.Timeout: + fail_count +=1 + continue #go round again + except requests.RequestException as e: + if type(e) is requests.exceptions.ConnectionError and e.response is None: + fail_count +=1 + continue #try again + logger.error(f"Error while streaming logs: {type(e)}:{e} response={e.response}") + return "failed" + status_data = status_resp.json() + build_status = status_data.get("status") + if print_log_warning: + logger.warning("Log streaming may have been interrupted. Some logs may be missing.") + logger.warning(f"Check {build_url}") + + return build_status + + + if not wait: + exit(0) + + fail_count = 0 + while True: + logger.info("Polling build status...") + try: + status_resp = requests.get( + build_status_url, + auth=(None, chipflow_api_key), + timeout=timeout + ) + if status_resp.status_code != 200: + fail_count += 1 + logger.error(f"Failed to fetch build status {fail_count} times: {status_resp.text}") + if fail_count > 5: + logger.error(f"Failed to fetch build status {fail_count} times. Exiting.") + raise ChipFlowError("Error while checking build status.") + except requests.Timeout: + continue #go round again + + build_status = stream_until_fail_or_done() + if build_status == "completed": + print("Build completed successfully!") + exit(0) + elif build_status == "failed": + print("Build failed.") + exit(1) + elif build_status == "running": + print("Build running.") + # Wait before polling again + time.sleep(0.5) # Wait before polling again + From 9f331f514027eee3e3a199b515082b9af94671af Mon Sep 17 00:00:00 2001 From: Rob Taylor Date: Tue, 20 May 2025 22:51:02 +0100 Subject: [PATCH 2/4] Enable passing endpoint and origin to test-examples workflow --- .github/workflows/test-examples.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/test-examples.yml b/.github/workflows/test-examples.yml index 0c3232be..46925993 100644 --- a/.github/workflows/test-examples.yml +++ b/.github/workflows/test-examples.yml @@ -2,6 +2,15 @@ name: Reusable workflow to test everything in chipflow-examples works on: workflow_call: + inputs: + CHIPFLOW_BACKEND_VERSION: + required: false + type: string + default: 'branch-main' + CHIPFLOW_API_ORIGIN: + required: false + type: string + default: 'https://build.chipflow.org' secrets: CHIPFLOW_API_KEY: required: true @@ -66,3 +75,6 @@ jobs: pdm run chipflow silicon submit --wait $DRY env: CHIPFLOW_API_KEY: ${{ secrets.CHIPFLOW_API_KEY}} + CHIPFLOW_API_ORIGIN: ${{ inputs.CHIPFLOW_API_ORIGIN }} + CHIPFLOW_BACKEND_VERSION: ${{ inputs.CHIPFLOW_BACKEND_VERSION }} + From 3ce5046b0adf196dd2d612b52074cd617192b917 Mon Sep 17 00:00:00 2001 From: Rob Taylor Date: Tue, 20 May 2025 22:51:10 +0100 Subject: [PATCH 3/4] remove me --- .github/workflows/main.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 9f086125..2af6c295 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -45,5 +45,8 @@ jobs: test-submit: uses: ./.github/workflows/test-examples.yml + with: + CHIPFLOW_BACKEND_VERSION: 'branch-opentracing-batch' + CHIPFLOW_API_ORIGIN: "https://build-staging.chipflow.org" secrets: - CHIPFLOW_API_KEY: ${{ secrets.CHIPFLOW_API_KEY}} + CHIPFLOW_API_KEY: ${{ secrets.CHIPFLOW_API_KEY_STAGING }} From c72246bf5415bc660783f3d134993db5d37095f7 Mon Sep 17 00:00:00 2001 From: Rob Taylor Date: Tue, 20 May 2025 23:40:29 +0100 Subject: [PATCH 4/4] wip --- chipflow_lib/steps/silicon.py | 84 +++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 28 deletions(-) diff --git a/chipflow_lib/steps/silicon.py b/chipflow_lib/steps/silicon.py index d1a545b9..09bc8d35 100644 --- a/chipflow_lib/steps/silicon.py +++ b/chipflow_lib/steps/silicon.py @@ -216,10 +216,27 @@ def submit(self, rtlil_path, *, dry_run=False, wait=False): # Poll the status API until the build is completed or failed timeout = 10.0 + fail_count = 0 + + def poll_build_status(): + nonlocal timeout + nonlocal fail_count + logger.info("Polling build status...") + with requests.get( + build_status_url, + auth=(None, chipflow_api_key), + timeout=timeout + ) as status_resp: + if status_resp.status_code != 200: + fail_count += 1 + logger.error(f"Failed to fetch build status {fail_count} times: {status_resp.text}") + status_data = status_resp.json() + return status_data.get("status") + def stream_until_fail_or_done(): nonlocal timeout - fail_count = 0 + nonlocal fail_count print_log_warning = False while fail_count < (2*60//timeout): try: @@ -244,48 +261,59 @@ def stream_until_fail_or_done(): except requests.RequestException as e: if type(e) is requests.exceptions.ConnectionError and e.response is None: fail_count +=1 + logger.warn(f"Issue while streaming logs: {type(e)}:{e} response={e.response}. Trying again.") continue #try again logger.error(f"Error while streaming logs: {type(e)}:{e} response={e.response}") - return "failed" - status_data = status_resp.json() - build_status = status_data.get("status") + return False if print_log_warning: logger.warning("Log streaming may have been interrupted. Some logs may be missing.") logger.warning(f"Check {build_url}") - return build_status + return True if not wait: exit(0) fail_count = 0 + status = "waiting" while True: logger.info("Polling build status...") try: - status_resp = requests.get( - build_status_url, - auth=(None, chipflow_api_key), - timeout=timeout - ) - if status_resp.status_code != 200: - fail_count += 1 - logger.error(f"Failed to fetch build status {fail_count} times: {status_resp.text}") - if fail_count > 5: - logger.error(f"Failed to fetch build status {fail_count} times. Exiting.") - raise ChipFlowError("Error while checking build status.") + status = poll_build_status() except requests.Timeout: - continue #go round again - - build_status = stream_until_fail_or_done() - if build_status == "completed": - print("Build completed successfully!") - exit(0) - elif build_status == "failed": - print("Build failed.") + continue #go round again + except requests.RequestException as e: + if type(e) is requests.exceptions.ConnectionError and e.response is None: + fail_count +=1 + logger.warn(f"Issue while polling build: {type(e)}:{e} response={e.response}. Trying again.") + continue #try again + logger.error(f"Network error while polling build: {type(e)}:{e} response={e.response}") exit(1) - elif build_status == "running": - print("Build running.") - # Wait before polling again - time.sleep(0.5) # Wait before polling again + except Exception as e: + logger.error(f"Unexpected error while polling build: {type(e)}:{e}") + + match status: + case "completed": + print("Build completed successfully!") + exit(0) + case "failed": + print("Build failed.") + exit(1) + case "unknown": + continue # poll again + case "running": + print("Build running.") + + if not stream_until_fail_or_done(): + fail_count += 1 + logger.warn("Issue while streaming logs. Trying again.") + + if fail_count > 5: + logger.error(f"Failed to fetch build status {fail_count} times. Exiting.") + raise ChipFlowError("Error while checking build status.") + return "unknown" + + # Wait before polling again + time.sleep(0.5) # Wait before polling again