diff --git a/neurons/miner.py b/neurons/miner.py
index 5f7b9500..0f119585 100644
--- a/neurons/miner.py
+++ b/neurons/miner.py
@@ -164,6 +164,6 @@ async def priority(self, synapse: template.protocol.Dummy) -> float:
 # This is the main function, which runs the miner.
 if __name__ == "__main__":
     with Miner() as miner:
-        while True:
+        while miner.is_running:
             bt.logging.info(f"Miner running... {time.time()}")
-            time.sleep(5)
+            time.sleep(1)
diff --git a/template/base/miner.py b/template/base/miner.py
index 1788e24b..e58d6f8a 100644
--- a/template/base/miner.py
+++ b/template/base/miner.py
@@ -69,8 +69,30 @@ def __init__(self, config=None):
         self.is_running: bool = False
         self.thread: Union[threading.Thread, None] = None
         self.lock = asyncio.Lock()
+        self.thread_lock = threading.Lock()
+        self.exception = None
+
+    def get_exception(self):
+        with self.thread_lock:
+            return self.exception
+
+    def set_exception(self, ex):
+        with self.thread_lock:
+            self.exception = ex
+            self.is_running = False
 
     def run(self):
+        """
+        Entrypoint of worker thread. Provides try/except in order to prevent uncaught exceptions.
+        """
+        try:
+            self.run_unsafe()
+        except Exception as e:
+            bt.logging.error("Exception caught in worker thread")
+            bt.logging.error(traceback.format_exc())
+            self.set_exception(e)
+
+    def run_unsafe(self):
         """
         Initiates and manages the main loop for the miner on the Bittensor network. The main loop handles graceful shutdown on keyboard interrupts and logs unforeseen errors.
 
@@ -105,6 +127,14 @@ def run(self):
 
         # Start starts the miner's axon, making it active on the network.
         self.axon.start()
+        t0 = time.time()
+        while time.time() - t0 < 1 and not self.axon.is_running():
+            time.sleep(0.1)
+        if not self.axon.is_running():
+            e = self.axon.exception
+            if e:
+                raise e
+            raise Exception("Failed to start axon for unknown reason")
 
         bt.logging.info(f"Miner starting at block: {self.block}")
 
@@ -115,6 +145,16 @@ def run(self):
                     self.block - self.metagraph.last_update[self.uid]
                     < self.config.neuron.epoch_length
                 ):
+                    if not self.axon.is_running():
+                        # we may be faster than the exception is being set
+                        ts = time.time()
+                        while time.time() - ts < 3 and not self.axon.exception:
+                            time.sleep(0.1)
+                        e = self.axon.exception
+                        if e:
+                            raise e
+                        else:
+                            raise Exception("axon died without exception")
                     # Wait before checking again.
                     time.sleep(1)
 