From c2e8563780d989628e5971184a2316bbf65a3414 Mon Sep 17 00:00:00 2001 From: Jack Urbanek Date: Mon, 26 Jun 2023 14:53:10 -0400 Subject: [PATCH 1/2] Adding more permissivity to initialization socket failure cases --- .../architects/channels/websocket_channel.py | 15 +++++++++++++++ .../ec2/run_scripts/node/init_server.sh | 3 ++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/mephisto/abstractions/architects/channels/websocket_channel.py b/mephisto/abstractions/architects/channels/websocket_channel.py index ecf344422..7f5d6f032 100644 --- a/mephisto/abstractions/architects/channels/websocket_channel.py +++ b/mephisto/abstractions/architects/channels/websocket_channel.py @@ -23,6 +23,8 @@ logger = get_logger(name=__name__) +MAX_RETRIES = 3 + class WebsocketChannel(Channel): """ @@ -55,6 +57,7 @@ def __init__( self._is_alive = False self._is_closed = False self._socket_task: Optional[asyncio.Task] = None + self._retries = MAX_RETRIES def is_closed(self): """ @@ -146,6 +149,18 @@ async def run_socket(): pass else: await on_error(e) + except websockets.exceptions.InvalidStatusCode as e: + if self._retries == 0: + raise ConnectionRefusedError( + "Could not connect after retries" + ) from e + curr_retry = MAX_RETRIES - self._retries + logger.exception( + f"Status code error {repr(e)}, attempting retry {curr_retry}", + exc_info=True, + ) + await asyncio.sleep(1 + curr_retry) + self._retries += 1 except Exception as e: logger.exception( f"Socket error {repr(e)}, attempting restart", diff --git a/mephisto/abstractions/architects/ec2/run_scripts/node/init_server.sh b/mephisto/abstractions/architects/ec2/run_scripts/node/init_server.sh index c1c75f220..ba6c7bc03 100644 --- a/mephisto/abstractions/architects/ec2/run_scripts/node/init_server.sh +++ b/mephisto/abstractions/architects/ec2/run_scripts/node/init_server.sh @@ -1,7 +1,8 @@ #!/bin/bash echo "Installing basic requirements..." -sudo yum update -y >> /home/ec2-user/routing_server/setup/setup_log.txt 2>&1 +# Following is commented out until the aws linux2 repo is no longer lagging +# sudo yum update -y >> /home/ec2-user/routing_server/setup/setup_log.txt 2>&1 sudo yum install -y httpd >> /home/ec2-user/routing_server/setup/setup_log.txt 2>&1 echo "Downloading Node..." From a4c504c82b35887663330cca2eb2b121226544ad Mon Sep 17 00:00:00 2001 From: Jack Urbanek Date: Mon, 26 Jun 2023 14:55:16 -0400 Subject: [PATCH 2/2] Correct branch of exception --- .../architects/channels/websocket_channel.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mephisto/abstractions/architects/channels/websocket_channel.py b/mephisto/abstractions/architects/channels/websocket_channel.py index 7f5d6f032..0eac5ffbc 100644 --- a/mephisto/abstractions/architects/channels/websocket_channel.py +++ b/mephisto/abstractions/architects/channels/websocket_channel.py @@ -149,18 +149,6 @@ async def run_socket(): pass else: await on_error(e) - except websockets.exceptions.InvalidStatusCode as e: - if self._retries == 0: - raise ConnectionRefusedError( - "Could not connect after retries" - ) from e - curr_retry = MAX_RETRIES - self._retries - logger.exception( - f"Status code error {repr(e)}, attempting retry {curr_retry}", - exc_info=True, - ) - await asyncio.sleep(1 + curr_retry) - self._retries += 1 except Exception as e: logger.exception( f"Socket error {repr(e)}, attempting restart", @@ -176,6 +164,18 @@ async def run_socket(): f"Unhandled OSError exception in socket {e}, attempting restart" ) await asyncio.sleep(0.2) + except websockets.exceptions.InvalidStatusCode as e: + if self._retries == 0: + raise ConnectionRefusedError( + "Could not connect after retries" + ) from e + curr_retry = MAX_RETRIES - self._retries + logger.exception( + f"Status code error {repr(e)}, attempting retry {curr_retry}", + exc_info=True, + ) + await asyncio.sleep(1 + curr_retry) + self._retries += 1 except Exception as e: logger.exception(f"Unhandled exception in socket {e}, {repr(e)}") if self._is_closed: