From 65703c5b8184bcdf345d4db541c6ac46b6a49c71 Mon Sep 17 00:00:00 2001
From: alexliang
Date: Tue, 1 Nov 2022 02:24:16 +0800
Subject: [PATCH 1/3] update fedml docker build files and add diagnosis cli.

---
 devops/scripts/build-fedml-docker.sh          | 148 +++++++++---------
 devops/scripts/push-fedml-docker.sh           |   4 +-
 doc/en/starter/install/jetson.md              |   4 +-
 doc/en/starter/install/rpi.md                 |   4 +-
 docker/arm64v8/Dockerfile                     |   6 +-
 docker/build-docker.sh                        |  15 +-
 docker/x86-64/Dockerfile                      |   6 +-
 python/fedml/cli/cli.py                       |  42 +++++
 .../cli/edge_deployment/client_diagnosis.py   |  88 +++++++++++
 .../communication/s3/remote_storage.py        |  29 ++++
 10 files changed, 257 insertions(+), 89 deletions(-)
 create mode 100644 python/fedml/cli/edge_deployment/client_diagnosis.py

diff --git a/devops/scripts/build-fedml-docker.sh b/devops/scripts/build-fedml-docker.sh
index 3096a0fe31..46ea9ae738 100755
--- a/devops/scripts/build-fedml-docker.sh
+++ b/devops/scripts/build-fedml-docker.sh
@@ -6,110 +6,112 @@ pwd=`pwd`
 export FEDML_VERSION=`cat python/setup.py |grep version= |awk -F'=' '{print $2}' |awk -F',' '{print $1}'|awk -F'"' '{print $2}'`
 
 # Build X86_64 docker
-ARCH=x86_64
-OS=ubuntu18.04
-DISTRO=ubuntu1804
-PYTHON_VERSION=3.7
-PYTORCH_VERSION=1.12.1
-NCCL_VERSION=2.9.9
-CUDA_VERSION=11.3
-OUTPUT_IMAGE=fedml/fedml:latest-torch1.12.1-cuda11.3-cudnn8-devel
-NVIDIA_BASE_IMAGE=nvidia/cuda:11.3.1-cudnn8-devel-ubuntu18.04
-PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113
-PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html
-CURRENT_IMAGE=fedml/fedml:${FEDML_VERSION}-torch1.12.1-cuda11.3-cudnn8-devel
+ARCH="x86_64"
+OS="ubuntu18.04"
+DISTRO="ubuntu1804"
+PYTHON_VERSION="3.7"
+PYTORCH_VERSION="1.12.1"
+NCCL_VERSION="2.9.9"
+CUDA_VERSION="11.3"
+LIB_NCCL="2.9.9-1+cuda11.3"
+OUTPUT_IMAGE="fedml/fedml:latest-torch1.12.1-cuda11.3-cudnn8-devel"
+NVIDIA_BASE_IMAGE="nvidia/cuda:11.3.1-cudnn8-devel-ubuntu18.04"
+PYTORCH_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu113"
+PYTORCH_GEOMETRIC_URL="https://data.pyg.org/whl/torch-1.12.0+cu113.html"
+CURRENT_IMAGE="fedml/fedml:${FEDML_VERSION}-torch1.12.1-cuda11.3-cudnn8-devel"
 
 cd ./docker
 bash build-docker.sh $ARCH $OS $DISTRO $PYTHON_VERSION $PYTORCH_VERSION $NCCL_VERSION $CUDA_VERSION \
-     $OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL
+     $OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL $LIB_NCCL
 
 docker tag $OUTPUT_IMAGE $CURRENT_IMAGE
 
 cd $pwd
 
 # Build ARM_64 docker
-ARCH=arm64
-OS=ubuntu20.04
-DISTRO=ubuntu2004
-PYTHON_VERSION=3.8
-PYTORCH_VERSION=1.12.1
-NCCL_VERSION=2.9.6
-CUDA_VERSION=11.3
-OUTPUT_IMAGE=fedml/fedml:latest-torch1.12.1-cuda11.3-cudnn8-devel-arm64
-NVIDIA_BASE_IMAGE=nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af
-PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113
-PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html
-CURRENT_IMAGE=fedml/fedml:${FEDML_VERSION}-torch1.12.1-cuda11.3-cudnn8-devel-arm64
+ARCH="arm64"
+OS="ubuntu20.04"
+DISTRO="ubuntu2004"
+PYTHON_VERSION="3.8"
+PYTORCH_VERSION="1.12.1"
+NCCL_VERSION="2.9.6"
+CUDA_VERSION="11.3"
+LIB_NCCL="2.9.6-1+cuda11.3"
+OUTPUT_IMAGE="fedml/fedml:latest-torch1.12.1-cuda11.3-cudnn8-devel-arm64"
+NVIDIA_BASE_IMAGE="nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af"
+PYTORCH_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu113"
+PYTORCH_GEOMETRIC_URL="https://data.pyg.org/whl/torch-1.12.0+cu113.html"
+CURRENT_IMAGE="fedml/fedml:${FEDML_VERSION}-torch1.12.1-cuda11.3-cudnn8-devel-arm64"
+cd ./docker
 
 bash build-docker.sh $ARCH $OS $DISTRO $PYTHON_VERSION $PYTORCH_VERSION $NCCL_VERSION $CUDA_VERSION \
-     $OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL
+     $OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL $LIB_NCCL
 
-cd ./docker
 docker tag $OUTPUT_IMAGE $CURRENT_IMAGE
 
 cd $pwd
 
 # Build nvidia_jetson docker
-ARCH=jetson
-OS=ubuntu20.04
-DISTRO=ubuntu2004
-PYTHON_VERSION=3.7
-PYTORCH_VERSION=1.12.1
-NCCL_VERSION=2.9.6
-CUDA_VERSION=11.3
-OUTPUT_IMAGE=fedml/fedml:latest-nvidia-jetson-l4t-ml-r32.6.1-py3
-NVIDIA_BASE_IMAGE=nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af
-PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113
-PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html
-CURRENT_IMAGE=fedml/fedml:${FEDML_VERSION}-nvidia-jetson-l4t-ml-r32.6.1-py3
+ARCH="jetson"
+OS="ubuntu20.04"
+DISTRO="ubuntu2004"
+PYTHON_VERSION="3.7"
+PYTORCH_VERSION="1.12.1"
+NCCL_VERSION="2.9.6"
+CUDA_VERSION="11.3"
+OUTPUT_IMAGE="fedml/fedml:latest-nvidia-jetson-l4t-ml-r32.6.1-py3"
+NVIDIA_BASE_IMAGE="nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af"
+PYTORCH_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu113"
+PYTORCH_GEOMETRIC_URL="https://data.pyg.org/whl/torch-1.12.0+cu113.html"
+CURRENT_IMAGE="fedml/fedml:${FEDML_VERSION}-nvidia-jetson-l4t-ml-r32.6.1-py3"
 
 cd ./docker
 bash build-docker.sh $ARCH $OS $DISTRO $PYTHON_VERSION $PYTORCH_VERSION $NCCL_VERSION $CUDA_VERSION \
-     $OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL
+     $OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL $LIB_NCCL
 
 docker tag $OUTPUT_IMAGE $CURRENT_IMAGE
 
 cd $pwd
 
 # Build rpi32 docker
-ARCH=rpi32
-OS=ubuntu20.04
-DISTRO=ubuntu2004
-PYTHON_VERSION=3.7
-PYTORCH_VERSION=1.12.1
-NCCL_VERSION=2.9.6
-CUDA_VERSION=11.3
-OUTPUT_IMAGE=fedml/fedml:latest-raspberrypi4-32-py37
-NVIDIA_BASE_IMAGE=nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af
-PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113
-PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html
-CURRENT_IMAGE=fedml/fedml:${FEDML_VERSION}-raspberrypi4-32-py37
-
-cd ./docker
-bash build-docker.sh $ARCH $OS $DISTRO $PYTHON_VERSION $PYTORCH_VERSION $NCCL_VERSION $CUDA_VERSION \
-     $OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL
-
-docker tag $OUTPUT_IMAGE $CURRENT_IMAGE
-
-cd $pwd
+#ARCH="rpi32"
+#OS="ubuntu20.04"
+#DISTRO="ubuntu2004"
+#PYTHON_VERSION="3.7"
+#PYTORCH_VERSION="1.12.1"
+#NCCL_VERSION="2.9.6"
+#CUDA_VERSION="11.3"
+#OUTPUT_IMAGE="fedml/fedml:latest-raspberrypi4-32-py37"
+#NVIDIA_BASE_IMAGE="nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af"
+#PYTORCH_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu113"
+#PYTORCH_GEOMETRIC_URL="https://data.pyg.org/whl/torch-1.12.0+cu113.html"
+#CURRENT_IMAGE="fedml/fedml:${FEDML_VERSION}-raspberrypi4-32-py37"
+#
+#cd ./docker
+#bash build-docker.sh $ARCH $OS $DISTRO $PYTHON_VERSION $PYTORCH_VERSION $NCCL_VERSION $CUDA_VERSION \
+#     $OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL $LIB_NCCL
+#
+#docker tag $OUTPUT_IMAGE $CURRENT_IMAGE
+#
+#cd $pwd
 
 # Build rpi64 docker
-ARCH=rpi64
-OS=ubuntu20.04
-DISTRO=ubuntu2004
-PYTHON_VERSION=3.7
-PYTORCH_VERSION=1.12.1
-NCCL_VERSION=2.9.6
-CUDA_VERSION=11.3
-OUTPUT_IMAGE=fedml/fedml:latest-raspberrypi4-64-py37
-NVIDIA_BASE_IMAGE=nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af
-PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113
-PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html
-CURRENT_IMAGE=fedml/fedml:${FEDML_VERSION}-raspberrypi4-64-py37
+ARCH="rpi64"
+OS="ubuntu20.04"
+DISTRO="ubuntu2004"
+PYTHON_VERSION="3.7"
+PYTORCH_VERSION="1.12.1"
+NCCL_VERSION="2.9.6"
+CUDA_VERSION="11.3"
+OUTPUT_IMAGE="fedml/fedml:latest-raspberrypi4-64-py37"
+NVIDIA_BASE_IMAGE="nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af"
+PYTORCH_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu113"
+PYTORCH_GEOMETRIC_URL="https://data.pyg.org/whl/torch-1.12.0+cu113.html"
+CURRENT_IMAGE="fedml/fedml:${FEDML_VERSION}-raspberrypi4-64-py37"
 
 cd ./docker
 bash build-docker.sh $ARCH $OS $DISTRO $PYTHON_VERSION $PYTORCH_VERSION $NCCL_VERSION $CUDA_VERSION \
-     $OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL
+     $OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL $LIB_NCCL
 
 docker tag $OUTPUT_IMAGE $CURRENT_IMAGE
diff --git a/devops/scripts/push-fedml-docker.sh b/devops/scripts/push-fedml-docker.sh
index fb1cdb2550..32442da0a5 100755
--- a/devops/scripts/push-fedml-docker.sh
+++ b/devops/scripts/push-fedml-docker.sh
@@ -13,8 +13,8 @@ if [[ $push_arm_arch_images != "" ]]; then
   docker push fedml/fedml:latest-nvidia-jetson-l4t-ml-r32.6.1-py3
   docker push fedml/fedml:${FEDML_VERSION}-nvidia-jetson-l4t-ml-r32.6.1-py3
 
-  docker push fedml/fedml:latest-raspberrypi4-32-py37
-  docker push fedml/fedml:${FEDML_VERSION}-raspberrypi4-32-py37
+#  docker push fedml/fedml:latest-raspberrypi4-32-py37
+#  docker push fedml/fedml:${FEDML_VERSION}-raspberrypi4-32-py37
 
   docker push fedml/fedml:latest-raspberrypi4-64-py37
   docker push fedml/fedml:${FEDML_VERSION}-raspberrypi4-64-py37
diff --git a/doc/en/starter/install/jetson.md b/doc/en/starter/install/jetson.md
index a5efb9afd4..c6faba4f04 100644
--- a/doc/en/starter/install/jetson.md
+++ b/doc/en/starter/install/jetson.md
@@ -3,12 +3,12 @@
 ## Run FedML with Docker (Recommended)
 - Pull FedML RPI docker image
 ```
-docker pull fedml/fedml:nvidia-jetson-l4t-ml-r32.6.1-py3
+docker pull fedml/fedml:latest-nvidia-jetson-l4t-ml-r32.6.1-py3
 ```
 
 - Run Docker with "fedml login"
 ```
-docker run -t -i --runtime nvidia fedml/fedml:nvidia-jetson-l4t-ml-r32.6.1-py3 /bin/bash
+docker run -t -i --runtime nvidia fedml/fedml:latest-nvidia-jetson-l4t-ml-r32.6.1-py3 /bin/bash
 
 root@8bc0de2ce0e0:/usr/src/app# fedml login 299
diff --git a/doc/en/starter/install/rpi.md b/doc/en/starter/install/rpi.md
index e569ee6e77..a69fb2a9aa 100644
--- a/doc/en/starter/install/rpi.md
+++ b/doc/en/starter/install/rpi.md
@@ -3,12 +3,12 @@
 ## Run FedML with Docker (Recommended)
 - Pull FedML RPI docker image
 ```
-docker pull fedml/fedml:raspberrypi4-64-py37
+docker pull fedml/fedml:latest-raspberrypi4-64-py37
 ```
 
 - Run Docker with "fedml login"
 ```
-docker run -t -i fedml/fedml:raspberrypi4-64-py37 /bin/bash
+docker run -t -i fedml/fedml:latest-raspberrypi4-64-py37 /bin/bash
 
 root@8bc0de2ce0e0:/usr/src/app# fedml login 299
diff --git a/docker/arm64v8/Dockerfile b/docker/arm64v8/Dockerfile
index 25e598d4fa..cda2b483e2 100644
--- a/docker/arm64v8/Dockerfile
+++ b/docker/arm64v8/Dockerfile
@@ -30,6 +30,8 @@ ARG PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113
 
 ARG PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html
 
+ARG LIB_NCCL=2.9.6-1+cuda11.3
+
 RUN echo ${NCCL_VERSION}
 RUN echo ${CUDA_VERSION}
 
@@ -125,9 +127,7 @@ RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/
 add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/${DISTRO}/${ARCH} /" && \
 apt update && \
 #export NCCL_VERSION_ENV=`echo $NCCL_VERSION | awk -F'-1' '{print $1}'` && \
-export NCCL_VERSION_ENV=$NCCL_VERSION-1 && \
-export CUDA_VERSION_ENV=`echo $CUDA_VERSION | sed 's/\.1//g'` && \
-apt install -y --allow-change-held-packages libnccl2=${NCCL_VERSION_ENV}+cuda${CUDA_VERSION_ENV} libnccl-dev=${NCCL_VERSION_ENV}+cuda${CUDA_VERSION_ENV}
+apt install -y --allow-change-held-packages libnccl2=${LIB_NCCL} libnccl-dev=${LIB_NCCL}
 
 # ***************************************************************************
 # PyTorch (install from source)
diff --git a/docker/build-docker.sh b/docker/build-docker.sh
index 165c4f38c2..0130ccf0e9 100644
--- a/docker/build-docker.sh
+++ b/docker/build-docker.sh
@@ -9,22 +9,27 @@ NCCL_VERSION=$6
 CUDA_VERSION=$7
 OUTPUT_IMAGE=$8
 NVIDIA_BASE_IMAGE=""
-if [ $# -gt 9 ]; then
+if [ $# -ge 9 ]; then
   NVIDIA_BASE_IMAGE=$9
 fi
 
-if [ $# -gt 10 ]; then
+if [ $# -ge 10 ]; then
   PYTORCH_EXTRA_INDEX_URL=${10}
 else
   PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113
 fi
 
-if [ $# -gt 11 ]; then
+if [ $# -ge 11 ]; then
   PYTORCH_GEOMETRIC_URL=${11}
 else
   PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html
 fi
 
+if [ $# -ge 12 ]; then
+  LIB_NCCL=${12}
+else
+  LIB_NCCL="null"
+fi
+
 DOCKER_FILE_PATH=""
 if [[ "$ARCH" == "x86_64" ]]; then
@@ -36,7 +41,7 @@ elif [[ "$ARCH" == "jetson" ]]; then
 elif [[ "$ARCH" == "rpi32" ]]; then
   DOCKER_FILE_PATH=./rpi/Dockerfile_32bit_armv7
 elif [[ "$ARCH" == "rpi64" ]]; then
-  DOCKER_FILE_PATH=./rpi/Dockerfile_32bit_armv8
+  DOCKER_FILE_PATH=./rpi/Dockerfile_64bit_armv8
 fi
 
 if [ $DOCKER_FILE_PATH == "" ]; then
@@ -55,6 +60,7 @@ if [[ $NVIDIA_BASE_IMAGE != "" ]]; then
     --build-arg NVIDIA_BASE_IMAGE=$NVIDIA_BASE_IMAGE \
     --build-arg PYTORCH_EXTRA_INDEX_URL=$PYTORCH_EXTRA_INDEX_URL \
     --build-arg PYTORCH_GEOMETRIC_URL=$PYTORCH_GEOMETRIC_URL \
+    --build-arg LIB_NCCL=$LIB_NCCL \
     --network=host \
     -t $OUTPUT_IMAGE .
 else
@@ -67,6 +73,7 @@ else
     --build-arg CUDA_VERSION=$CUDA_VERSION \
     --build-arg PYTORCH_EXTRA_INDEX_URL=$PYTORCH_EXTRA_INDEX_URL \
     --build-arg PYTORCH_GEOMETRIC_URL=$PYTORCH_GEOMETRIC_URL \
+    --build-arg LIB_NCCL=$LIB_NCCL \
     --network=host \
     -t $OUTPUT_IMAGE .
 fi
diff --git a/docker/x86-64/Dockerfile b/docker/x86-64/Dockerfile
index aa4f6ecdfc..dc446fa414 100644
--- a/docker/x86-64/Dockerfile
+++ b/docker/x86-64/Dockerfile
@@ -29,6 +29,8 @@ ARG PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113
 
 ARG PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html
 
+ARG LIB_NCCL="2.9.9-1+cuda11.3"
+
 RUN echo ${NCCL_VERSION}
 RUN echo ${CUDA_VERSION}
 
@@ -124,9 +126,7 @@ RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/
 add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/${DISTRO}/${ARCH} /" && \
 apt update && \
 #export NCCL_VERSION_ENV=`echo $NCCL_VERSION | awk -F'-1' '{print $1}'` && \
-export NCCL_VERSION_ENV=$NCCL_VERSION && \
-export CUDA_VERSION_ENV=`echo $CUDA_VERSION | sed 's/\.1//g'` && \
-apt install -y --allow-change-held-packages libnccl2=${NCCL_VERSION_ENV}+cuda${CUDA_VERSION_ENV} libnccl-dev=${NCCL_VERSION_ENV}+cuda${CUDA_VERSION_ENV}
+apt install -y --allow-change-held-packages libnccl2=${LIB_NCCL} libnccl-dev=${LIB_NCCL}
 
 # ***************************************************************************
 # PyTorch (install from source)
diff --git a/python/fedml/cli/cli.py b/python/fedml/cli/cli.py
index 1b4b315302..f2e8ebb99b 100755
--- a/python/fedml/cli/cli.py
+++ b/python/fedml/cli/cli.py
@@ -18,6 +18,7 @@ from ..cli.server_deployment.docker_login import login_with_server_docker_mode
 from ..cli.server_deployment.docker_login import logout_with_server_docker_mode
 from ..cli.server_deployment.docker_login import logs_with_server_docker_mode
+from ..cli.edge_deployment.client_diagnosis import ClientDiagnosis
 from ..cli.comm_utils import sys_utils
 
@@ -552,6 +553,47 @@ def build_mlops_package(
     return 0
 
 
+@cli.command("diagnosis", help="Diagnosis for open.fedml.ai, AWS S3 service and MQTT service")
+@click.option(
+    "--open", "-o", default=None, is_flag=True, help="check the connection to open.fedml.ai.",
+)
+@click.option(
+    "--s3", "-s", default=None, is_flag=True, help="check the connection to AWS S3 server.",
+)
+@click.option(
+    "--mqtt", "-m", default=None, is_flag=True, help="check the connection to mqtt.fedml.ai (1883).",
+)
+def mlops_diagnosis(open, s3, mqtt):
+    check_open = open
+    check_s3 = s3
+    check_mqtt = mqtt
+    if open is None and s3 is None and mqtt is None:
+        check_open = True
+        check_s3 = True
+        check_mqtt = True
+
+    if check_open:
+        is_open_connected = ClientDiagnosis.check_open_connection()
+        if is_open_connected:
+            click.echo("The connection to https://open.fedml.ai is OK.")
+        else:
+            click.echo("You cannot connect to https://open.fedml.ai.")
+
+    if check_s3:
+        is_s3_connected = ClientDiagnosis.check_s3_connection()
+        if is_s3_connected:
+            click.echo("The connection to AWS S3 is OK.")
+        else:
+            click.echo("You cannot connect to AWS S3.")
+
+    if check_mqtt:
+        is_mqtt_connected = ClientDiagnosis.check_mqtt_connection()
+        if is_mqtt_connected:
+            click.echo("The connection to mqtt.fedml.ai (port:1883) is OK.")
+        else:
+            click.echo("You cannot connect to mqtt.fedml.ai (port:1883).")
+
+
 @cli.command(
     "env",
     help="collect the environment information to help debugging, including OS, Hardware Architecture, "
diff --git a/python/fedml/cli/edge_deployment/client_diagnosis.py b/python/fedml/cli/edge_deployment/client_diagnosis.py
new file mode 100644
index 0000000000..f7f76f0370
--- /dev/null
+++ b/python/fedml/cli/edge_deployment/client_diagnosis.py
@@ -0,0 +1,88 @@
+import time
+
+from ...core.mlops.mlops_configs import MLOpsConfigs
+from ...core.distributed.communication.s3.remote_storage import S3Storage
+from ...core.distributed.communication.mqtt.mqtt_manager import MqttManager
+
+
+class Singleton(object):
+    def __new__(cls):
+        if not hasattr(cls, "_instance"):
+            orig = super(Singleton, cls)
+            cls._instance = orig.__new__(cls)
+        return cls._instance
+
+
+class ClientDiagnosis(Singleton):
+    def __init__(self):
+        self.is_mqtt_connected = False
+
+    @staticmethod
+    def check_open_connection():
+        args = {"config_version": "release"}
+        try:
+            mqtt_config, s3_config = MLOpsConfigs.get_instance(args).fetch_configs()
+        except Exception as e:
+            return False
+
+        return True
+
+    @staticmethod
+    def check_s3_connection():
+        args = {"config_version": "release"}
+        try:
+            mqtt_config, s3_config = MLOpsConfigs.get_instance(args).fetch_configs()
+            s3_storage = S3Storage(s3_config)
+            download_ret = s3_storage.test_s3_base_cmds("d31df596c32943c64015a7e2d6e0d5a4", "test-base-cmds")
+            if download_ret:
+                return True
+        except Exception as e:
+            return False
+
+        return False
+
+    @staticmethod
+    def check_mqtt_connection():
+        args = {"config_version": "release"}
+        try:
+            mqtt_config, s3_config = MLOpsConfigs.get_instance(args).fetch_configs()
+            mqtt_mgr = MqttManager(
+                mqtt_config["BROKER_HOST"],
+                mqtt_config["BROKER_PORT"],
+                mqtt_config["MQTT_USER"],
+                mqtt_config["MQTT_PWD"],
+                mqtt_config["MQTT_KEEPALIVE"],
+                "fedml-diagnosis-id"
+            )
+            diagnosis = ClientDiagnosis()
+            diagnosis.is_mqtt_connected = False
+            mqtt_mgr.add_connected_listener(diagnosis.on_mqtt_connected)
+            mqtt_mgr.add_disconnected_listener(diagnosis.on_mqtt_disconnected)
+            mqtt_mgr.connect()
+            mqtt_mgr.loop_start()
+
+            count = 0
+            while not diagnosis.is_mqtt_connected:
+                count += 1
+                if count >= 15:
+                    return False
+                time.sleep(1)
+
+            return True
+        except Exception as e:
+            print("MQTT connect exception: {}".format(str(e)))
+            return False
+
+        return False
+
+    def on_mqtt_connected(self, mqtt_client_object):
+        self.is_mqtt_connected = True
+        pass
+
+    def on_mqtt_disconnected(self, mqtt_client_object):
+        self.is_mqtt_connected = False
+
+
+if __name__ == "__main__":
+    pass
+
diff --git a/python/fedml/core/distributed/communication/s3/remote_storage.py b/python/fedml/core/distributed/communication/s3/remote_storage.py
index 67920a46e0..96b94f3d01 100644
--- a/python/fedml/core/distributed/communication/s3/remote_storage.py
+++ b/python/fedml/core/distributed/communication/s3/remote_storage.py
@@ -155,6 +155,35 @@ def download_file(self, path_s3, path_local):
         if retry >= 3:
             logging.error(f"Download zip failed after max retry.")
 
+    def test_s3_base_cmds(self, message_key, message_body):
+        """
+        test_s3_base_cmds
+        :param message_key: s3 message key
+        :param message_body: s3 message body
+        :return:
+        """
+        retry = 0
+        while retry < 3:
+            try:
+                global aws_s3_client
+                message_pkl = pickle.dumps(message_body)
+                aws_s3_client.put_object(
+                    Body=message_pkl, Bucket=self.bucket_name, Key=message_key, ACL="public-read",
+                )
+                obj = aws_s3_client.get_object(Bucket=self.bucket_name, Key=message_key)
+                message_pkl_downloaded = obj["Body"].read()
+                message_downloaded = pickle.loads(message_pkl_downloaded)
+                if str(message_body) == str(message_downloaded):
+                    break
+                retry += 1
+            except Exception as e:
+                raise Exception("S3 base commands test failed at retry count {}, exception: {}".format(str(retry), str(e)))
+                retry += 1
+        if retry >= 3:
+            raise Exception(f"S3 base commands test failed after max retry.")
+
+        return True
+
     def delete_s3_zip(self, path_s3):
         """
         delete s3 object
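With this patch, build-docker.sh takes the NCCL package pin as a twelfth positional argument. The sketch below shows a standalone invocation under that assumption, reusing the x86_64 values from build-fedml-docker.sh above; the argument order follows the $1-$12 parsing in build-docker.sh, and the script is assumed to be run from the repository's docker/ directory.

```
# Sketch: build the x86_64 image by hand, passing the new LIB_NCCL pin as the last argument.
cd ./docker
bash build-docker.sh \
    x86_64 ubuntu18.04 ubuntu1804 3.7 1.12.1 2.9.9 11.3 \
    fedml/fedml:latest-torch1.12.1-cuda11.3-cudnn8-devel \
    nvidia/cuda:11.3.1-cudnn8-devel-ubuntu18.04 \
    https://download.pytorch.org/whl/cu113 \
    https://data.pyg.org/whl/torch-1.12.0+cu113.html \
    "2.9.9-1+cuda11.3"
```

If fewer than twelve arguments are given, the script falls back to LIB_NCCL="null", so the Dockerfiles' default ARG values apply.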
From 60624b20b343eca864e74c8b9ed5d0de307a0e1a Mon Sep 17 00:00:00 2001
From: alexliang
Date: Tue, 1 Nov 2022 02:28:22 +0800
Subject: [PATCH 2/3] update diagnosis readme.

---
 doc/en/mlops/api.md        | 6 ++++++
 python/fedml/cli/README.md | 8 +++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/doc/en/mlops/api.md b/doc/en/mlops/api.md
index 57ce3827b9..261acf9e65 100644
--- a/doc/en/mlops/api.md
+++ b/doc/en/mlops/api.md
@@ -148,5 +148,11 @@ logs from edge server with docker mode:
 fedml logs --docker --docker-rank 1
 ```
 
+## 6. Diagnosis
+Diagnose the connection to https://open.fedml.ai, AWS S3, and MQTT (mqtt.fedml.ai:1883):
+```
+fedml diagnosis --open --s3 --mqtt
+```
+
 You can also refer to a sanity check test example here:
 [https://github.com/FedML-AI/FedML/blob/master/test/fedml_user_code/cli/build.sh](https://github.com/FedML-AI/FedML/blob/master/test/fedml_user_code/cli/build.sh)
\ No newline at end of file
diff --git a/python/fedml/cli/README.md b/python/fedml/cli/README.md
index ec49372473..a89cf53d98 100644
--- a/python/fedml/cli/README.md
+++ b/python/fedml/cli/README.md
@@ -102,4 +102,10 @@ fedml logs -s
 logs from edge server with docker mode:
 ```
 fedml logs --docker --docker-rank 1
-```
\ No newline at end of file
+```
+
+## 6. Diagnosis
+Diagnose the connection to https://open.fedml.ai, AWS S3, and MQTT (mqtt.fedml.ai:1883):
+```
+fedml diagnosis --open --s3 --mqtt
+```

From e5756cf7c86690d654d7002ace28760aa4052689 Mon Sep 17 00:00:00 2001
From: alexliang
Date: Tue, 1 Nov 2022 02:29:25 +0800
Subject: [PATCH 3/3] update version to 0.7.340.

---
 python/fedml/__init__.py | 2 +-
 python/setup.py          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/fedml/__init__.py b/python/fedml/__init__.py
index fae53ca2f5..0e89f8c6a4 100644
--- a/python/fedml/__init__.py
+++ b/python/fedml/__init__.py
@@ -23,7 +23,7 @@
 _global_training_type = None
 _global_comm_backend = None
 
-__version__ = "0.7.339"
+__version__ = "0.7.340"
 
 
 def init(args=None):
diff --git a/python/setup.py b/python/setup.py
index aac2159cd2..2eae2cf746 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -75,7 +75,7 @@ def finalize_options(self):
 
 setup(
     name="fedml",
-    version="0.7.339",
+    version="0.7.340",
     author="FedML Team",
     author_email="ch@fedml.ai",
     description="A research and production integrated edge-cloud library for "
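The diagnosis subcommand documented in PATCH 2/3 can also be run per service. A minimal usage sketch, assuming the fedml CLI from this patch series (0.7.340) is installed; the flag behavior follows mlops_diagnosis() in cli.py above, where omitting all flags checks every service.

```
# Check everything (no flags), or each service individually.
fedml diagnosis
fedml diagnosis --open --s3 --mqtt
fedml diagnosis -o   # only https://open.fedml.ai
fedml diagnosis -s   # only AWS S3
fedml diagnosis -m   # only mqtt.fedml.ai:1883
```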