Merge pull request #631 from FedML-AI/dev/v0.7.0

Dev/v0.7.0
FedML-AI · Oct 31, 2022 · f8bd2ba · f8bd2ba
2 parents e3ac995 + 0e0de3c
commit f8bd2ba
Show file tree

Hide file tree

Showing 14 changed files with 272 additions and 92 deletions.
diff --git a/devops/scripts/build-fedml-docker.sh b/devops/scripts/build-fedml-docker.sh
@@ -6,110 +6,112 @@ pwd=`pwd`
 export FEDML_VERSION=`cat python/setup.py |grep version= |awk -F'=' '{print $2}' |awk -F',' '{print $1}'|awk -F'"' '{print $2}'`
 
 # Build X86_64 docker
-ARCH=x86_64
-OS=ubuntu18.04
-DISTRO=ubuntu1804
-PYTHON_VERSION=3.7
-PYTORCH_VERSION=1.12.1
-NCCL_VERSION=2.9.9
-CUDA_VERSION=11.3
-OUTPUT_IMAGE=fedml/fedml:latest-torch1.12.1-cuda11.3-cudnn8-devel
-NVIDIA_BASE_IMAGE=nvidia/cuda:11.3.1-cudnn8-devel-ubuntu18.04
-PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113
-PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html
-CURRENT_IMAGE=fedml/fedml:${FEDML_VERSION}-torch1.12.1-cuda11.3-cudnn8-devel
+ARCH="x86_64"
+OS="ubuntu18.04"
+DISTRO="ubuntu1804"
+PYTHON_VERSION="3.7"
+PYTORCH_VERSION="1.12.1"
+NCCL_VERSION="2.9.9"
+CUDA_VERSION="11.3"
+LIB_NCCL="2.9.9-1+cuda11.3"
+OUTPUT_IMAGE="fedml/fedml:latest-torch1.12.1-cuda11.3-cudnn8-devel"
+NVIDIA_BASE_IMAGE="nvidia/cuda:11.3.1-cudnn8-devel-ubuntu18.04"
+PYTORCH_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu113"
+PYTORCH_GEOMETRIC_URL="https://data.pyg.org/whl/torch-1.12.0+cu113.html"
+CURRENT_IMAGE="fedml/fedml:${FEDML_VERSION}-torch1.12.1-cuda11.3-cudnn8-devel"
 
 cd ./docker
 bash build-docker.sh $ARCH $OS $DISTRO $PYTHON_VERSION $PYTORCH_VERSION $NCCL_VERSION $CUDA_VERSION \
-     $OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL
+     $OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL $LIB_NCCL
 
 docker tag $OUTPUT_IMAGE $CURRENT_IMAGE
 cd $pwd
 
 
 # Build ARM_64 docker
-ARCH=arm64
-OS=ubuntu20.04
-DISTRO=ubuntu2004
-PYTHON_VERSION=3.8
-PYTORCH_VERSION=1.12.1
-NCCL_VERSION=2.9.6
-CUDA_VERSION=11.3
-OUTPUT_IMAGE=fedml/fedml:latest-torch1.12.1-cuda11.3-cudnn8-devel-arm64
-NVIDIA_BASE_IMAGE=nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af
-PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113
-PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html
-CURRENT_IMAGE=fedml/fedml:${FEDML_VERSION}-torch1.12.1-cuda11.3-cudnn8-devel-arm64
+ARCH="arm64"
+OS="ubuntu20.04"
+DISTRO="ubuntu2004"
+PYTHON_VERSION="3.8"
+PYTORCH_VERSION="1.12.1"
+NCCL_VERSION="2.9.6"
+CUDA_VERSION="11.3"
+LIB_NCCL="2.9.6-1+cuda11.3"
+OUTPUT_IMAGE="fedml/fedml:latest-torch1.12.1-cuda11.3-cudnn8-devel-arm64"
+NVIDIA_BASE_IMAGE="nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af"
+PYTORCH_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu113"
+PYTORCH_GEOMETRIC_URL="https://data.pyg.org/whl/torch-1.12.0+cu113.html"
+CURRENT_IMAGE="fedml/fedml:${FEDML_VERSION}-torch1.12.1-cuda11.3-cudnn8-devel-arm64"
 
+cd ./docker
 bash build-docker.sh $ARCH $OS $DISTRO $PYTHON_VERSION $PYTORCH_VERSION $NCCL_VERSION $CUDA_VERSION \
-     $OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL
+     $OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL $LIB_NCCL
 
-cd ./docker
 docker tag $OUTPUT_IMAGE $CURRENT_IMAGE
 
 cd $pwd
 
 # Build nvidia_jetson docker
-ARCH=jetson
-OS=ubuntu20.04
-DISTRO=ubuntu2004
-PYTHON_VERSION=3.7
-PYTORCH_VERSION=1.12.1
-NCCL_VERSION=2.9.6
-CUDA_VERSION=11.3
-OUTPUT_IMAGE=fedml/fedml:latest-nvidia-jetson-l4t-ml-r32.6.1-py3
-NVIDIA_BASE_IMAGE=nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af
-PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113
-PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html
-CURRENT_IMAGE=fedml/fedml:${FEDML_VERSION}-nvidia-jetson-l4t-ml-r32.6.1-py3
+ARCH="jetson"
+OS="ubuntu20.04"
+DISTRO="ubuntu2004"
+PYTHON_VERSION="3.7"
+PYTORCH_VERSION="1.12.1"
+NCCL_VERSION="2.9.6"
+CUDA_VERSION="11.3"
+OUTPUT_IMAGE="fedml/fedml:latest-nvidia-jetson-l4t-ml-r32.6.1-py3"
+NVIDIA_BASE_IMAGE="nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af"
+PYTORCH_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu113"
+PYTORCH_GEOMETRIC_URL="https://data.pyg.org/whl/torch-1.12.0+cu113.html"
+CURRENT_IMAGE="fedml/fedml:${FEDML_VERSION}-nvidia-jetson-l4t-ml-r32.6.1-py3"
 
 cd ./docker
 bash build-docker.sh $ARCH $OS $DISTRO $PYTHON_VERSION $PYTORCH_VERSION $NCCL_VERSION $CUDA_VERSION \
-     $OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL
+     $OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL $LIB_NCCL
 
 docker tag $OUTPUT_IMAGE $CURRENT_IMAGE
 
 cd $pwd
 
 # Build rpi32 docker
-ARCH=rpi32
-OS=ubuntu20.04
-DISTRO=ubuntu2004
-PYTHON_VERSION=3.7
-PYTORCH_VERSION=1.12.1
-NCCL_VERSION=2.9.6
-CUDA_VERSION=11.3
-OUTPUT_IMAGE=fedml/fedml:latest-raspberrypi4-32-py37
-NVIDIA_BASE_IMAGE=nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af
-PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113
-PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html
-CURRENT_IMAGE=fedml/fedml:${FEDML_VERSION}-raspberrypi4-32-py37
-
-cd ./docker
-bash build-docker.sh $ARCH $OS $DISTRO $PYTHON_VERSION $PYTORCH_VERSION $NCCL_VERSION $CUDA_VERSION \
-     $OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL
-
-docker tag $OUTPUT_IMAGE $CURRENT_IMAGE
-
-cd $pwd
+#ARCH="rpi32"
+#OS="ubuntu20.04"
+#DISTRO="ubuntu2004"
+#PYTHON_VERSION="3.7"
+#PYTORCH_VERSION="1.12.1"
+#NCCL_VERSION="2.9.6"
+#CUDA_VERSION="11.3"
+#OUTPUT_IMAGE="fedml/fedml:latest-raspberrypi4-32-py37"
+#NVIDIA_BASE_IMAGE="nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af"
+#PYTORCH_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu113"
+#PYTORCH_GEOMETRIC_URL="https://data.pyg.org/whl/torch-1.12.0+cu113.html"
+#CURRENT_IMAGE="fedml/fedml:${FEDML_VERSION}-raspberrypi4-32-py37"
+#
+#cd ./docker
+#bash build-docker.sh $ARCH $OS $DISTRO $PYTHON_VERSION $PYTORCH_VERSION $NCCL_VERSION $CUDA_VERSION \
+#     $OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL $LIB_NCCL
+#
+#docker tag $OUTPUT_IMAGE $CURRENT_IMAGE
+#
+#cd $pwd
 
 # Build rpi64 docker
-ARCH=rpi64
-OS=ubuntu20.04
-DISTRO=ubuntu2004
-PYTHON_VERSION=3.7
-PYTORCH_VERSION=1.12.1
-NCCL_VERSION=2.9.6
-CUDA_VERSION=11.3
-OUTPUT_IMAGE=fedml/fedml:latest-raspberrypi4-64-py37
-NVIDIA_BASE_IMAGE=nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af
-PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113
-PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html
-CURRENT_IMAGE=fedml/fedml:${FEDML_VERSION}-raspberrypi4-64-py37
+ARCH="rpi64"
+OS="ubuntu20.04"
+DISTRO="ubuntu2004"
+PYTHON_VERSION="3.7"
+PYTORCH_VERSION="1.12.1"
+NCCL_VERSION="2.9.6"
+CUDA_VERSION="11.3"
+OUTPUT_IMAGE="fedml/fedml:latest-raspberrypi4-64-py37"
+NVIDIA_BASE_IMAGE="nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af"
+PYTORCH_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu113"
+PYTORCH_GEOMETRIC_URL="https://data.pyg.org/whl/torch-1.12.0+cu113.html"
+CURRENT_IMAGE="fedml/fedml:${FEDML_VERSION}-raspberrypi4-64-py37"
 
 cd ./docker
 bash build-docker.sh $ARCH $OS $DISTRO $PYTHON_VERSION $PYTORCH_VERSION $NCCL_VERSION $CUDA_VERSION \
-     $OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL
+     $OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL $LIB_NCCL
 
 docker tag $OUTPUT_IMAGE $CURRENT_IMAGE
 

diff --git a/devops/scripts/push-fedml-docker.sh b/devops/scripts/push-fedml-docker.sh
@@ -13,8 +13,8 @@ if [[ $push_arm_arch_images != "" ]]; then
   docker push fedml/fedml:latest-nvidia-jetson-l4t-ml-r32.6.1-py3
   docker push fedml/fedml:${FEDML_VERSION}-nvidia-jetson-l4t-ml-r32.6.1-py3
 
-  docker push fedml/fedml:latest-raspberrypi4-32-py37
-  docker push fedml/fedml:${FEDML_VERSION}-raspberrypi4-32-py37
+#  docker push fedml/fedml:latest-raspberrypi4-32-py37
+#  docker push fedml/fedml:${FEDML_VERSION}-raspberrypi4-32-py37
 
   docker push fedml/fedml:latest-raspberrypi4-64-py37
   docker push fedml/fedml:${FEDML_VERSION}-raspberrypi4-64-py37

diff --git a/doc/en/mlops/api.md b/doc/en/mlops/api.md
@@ -148,5 +148,11 @@ logs from edge server with docker mode:
 fedml logs --docker --docker-rank 1
 ```
 
+## 6. Diagnosis
+Diagnose connectivity to https://open.fedml.ai, AWS S3, and the MQTT broker (mqtt.fedml.ai:1883):
+```
+fedml diagnosis --open --s3 --mqtt
+```
+
 You can also refer to a sanity check test example here:
 [https://github.com/FedML-AI/FedML/blob/master/test/fedml_user_code/cli/build.sh](https://github.com/FedML-AI/FedML/blob/master/test/fedml_user_code/cli/build.sh)
diff --git a/doc/en/starter/install/jetson.md b/doc/en/starter/install/jetson.md
@@ -3,12 +3,12 @@
 ## Run FedML with Docker (Recommended)
 - Pull FedML RPI docker image
 ```
-docker pull fedml/fedml:nvidia-jetson-l4t-ml-r32.6.1-py3
+docker pull fedml/fedml:latest-nvidia-jetson-l4t-ml-r32.6.1-py3
 ```
 
 - Run Docker with "fedml login"
 ```
-docker run -t -i --runtime nvidia fedml/fedml:nvidia-jetson-l4t-ml-r32.6.1-py3 /bin/bash
+docker run -t -i --runtime nvidia fedml/fedml:latest-nvidia-jetson-l4t-ml-r32.6.1-py3 /bin/bash
 
 root@8bc0de2ce0e0:/usr/src/app# fedml login 299
 

diff --git a/doc/en/starter/install/rpi.md b/doc/en/starter/install/rpi.md
@@ -3,12 +3,12 @@
 ## Run FedML with Docker (Recommended)
 - Pull FedML RPI docker image
 ```
-docker pull fedml/fedml:raspberrypi4-64-py37
+docker pull fedml/fedml:latest-raspberrypi4-64-py37
 ```
 
 - Run Docker with "fedml login"
 ```
-docker run -t -i fedml/fedml:raspberrypi4-64-py37 /bin/bash
+docker run -t -i fedml/fedml:latest-raspberrypi4-64-py37 /bin/bash
 
 root@8bc0de2ce0e0:/usr/src/app# fedml login 299
 

diff --git a/docker/arm64v8/Dockerfile b/docker/arm64v8/Dockerfile
@@ -30,6 +30,8 @@ ARG PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113
 
 ARG PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html
 
+ARG LIB_NCCL=2.9.6-1+cuda11.3
+
 RUN echo ${NCCL_VERSION}
 RUN echo ${CUDA_VERSION}
 
@@ -125,9 +127,7 @@ RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/
 add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/${DISTRO}/${ARCH} /" && \
 apt update && \
 #export NCCL_VERSION_ENV=`echo $NCCL_VERSION | awk -F'-1' '{print $1}'` && \
-export NCCL_VERSION_ENV=$NCCL_VERSION-1 && \
-export CUDA_VERSION_ENV=`echo $CUDA_VERSION | sed 's/\.1//g'` &&  \
-apt install -y --allow-change-held-packages libnccl2=${NCCL_VERSION_ENV}+cuda${CUDA_VERSION_ENV} libnccl-dev=${NCCL_VERSION_ENV}+cuda${CUDA_VERSION_ENV}
+apt install -y --allow-change-held-packages libnccl2=${LIB_NCCL} libnccl-dev=${LIB_NCCL}
 
 # ***************************************************************************
 # PyTorch (install from source)

diff --git a/docker/build-docker.sh b/docker/build-docker.sh
@@ -9,22 +9,27 @@ NCCL_VERSION=$6
 CUDA_VERSION=$7
 OUTPUT_IMAGE=$8
 NVIDIA_BASE_IMAGE=""
-if [ $# -gt 9 ]; then
+if [ $# -ge 9 ]; then
   NVIDIA_BASE_IMAGE=$9
 fi
 
-if [ $# -gt 10 ]; then
+if [ $# -ge 10 ]; then
   PYTORCH_EXTRA_INDEX_URL=${10}
 else
   PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113
 fi
 
-if [ $# -gt 11 ]; then
+if [ $# -ge 11 ]; then
   PYTORCH_GEOMETRIC_URL=${11}
 else
   PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html
 fi
 
+if [ $# -ge 12 ]; then
+  LIB_NCCL=${12}
+else
+  LIB_NCCL="null"
+fi
 
 DOCKER_FILE_PATH=""
 if [[ "$ARCH" == "x86_64" ]]; then
@@ -36,7 +41,7 @@ elif [[  "$ARCH" == "jetson" ]]; then
 elif [[  "$ARCH" == "rpi32" ]]; then
   DOCKER_FILE_PATH=./rpi/Dockerfile_32bit_armv7
 elif [[  "$ARCH" == "rpi64" ]]; then
-  DOCKER_FILE_PATH=./rpi/Dockerfile_32bit_armv8
+  DOCKER_FILE_PATH=./rpi/Dockerfile_64bit_armv8
 fi
 
 if [ $DOCKER_FILE_PATH == "" ]; then
@@ -55,6 +60,7 @@ if [[ $NVIDIA_BASE_IMAGE != "" ]]; then
     --build-arg NVIDIA_BASE_IMAGE=$NVIDIA_BASE_IMAGE \
     --build-arg PYTORCH_EXTRA_INDEX_URL=$PYTORCH_EXTRA_INDEX_URL \
     --build-arg PYTORCH_GEOMETRIC_URL=$PYTORCH_GEOMETRIC_URL \
+    --build-arg LIB_NCCL=$LIB_NCCL \
     --network=host \
     -t $OUTPUT_IMAGE .
 else
@@ -67,6 +73,7 @@ else
     --build-arg CUDA_VERSION=$CUDA_VERSION \
     --build-arg PYTORCH_EXTRA_INDEX_URL=$PYTORCH_EXTRA_INDEX_URL \
     --build-arg PYTORCH_GEOMETRIC_URL=$PYTORCH_GEOMETRIC_URL \
+    --build-arg LIB_NCCL=$LIB_NCCL \
     --network=host \
     -t $OUTPUT_IMAGE .
 fi

diff --git a/docker/x86-64/Dockerfile b/docker/x86-64/Dockerfile
@@ -29,6 +29,8 @@ ARG PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113
 
 ARG PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html
 
+ARG LIB_NCCL="2.9.9-1+cuda11.3"
+
 RUN echo ${NCCL_VERSION}
 RUN echo ${CUDA_VERSION}
 
@@ -124,9 +126,7 @@ RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/
 add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/${DISTRO}/${ARCH} /" && \
 apt update && \
 #export NCCL_VERSION_ENV=`echo $NCCL_VERSION | awk -F'-1' '{print $1}'` && \
-export NCCL_VERSION_ENV=$NCCL_VERSION && \
-export CUDA_VERSION_ENV=`echo $CUDA_VERSION | sed 's/\.1//g'` &&  \
-apt install -y --allow-change-held-packages libnccl2=${NCCL_VERSION_ENV}+cuda${CUDA_VERSION_ENV} libnccl-dev=${NCCL_VERSION_ENV}+cuda${CUDA_VERSION_ENV}
+apt install -y --allow-change-held-packages libnccl2=${LIB_NCCL} libnccl-dev=${LIB_NCCL}
 
 # ***************************************************************************
 # PyTorch (install from source)

diff --git a/python/fedml/__init__.py b/python/fedml/__init__.py
@@ -23,7 +23,7 @@
 _global_training_type = None
 _global_comm_backend = None
 
-__version__ = "0.7.339"
+__version__ = "0.7.340"
 
 
 def init(args=None):

diff --git a/python/fedml/cli/README.md b/python/fedml/cli/README.md
@@ -102,4 +102,10 @@ fedml logs -s
 logs from edge server with docker mode:
 ```
 fedml logs --docker --docker-rank 1
-```
+```
+
+## 6. Diagnosis
+Diagnose connectivity to https://open.fedml.ai, AWS S3, and the MQTT broker (mqtt.fedml.ai:1883):
+```
+fedml diagnosis --open --s3 --mqtt
+```