Skip to content

Commit

Permalink
Merge pull request #631 from FedML-AI/dev/v0.7.0
Browse files Browse the repository at this point in the history
Dev/v0.7.0
  • Loading branch information
fedml-alex committed Oct 31, 2022
2 parents e3ac995 + 0e0de3c commit f8bd2ba
Show file tree
Hide file tree
Showing 14 changed files with 272 additions and 92 deletions.
148 changes: 75 additions & 73 deletions devops/scripts/build-fedml-docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,110 +6,112 @@ pwd=`pwd`
export FEDML_VERSION=`cat python/setup.py |grep version= |awk -F'=' '{print $2}' |awk -F',' '{print $1}'|awk -F'"' '{print $2}'`

# Build X86_64 docker
ARCH=x86_64
OS=ubuntu18.04
DISTRO=ubuntu1804
PYTHON_VERSION=3.7
PYTORCH_VERSION=1.12.1
NCCL_VERSION=2.9.9
CUDA_VERSION=11.3
OUTPUT_IMAGE=fedml/fedml:latest-torch1.12.1-cuda11.3-cudnn8-devel
NVIDIA_BASE_IMAGE=nvidia/cuda:11.3.1-cudnn8-devel-ubuntu18.04
PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113
PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html
CURRENT_IMAGE=fedml/fedml:${FEDML_VERSION}-torch1.12.1-cuda11.3-cudnn8-devel
ARCH="x86_64"
OS="ubuntu18.04"
DISTRO="ubuntu1804"
PYTHON_VERSION="3.7"
PYTORCH_VERSION="1.12.1"
NCCL_VERSION="2.9.9"
CUDA_VERSION="11.3"
LIB_NCCL="2.9.9-1+cuda11.3"
OUTPUT_IMAGE="fedml/fedml:latest-torch1.12.1-cuda11.3-cudnn8-devel"
NVIDIA_BASE_IMAGE="nvidia/cuda:11.3.1-cudnn8-devel-ubuntu18.04"
PYTORCH_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu113"
PYTORCH_GEOMETRIC_URL="https://data.pyg.org/whl/torch-1.12.0+cu113.html"
CURRENT_IMAGE="fedml/fedml:${FEDML_VERSION}-torch1.12.1-cuda11.3-cudnn8-devel"

cd ./docker
bash build-docker.sh $ARCH $OS $DISTRO $PYTHON_VERSION $PYTORCH_VERSION $NCCL_VERSION $CUDA_VERSION \
$OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL
$OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL $LIB_NCCL

docker tag $OUTPUT_IMAGE $CURRENT_IMAGE
cd $pwd


# Build ARM_64 docker
ARCH=arm64
OS=ubuntu20.04
DISTRO=ubuntu2004
PYTHON_VERSION=3.8
PYTORCH_VERSION=1.12.1
NCCL_VERSION=2.9.6
CUDA_VERSION=11.3
OUTPUT_IMAGE=fedml/fedml:latest-torch1.12.1-cuda11.3-cudnn8-devel-arm64
NVIDIA_BASE_IMAGE=nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af
PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113
PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html
CURRENT_IMAGE=fedml/fedml:${FEDML_VERSION}-torch1.12.1-cuda11.3-cudnn8-devel-arm64
ARCH="arm64"
OS="ubuntu20.04"
DISTRO="ubuntu2004"
PYTHON_VERSION="3.8"
PYTORCH_VERSION="1.12.1"
NCCL_VERSION="2.9.6"
CUDA_VERSION="11.3"
LIB_NCCL="2.9.6-1+cuda11.3"
OUTPUT_IMAGE="fedml/fedml:latest-torch1.12.1-cuda11.3-cudnn8-devel-arm64"
NVIDIA_BASE_IMAGE="nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af"
PYTORCH_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu113"
PYTORCH_GEOMETRIC_URL="https://data.pyg.org/whl/torch-1.12.0+cu113.html"
CURRENT_IMAGE="fedml/fedml:${FEDML_VERSION}-torch1.12.1-cuda11.3-cudnn8-devel-arm64"

cd ./docker
bash build-docker.sh $ARCH $OS $DISTRO $PYTHON_VERSION $PYTORCH_VERSION $NCCL_VERSION $CUDA_VERSION \
$OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL
$OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL $LIB_NCCL

cd ./docker
docker tag $OUTPUT_IMAGE $CURRENT_IMAGE

cd $pwd

# Build nvidia_jetson docker
ARCH=jetson
OS=ubuntu20.04
DISTRO=ubuntu2004
PYTHON_VERSION=3.7
PYTORCH_VERSION=1.12.1
NCCL_VERSION=2.9.6
CUDA_VERSION=11.3
OUTPUT_IMAGE=fedml/fedml:latest-nvidia-jetson-l4t-ml-r32.6.1-py3
NVIDIA_BASE_IMAGE=nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af
PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113
PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html
CURRENT_IMAGE=fedml/fedml:${FEDML_VERSION}-nvidia-jetson-l4t-ml-r32.6.1-py3
ARCH="jetson"
OS="ubuntu20.04"
DISTRO="ubuntu2004"
PYTHON_VERSION="3.7"
PYTORCH_VERSION="1.12.1"
NCCL_VERSION="2.9.6"
CUDA_VERSION="11.3"
OUTPUT_IMAGE="fedml/fedml:latest-nvidia-jetson-l4t-ml-r32.6.1-py3"
NVIDIA_BASE_IMAGE="nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af"
PYTORCH_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu113"
PYTORCH_GEOMETRIC_URL="https://data.pyg.org/whl/torch-1.12.0+cu113.html"
CURRENT_IMAGE="fedml/fedml:${FEDML_VERSION}-nvidia-jetson-l4t-ml-r32.6.1-py3"

cd ./docker
bash build-docker.sh $ARCH $OS $DISTRO $PYTHON_VERSION $PYTORCH_VERSION $NCCL_VERSION $CUDA_VERSION \
$OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL
$OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL $LIB_NCCL

docker tag $OUTPUT_IMAGE $CURRENT_IMAGE

cd $pwd

# Build rpi32 docker
ARCH=rpi32
OS=ubuntu20.04
DISTRO=ubuntu2004
PYTHON_VERSION=3.7
PYTORCH_VERSION=1.12.1
NCCL_VERSION=2.9.6
CUDA_VERSION=11.3
OUTPUT_IMAGE=fedml/fedml:latest-raspberrypi4-32-py37
NVIDIA_BASE_IMAGE=nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af
PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113
PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html
CURRENT_IMAGE=fedml/fedml:${FEDML_VERSION}-raspberrypi4-32-py37

cd ./docker
bash build-docker.sh $ARCH $OS $DISTRO $PYTHON_VERSION $PYTORCH_VERSION $NCCL_VERSION $CUDA_VERSION \
$OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL

docker tag $OUTPUT_IMAGE $CURRENT_IMAGE

cd $pwd
#ARCH="rpi32"
#OS="ubuntu20.04"
#DISTRO="ubuntu2004"
#PYTHON_VERSION="3.7"
#PYTORCH_VERSION="1.12.1"
#NCCL_VERSION="2.9.6"
#CUDA_VERSION="11.3"
#OUTPUT_IMAGE="fedml/fedml:latest-raspberrypi4-32-py37"
#NVIDIA_BASE_IMAGE="nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af"
#PYTORCH_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu113"
#PYTORCH_GEOMETRIC_URL="https://data.pyg.org/whl/torch-1.12.0+cu113.html"
#CURRENT_IMAGE="fedml/fedml:${FEDML_VERSION}-raspberrypi4-32-py37"
#
#cd ./docker
#bash build-docker.sh $ARCH $OS $DISTRO $PYTHON_VERSION $PYTORCH_VERSION $NCCL_VERSION $CUDA_VERSION \
# $OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL $LIB_NCCL
#
#docker tag $OUTPUT_IMAGE $CURRENT_IMAGE
#
#cd $pwd

# Build rpi64 docker
ARCH=rpi64
OS=ubuntu20.04
DISTRO=ubuntu2004
PYTHON_VERSION=3.7
PYTORCH_VERSION=1.12.1
NCCL_VERSION=2.9.6
CUDA_VERSION=11.3
OUTPUT_IMAGE=fedml/fedml:latest-raspberrypi4-64-py37
NVIDIA_BASE_IMAGE=nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af
PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113
PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html
CURRENT_IMAGE=fedml/fedml:${FEDML_VERSION}-raspberrypi4-64-py37
ARCH="rpi64"
OS="ubuntu20.04"
DISTRO="ubuntu2004"
PYTHON_VERSION="3.7"
PYTORCH_VERSION="1.12.1"
NCCL_VERSION="2.9.6"
CUDA_VERSION="11.3"
OUTPUT_IMAGE="fedml/fedml:latest-raspberrypi4-64-py37"
NVIDIA_BASE_IMAGE="nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04@sha256:8e3df8601e81c57e85c082e9bcc6c547641635730ef8516b2cfa9c9e6c1208af"
PYTORCH_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu113"
PYTORCH_GEOMETRIC_URL="https://data.pyg.org/whl/torch-1.12.0+cu113.html"
CURRENT_IMAGE="fedml/fedml:${FEDML_VERSION}-raspberrypi4-64-py37"

cd ./docker
bash build-docker.sh $ARCH $OS $DISTRO $PYTHON_VERSION $PYTORCH_VERSION $NCCL_VERSION $CUDA_VERSION \
$OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL
$OUTPUT_IMAGE $NVIDIA_BASE_IMAGE $PYTORCH_EXTRA_INDEX_URL $PYTORCH_GEOMETRIC_URL $LIB_NCCL

docker tag $OUTPUT_IMAGE $CURRENT_IMAGE

Expand Down
4 changes: 2 additions & 2 deletions devops/scripts/push-fedml-docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ if [[ $push_arm_arch_images != "" ]]; then
docker push fedml/fedml:latest-nvidia-jetson-l4t-ml-r32.6.1-py3
docker push fedml/fedml:${FEDML_VERSION}-nvidia-jetson-l4t-ml-r32.6.1-py3

docker push fedml/fedml:latest-raspberrypi4-32-py37
docker push fedml/fedml:${FEDML_VERSION}-raspberrypi4-32-py37
# docker push fedml/fedml:latest-raspberrypi4-32-py37
# docker push fedml/fedml:${FEDML_VERSION}-raspberrypi4-32-py37

docker push fedml/fedml:latest-raspberrypi4-64-py37
docker push fedml/fedml:${FEDML_VERSION}-raspberrypi4-64-py37
Expand Down
6 changes: 6 additions & 0 deletions doc/en/mlops/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,5 +148,11 @@ logs from edge server with docker mode:
fedml logs --docker --docker-rank 1
```

## 6. Diagnosis
Diagnose the connection to https://open.fedml.ai, AWS S3, and MQTT (mqtt.fedml.ai:1883):
```
fedml diagnosis --open --s3 --mqtt
```

You can also refer to a sanity check test example here:
[https://github.com/FedML-AI/FedML/blob/master/test/fedml_user_code/cli/build.sh](https://github.com/FedML-AI/FedML/blob/master/test/fedml_user_code/cli/build.sh)
4 changes: 2 additions & 2 deletions doc/en/starter/install/jetson.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
## Run FedML with Docker (Recommended)
- Pull FedML Jetson docker image
```
docker pull fedml/fedml:nvidia-jetson-l4t-ml-r32.6.1-py3
docker pull fedml/fedml:latest-nvidia-jetson-l4t-ml-r32.6.1-py3
```

- Run Docker with "fedml login"
```
docker run -t -i --runtime nvidia fedml/fedml:nvidia-jetson-l4t-ml-r32.6.1-py3 /bin/bash
docker run -t -i --runtime nvidia fedml/fedml:latest-nvidia-jetson-l4t-ml-r32.6.1-py3 /bin/bash
root@8bc0de2ce0e0:/usr/src/app# fedml login 299
Expand Down
4 changes: 2 additions & 2 deletions doc/en/starter/install/rpi.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
## Run FedML with Docker (Recommended)
- Pull FedML RPI docker image
```
docker pull fedml/fedml:raspberrypi4-64-py37
docker pull fedml/fedml:latest-raspberrypi4-64-py37
```

- Run Docker with "fedml login"
```
docker run -t -i fedml/fedml:raspberrypi4-64-py37 /bin/bash
docker run -t -i fedml/fedml:latest-raspberrypi4-64-py37 /bin/bash
root@8bc0de2ce0e0:/usr/src/app# fedml login 299
Expand Down
6 changes: 3 additions & 3 deletions docker/arm64v8/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ ARG PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113

ARG PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html

ARG LIB_NCCL=2.9.6-1+cuda11.3

RUN echo ${NCCL_VERSION}
RUN echo ${CUDA_VERSION}

Expand Down Expand Up @@ -125,9 +127,7 @@ RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/
add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/${DISTRO}/${ARCH} /" && \
apt update && \
#export NCCL_VERSION_ENV=`echo $NCCL_VERSION | awk -F'-1' '{print $1}'` && \
export NCCL_VERSION_ENV=$NCCL_VERSION-1 && \
export CUDA_VERSION_ENV=`echo $CUDA_VERSION | sed 's/\.1//g'` && \
apt install -y --allow-change-held-packages libnccl2=${NCCL_VERSION_ENV}+cuda${CUDA_VERSION_ENV} libnccl-dev=${NCCL_VERSION_ENV}+cuda${CUDA_VERSION_ENV}
apt install -y --allow-change-held-packages libnccl2=${LIB_NCCL} libnccl-dev=${LIB_NCCL}

# ***************************************************************************
# PyTorch (install from source)
Expand Down
15 changes: 11 additions & 4 deletions docker/build-docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,27 @@ NCCL_VERSION=$6
CUDA_VERSION=$7
OUTPUT_IMAGE=$8
NVIDIA_BASE_IMAGE=""
if [ $# -gt 9 ]; then
if [ $# -ge 9 ]; then
NVIDIA_BASE_IMAGE=$9
fi

if [ $# -gt 10 ]; then
if [ $# -ge 10 ]; then
PYTORCH_EXTRA_INDEX_URL=${10}
else
PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113
fi

if [ $# -gt 11 ]; then
if [ $# -ge 11 ]; then
PYTORCH_GEOMETRIC_URL=${11}
else
PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html
fi

if [ $# -ge 12 ]; then
LIB_NCCL=${12}
else
LIB_NCCL="null"
fi

DOCKER_FILE_PATH=""
if [[ "$ARCH" == "x86_64" ]]; then
Expand All @@ -36,7 +41,7 @@ elif [[ "$ARCH" == "jetson" ]]; then
elif [[ "$ARCH" == "rpi32" ]]; then
DOCKER_FILE_PATH=./rpi/Dockerfile_32bit_armv7
elif [[ "$ARCH" == "rpi64" ]]; then
DOCKER_FILE_PATH=./rpi/Dockerfile_32bit_armv8
DOCKER_FILE_PATH=./rpi/Dockerfile_64bit_armv8
fi

if [ $DOCKER_FILE_PATH == "" ]; then
Expand All @@ -55,6 +60,7 @@ if [[ $NVIDIA_BASE_IMAGE != "" ]]; then
--build-arg NVIDIA_BASE_IMAGE=$NVIDIA_BASE_IMAGE \
--build-arg PYTORCH_EXTRA_INDEX_URL=$PYTORCH_EXTRA_INDEX_URL \
--build-arg PYTORCH_GEOMETRIC_URL=$PYTORCH_GEOMETRIC_URL \
--build-arg LIB_NCCL=$LIB_NCCL \
--network=host \
-t $OUTPUT_IMAGE .
else
Expand All @@ -67,6 +73,7 @@ else
--build-arg CUDA_VERSION=$CUDA_VERSION \
--build-arg PYTORCH_EXTRA_INDEX_URL=$PYTORCH_EXTRA_INDEX_URL \
--build-arg PYTORCH_GEOMETRIC_URL=$PYTORCH_GEOMETRIC_URL \
--build-arg LIB_NCCL=$LIB_NCCL \
--network=host \
-t $OUTPUT_IMAGE .
fi
Expand Down
6 changes: 3 additions & 3 deletions docker/x86-64/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ ARG PYTORCH_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu113

ARG PYTORCH_GEOMETRIC_URL=https://data.pyg.org/whl/torch-1.12.0+cu113.html

ARG LIB_NCCL="2.9.9-1+cuda11.3"

RUN echo ${NCCL_VERSION}
RUN echo ${CUDA_VERSION}

Expand Down Expand Up @@ -124,9 +126,7 @@ RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/
add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/${DISTRO}/${ARCH} /" && \
apt update && \
#export NCCL_VERSION_ENV=`echo $NCCL_VERSION | awk -F'-1' '{print $1}'` && \
export NCCL_VERSION_ENV=$NCCL_VERSION && \
export CUDA_VERSION_ENV=`echo $CUDA_VERSION | sed 's/\.1//g'` && \
apt install -y --allow-change-held-packages libnccl2=${NCCL_VERSION_ENV}+cuda${CUDA_VERSION_ENV} libnccl-dev=${NCCL_VERSION_ENV}+cuda${CUDA_VERSION_ENV}
apt install -y --allow-change-held-packages libnccl2=${LIB_NCCL} libnccl-dev=${LIB_NCCL}

# ***************************************************************************
# PyTorch (install from source)
Expand Down
2 changes: 1 addition & 1 deletion python/fedml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
_global_training_type = None
_global_comm_backend = None

__version__ = "0.7.339"
__version__ = "0.7.340"


def init(args=None):
Expand Down
8 changes: 7 additions & 1 deletion python/fedml/cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,4 +102,10 @@ fedml logs -s
logs from edge server with docker mode:
```
fedml logs --docker --docker-rank 1
```
```

## 6. Diagnosis
Diagnose the connection to https://open.fedml.ai, AWS S3, and MQTT (mqtt.fedml.ai:1883):
```
fedml diagnosis --open --s3 --mqtt
```
Loading

0 comments on commit f8bd2ba

Please sign in to comment.