Merge branch 'dev/v0.7.0' of https://github.com/FedML-AI/FedML into d…

…ev/v0.7.0 # Conflicts: # python/fedml/cross_silo/client/fedml_client_master_manager.py # python/fedml/cross_silo/server/fedml_server_manager.py
FedML-AI · Oct 4, 2022 · 2bcf242 · 2bcf242
2 parents d0766b7 + 7b7ff5d
commit 2bcf242
Show file tree

Hide file tree

Showing 77 changed files with 761 additions and 222 deletions.
diff --git a/python/app/fedcv/image_segmentation/config/gpu_mapping.yaml b/python/app/fedcv/image_segmentation/config/gpu_mapping.yaml
@@ -1,4 +1,3 @@
-# Please check "GPU_MAPPING.md" to see how to define the topology
 # You can define a cluster containing multiple GPUs within multiple machines by defining `gpu_mapping.yaml` as follows:
 
 # config_cluster0:

diff --git a/python/app/fedcv/image_segmentation/config/simulation/gpu_mapping.yaml b/python/app/fedcv/image_segmentation/config/simulation/gpu_mapping.yaml
@@ -1,4 +1,3 @@
-# Please check "GPU_MAPPING.md" to see how to define the topology
 # You can define a cluster containing multiple GPUs within multiple machines by defining `gpu_mapping.yaml` as follows:
 
 # config_cluster0:

diff --git a/python/app/fedcv/object_detection/model/yolov5/requirements.txt b/python/app/fedcv/object_detection/model/yolov5/requirements.txt
@@ -12,7 +12,7 @@ scipy>=1.4.1  # Google Colab version
 torch>=1.7.0,!=1.12.0  # https://github.com/ultralytics/yolov5/issues/8395
 torchvision>=0.8.1,!=0.13.0 # https://github.com/ultralytics/yolov5/issues/8395
 tqdm>=4.41.0
-protobuf<4.21.3  # https://github.com/ultralytics/yolov5/issues/8012
+protobuf>=4.21.6  # https://github.com/ultralytics/yolov5/issues/8012
 
 # Logging -------------------------------------
 tensorboard>=2.4.1

diff --git a/python/app/fedcv/object_detection/model/yolov7/requirements.txt b/python/app/fedcv/object_detection/model/yolov7/requirements.txt
@@ -11,7 +11,7 @@ scipy>=1.4.1
 torch>=1.7.0,!=1.12.0
 torchvision>=0.8.1,!=0.13.0
 tqdm>=4.41.0
-protobuf<4.21.3
+protobuf>=4.21.6
 
 # Logging -------------------------------------
 tensorboard>=2.4.1

diff --git a/...les/cross_silo/mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/README.md b/...les/cross_silo/mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/README.md
@@ -0,0 +1,31 @@
+## Training Script
+
+On the client side, the client ID (a.k.a. rank) starts from 1.
+Please also modify config/foolsgold/fedml_config.yaml (the config file loaded by the run scripts), setting `worker_num` to the number of clients you plan to run.
+
+At the server side, run the following script:
+```
+bash run_server.sh your_run_id
+```
+
+For client 1, run the following script:
+```
+bash run_client.sh 1 your_run_id
+```
+For client 2, run the following script:
+```
+bash run_client.sh 2 your_run_id
+```
+Note: please run the server first.
+
+## A Better User-experience with FedML MLOps (open.fedml.ai)
+To reduce the difficulty and complexity of these CLI commands, we recommend using our MLOps platform (open.fedml.ai).
+FedML MLOps provides:
+- Client agent installation and login
+- Collaborator invitation and group management
+- Project management
+- Experiment tracking (visualizing training results)
+- Device status monitoring
+- System performance visualization (including profiling flow chart)
+- Distributed logging
+- Model serving
diff --git a/...on/examples/cross_silo/mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/__init__.py b/...on/examples/cross_silo/mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/__init__.py
diff --git a/...les/cross_silo/mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/config/bootstrap.sh b/...les/cross_silo/mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/config/bootstrap.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+# pip install fedml==0.7.15
+#pip install --upgrade fedml
+
+### don't modify this part ###
+echo "[FedML]Bootstrap Finished"
+##############################
diff --git a/...mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/config/foolsgold/fedml_config.yaml b/...mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/config/foolsgold/fedml_config.yaml
@@ -0,0 +1,65 @@
+common_args:
+  training_type: "cross_silo"
+  scenario: "horizontal"
+  using_mlops: false
+  random_seed: 0
+  config_version: release
+
+environment_args:
+  bootstrap: config/bootstrap.sh
+
+data_args:
+  dataset: "cifar10"
+  data_cache_dir: ~/fedml_data
+  partition_method: "homo"
+  partition_alpha: 0.5
+
+model_args:
+  model: "resnet56"
+  model_file_cache_folder: "./model_file_cache" # will be filled by the server automatically
+  global_model_file_path: "./model_file_cache/global_model.pt"
+
+train_args:
+  federated_optimizer: "FedAvg"
+  # for CLI running, this can be None; in MLOps deployment, `client_id_list` will be replaced with real-time selected devices
+  client_id_list:
+  # for FoolsGold Defense, if use_memory is true, then client_num_in_total should be equal to client_num_per_round
+  client_num_in_total: 8
+  client_num_per_round: 8
+  comm_round: 10
+  epochs: 1
+  batch_size: 10
+  client_optimizer: sgd
+  learning_rate: 0.03
+  weight_decay: 0.001
+
+validation_args:
+  frequency_of_the_test: 1
+
+device_args:
+  worker_num: 8
+  using_gpu: true
+  gpu_mapping_file: config/foolsgold/gpu_mapping.yaml
+  gpu_mapping_key: mapping_default
+
+comm_args:
+  backend: "MPI"
+
+tracking_args:
+  # the default log path is at ~/fedml-client/fedml/logs/ and ~/fedml-server/fedml/logs/
+  enable_wandb: false
+  wandb_key: ee0b5f53d949c84cee7decbe7a629e63fb2f8408
+  wandb_project: fedml
+  wandb_name: fedml_torch_fedavg_mnist_lr
+
+
+attack_args:
+  enable_attack: true
+  attack_type: byzantine
+  attack_mode: random
+  byzantine_client_num: 1
+
+# for FoolsGold Defense, if use_memory is true, then client_num_in_total should be equal to client_num_per_round
+defense_args:
+  enable_defense: true
+  defense_type: foolsgold
diff --git a/.../mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/config/foolsgold/gpu_mapping.yaml b/.../mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/config/foolsgold/gpu_mapping.yaml
@@ -0,0 +1,3 @@
+# this is used for 8 clients and 1 server training within a single machine which has 4 GPUs (9 processes in total).
+mapping_default:
+  host1: [3, 2, 2, 2]  # assume we only have 4 GPUs
diff --git a/...cross_silo/mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/config/gpu_mapping.yaml b/...cross_silo/mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/config/gpu_mapping.yaml
@@ -0,0 +1,60 @@
+# You can define a cluster containing multiple GPUs within multiple machines by defining `gpu_mapping.yaml` as follows:
+
+# config_cluster0:
+#     host_name_node0: [num_of_processes_on_GPU0, num_of_processes_on_GPU1, num_of_processes_on_GPU2, num_of_processes_on_GPU3, ..., num_of_processes_on_GPU_n]
+#     host_name_node1: [num_of_processes_on_GPU0, num_of_processes_on_GPU1, num_of_processes_on_GPU2, num_of_processes_on_GPU3, ..., num_of_processes_on_GPU_n]
+#     host_name_node_m: [num_of_processes_on_GPU0, num_of_processes_on_GPU1, num_of_processes_on_GPU2, num_of_processes_on_GPU3, ..., num_of_processes_on_GPU_n]
+
+
+# this is used for 10 clients and 1 server training within a single machine which has 4 GPUs
+mapping_default:
+    ChaoyangHe-GPU-RTX2080Tix4: [3, 3, 3, 2]
+
+# this is used for 4 clients and 1 server training within a single machine which has 4 GPUs
+mapping_config1_5:
+    host1: [2, 1, 1, 1]
+
+# this is used for 10 clients and 1 server training within a single machine which has 4 GPUs
+mapping_config2_11:
+    host1: [3, 3, 3, 2]
+
+# this is used for 10 clients and 1 server training within a single machine which has 8 GPUs
+mapping_config3_11:
+    host1: [2, 2, 2, 1, 1, 1, 1, 1]
+
+# this is used for 4 clients and 1 server training within a single machine which has 8 GPUs, where you want to skip some GPU device IDs (here GPUs 1, 2 and 5 get no process).
+mapping_config4_5:
+    host1: [1, 0, 0, 1, 1, 0, 1, 1]
+
+# this is used for 4 clients and 1 server training using 5 machines; each machine has 2 GPUs, but you want to use only the second GPU.
+mapping_config5_6:
+    host1: [0, 1]
+    host2: [0, 1]
+    host3: [0, 1]
+    host4: [0, 1]
+    host5: [0, 1]
+# this is used for 4 clients and 1 server training using 2 machines; each machine has 2 GPUs and both GPUs are used.
+mapping_config5_2:
+    gpu-worker2: [1,1]
+    gpu-worker1: [2,1]
+
+# this is used for 10 clients and 1 server training using 4 machines; each machine has 2 GPUs and both GPUs are used.
+mapping_config5_4:
+    gpu-worker2: [1,1]
+    gpu-worker1: [2,1]
+    gpu-worker3: [3,1]
+    gpu-worker4: [1,1]
+
+# for grpc GPU mapping
+mapping_FedML_gRPC:
+    hostname_node_server: [1]
+    hostname_node_1: [1, 0, 0, 0]
+    hostname_node_2: [1, 0, 0, 0]
+
+# for torch RPC GPU mapping
+mapping_FedML_tRPC:
+    lambda-server1: [0, 0, 0, 0, 2, 2, 1, 1]
+    lambda-server2: [2, 1, 1, 1, 0, 0, 0, 0]
+
+#mapping_FedML_tRPC:
+#    lambda-server1: [0, 0, 0, 0, 3, 3, 3, 2]
diff --git a/.../examples/cross_silo/mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/run_client.sh b/.../examples/cross_silo/mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/run_client.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+RANK=$1
+RUN_ID=$2
+python3 torch_client.py --cf config/foolsgold/fedml_config.yaml --rank $RANK --role client --run_id $RUN_ID
diff --git a/python/examples/cross_silo/mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/run_mpi.sh b/python/examples/cross_silo/mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/run_mpi.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+
+WORKER_NUM=$1
+
+PROCESS_NUM=`expr $WORKER_NUM + 1`
+echo $PROCESS_NUM
+
+hostname > mpi_host_file
+
+mpirun -np $PROCESS_NUM \
+-hostfile mpi_host_file \
+python torch_mpi.py --cf config/foolsgold/fedml_config.yaml
diff --git a/.../examples/cross_silo/mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/run_server.sh b/.../examples/cross_silo/mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/run_server.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+RUN_ID=$1
+python3 torch_server.py --cf config/foolsgold/fedml_config.yaml --rank 0 --role server --run_id $RUN_ID
diff --git a/...xamples/cross_silo/mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/torch_client.py b/...xamples/cross_silo/mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/torch_client.py
@@ -0,0 +1,31 @@
+import logging
+
+import fedml
+from fedml import FedMLRunner
+from fedml.model.cv.resnet import resnet56
+
+
+def create_model():
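+    # please download the pre-trained ResNet-56 weight file (best.pth) from the URL below and rename it to "resnet56_on_cifar10.pth":
+    # https://github.com/FedML-AI/FedML/blob/fedml_v0.6_before_fundraising/fedml_api/model/cv/pretrained/CIFAR10/resnet56/best.pth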
+    pre_trained_model_path = "./config/resnet56_on_cifar10.pth"
+    model = resnet56(10, pretrained=True, path=pre_trained_model_path)
+    logging.info("load pretrained model successfully")
+    return model
+
+
+if __name__ == "__main__":
+    args = fedml.init()
+
+    # init device
+    device = fedml.device.get_device(args)
+
+    # load data
+    dataset, output_dim = fedml.data.load(args)
+
+    # load model
+    model = create_model()
+
+    # start training
+    fedml_runner = FedMLRunner(args, device, dataset, model)
+    fedml_runner.run()
diff --git a/...n/examples/cross_silo/mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/torch_mpi.py b/...n/examples/cross_silo/mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/torch_mpi.py
@@ -0,0 +1,34 @@
+import logging
+import fedml
+from fedml import FedMLRunner
+from fedml.model.cv.resnet import resnet56
+
+
+def create_model():
+    """
+    load pretrained model...
+    please download the pre-trained weight file from
+    https://github.com/FedML-AI/FedML/blob/fedml_v0.6_before_fundraising/fedml_api/model/cv/pretrained/CIFAR10/resnet56/best.pth
+    and rename the file to ``resnet56_on_cifar10.pth''
+    """
+    pre_trained_model_path = "./config/resnet56_on_cifar10.pth"
+    model = resnet56(10, pretrained=True, path=pre_trained_model_path)
+    logging.info("load pretrained model successfully")
+    return model
+
+
+if __name__ == "__main__":
+    args = fedml.init()
+
+    # init device
+    device = fedml.device.get_device(args)
+
+    # load data
+    dataset, output_dim = fedml.data.load(args)
+
+    # load model
+    model = fedml.model.create(args, output_dim)
+
+    # start training
+    fedml_runner = FedMLRunner(args, device, dataset, model)
+    fedml_runner.run()
diff --git a/...xamples/cross_silo/mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/torch_server.py b/...xamples/cross_silo/mqtt_s3_fedavg_attack_defense_cifar10_resnet56_example/torch_server.py
@@ -0,0 +1,31 @@
+import logging
+
+import fedml
+from fedml import FedMLRunner
+from fedml.model.cv.resnet import resnet56
+
+
+def create_model():
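+    # please download the pre-trained ResNet-56 weight file (best.pth) from the URL below and rename it to "resnet56_on_cifar10.pth":
+    # https://github.com/FedML-AI/FedML/blob/fedml_v0.6_before_fundraising/fedml_api/model/cv/pretrained/CIFAR10/resnet56/best.pth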
+    pre_trained_model_path = "./config/resnet56_on_cifar10.pth"
+    model = resnet56(10, pretrained=True, path=pre_trained_model_path)
+    logging.info("load pretrained model successfully")
+    return model
+
+
+if __name__ == "__main__":
+    args = fedml.init()
+
+    # init device
+    device = fedml.device.get_device(args)
+
+    # load data
+    dataset, output_dim = fedml.data.load(args)
+
+    # load model
+    model = create_model()
+
+    # start training
+    fedml_runner = FedMLRunner(args, device, dataset, model)
+    fedml_runner.run()
diff --git a/...les/cross_silo/mqtt_s3_fedavg_defense_mnist_lr_example/config/foolsgold/fedml_config.yaml b/...les/cross_silo/mqtt_s3_fedavg_defense_mnist_lr_example/config/foolsgold/fedml_config.yaml
@@ -9,13 +9,13 @@ environment_args:
   bootstrap: config/bootstrap.sh
 
 data_args:
-  dataset: "cifar10"
+  dataset: "mnist"
   data_cache_dir: ~/fedml_data
   partition_method: "homo"
   partition_alpha: 0.5
 
 model_args:
-  model: "resnet56"
+  model: "lr"
   model_file_cache_folder: "./model_file_cache" # will be filled by the server automatically
   global_model_file_path: "./model_file_cache/global_model.pt"
 
@@ -24,8 +24,8 @@ train_args:
   # for CLI running, this can be None; in MLOps deployment, `client_id_list` will be replaced with real-time selected devices
   client_id_list:
   # for FoolsGold Defense, if use_memory is true, then client_num_in_total should be equal to client_num_per_round
-  client_num_in_total: 8
-  client_num_per_round: 8
+  client_num_in_total: 4
+  client_num_per_round: 4
   comm_round: 10
   epochs: 1
   batch_size: 10
@@ -37,13 +37,13 @@ validation_args:
   frequency_of_the_test: 1
 
 device_args:
-  worker_num: 8
-  using_gpu: true
+  worker_num: 4
+  using_gpu: false
   gpu_mapping_file: config/foolsgold/gpu_mapping.yaml
   gpu_mapping_key: mapping_default
 
 comm_args:
-  backend: "MPI"
+  backend: "MQTT_S3"
 
 tracking_args:
   # the default log path is at ~/fedml-client/fedml/logs/ and ~/fedml-server/fedml/logs/

diff --git a/...xamples/cross_silo/mqtt_s3_fedavg_defense_mnist_lr_example/config/resnet56_on_cifar10.pth b/...xamples/cross_silo/mqtt_s3_fedavg_defense_mnist_lr_example/config/resnet56_on_cifar10.pth
diff --git a/python/examples/cross_silo/mqtt_s3_fedavg_defense_mnist_lr_example/torch_client.py b/python/examples/cross_silo/mqtt_s3_fedavg_defense_mnist_lr_example/torch_client.py
@@ -1,13 +1,16 @@
 import logging
-
 import fedml
 from fedml import FedMLRunner
 from fedml.model.cv.resnet import resnet56
 
 
 def create_model():
-    # please download the pre-trained weight file from
-    # https://github.com/chenyaofo/pytorch-cifar-models/releases/download/resnet/cifar10_resnet44-2a3cabcb.pt
+    """
+    loading pretrained model...
+    please download the pre-trained weight file from
+    https://github.com/FedML-AI/FedML/blob/fedml_v0.6_before_fundraising/fedml_api/model/cv/pretrained/CIFAR10/resnet56/best.pth
+    and rename the file to ``resnet56_on_cifar10.pth''
+    """
     pre_trained_model_path = "./config/resnet56_on_cifar10.pth"
     model = resnet56(10, pretrained=True, path=pre_trained_model_path)
     logging.info("load pretrained model successfully")
@@ -24,7 +27,7 @@ def create_model():
     dataset, output_dim = fedml.data.load(args)
 
     # load model
-    model = create_model()
+    model = fedml.model.create(args, output_dim)
 
     # start training
     fedml_runner = FedMLRunner(args, device, dataset, model)