Merge pull request #566 from FedML-AI/dev/v0.7.0

Dev/v0.7.0
FedML-AI · Sep 11, 2022 · d5221b9 · d5221b9
2 parents e2e859c + b14723c
commit d5221b9
Show file tree

Hide file tree

Showing 81 changed files with 480 additions and 758 deletions.
diff --git a/...smoke_test_cross_silo_fedavg_dp_linux.yml → ...moke_test_cross_silo_fedavg_cdp_linux.yml b/...smoke_test_cross_silo_fedavg_dp_linux.yml → ...moke_test_cross_silo_fedavg_cdp_linux.yml
@@ -16,7 +16,7 @@ on:
 
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:
-  cross-silo-dp-test:
+  cross-silo-cdp-test:
     defaults:
       run:
         shell: bash
@@ -46,23 +46,23 @@ jobs:
 
       - name: server - cross-silo - cdp
         run: |
-          cd examples/cross_silo/mqtt_s3_fedavg_central_dp_mnist_lr_example
+          cd examples/cross_silo/mqtt_s3_fedavg_cdp_mnist_lr_example
           run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }}
           echo ${run_id}
           bash run_server.sh $run_id
         if:   ${{ matrix.client-index == '0' }}
 
       - name: client 1 - cross-silo - cdp
         run: |
-          cd examples/cross_silo/mqtt_s3_fedavg_central_dp_mnist_lr_example
+          cd examples/cross_silo/mqtt_s3_fedavg_cdp_mnist_lr_example
           run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }}
           echo ${run_id}
           bash run_client.sh 1 $run_id
         if:   ${{ matrix.client-index == '1' }}
 
       - name: client 2 - cross-silo - cdp
         run: |
-          cd examples/cross_silo/mqtt_s3_fedavg_central_dp_mnist_lr_example
+          cd examples/cross_silo/mqtt_s3_fedavg_cdp_mnist_lr_example
           run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }}
           echo ${run_id}
           bash run_client.sh 2 $run_id

diff --git a/.github/workflows/smoke_test_cross_silo_fedavg_ldp_linux.yml b/.github/workflows/smoke_test_cross_silo_fedavg_ldp_linux.yml
@@ -0,0 +1,69 @@
+# This is a basic workflow to help you get started with Actions
+
+name: LDP-Linux
+
+# Controls when the workflow will run
+on:
+  # Triggers the workflow on a daily schedule and on pull requests targeting the branches listed below
+  schedule:
+    # Daily build at 12:12 UTC (GitHub Actions evaluates cron schedules in UTC)
+    - cron: "12 12 */1 * *"
+  pull_request:
+    branches: [master, test/v0.7.0, dev/v0.7.0]
+
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+  cross-silo-ldp-test:
+    defaults:
+      run:
+        shell: bash
+        working-directory: python
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ ubuntu-latest]
+        arch: [X64]
+        python-version: ['3.8']
+        client-index: ['0', '1', '2']
+#        exclude:
+#          - os: macos-latest
+#            python-version: '3.8'
+#          - os: windows-latest
+#            python-version: '3.6'
+    runs-on: [self-hosted, runner-linux, devops]
+    timeout-minutes: 15
+    steps:
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - uses: actions/checkout@v3
+      - name: pip install -e ./
+        run: |
+          pip install -e ./
+
+      - name: server - cross-silo - ldp
+        run: |
+          cd examples/cross_silo/mqtt_s3_fedavg_ldp_mnist_lr_example
+          run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }}
+          echo ${run_id}
+          bash run_server.sh $run_id
+        if:   ${{ matrix.client-index == '0' }}
+
+      - name: client 1 - cross-silo - ldp
+        run: |
+          cd examples/cross_silo/mqtt_s3_fedavg_ldp_mnist_lr_example
+          run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }}
+          echo ${run_id}
+          bash run_client.sh 1 $run_id
+        if:   ${{ matrix.client-index == '1' }}
+
+      - name: client 2 - cross-silo - ldp
+        run: |
+          cd examples/cross_silo/mqtt_s3_fedavg_ldp_mnist_lr_example
+          run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }}
+          echo ${run_id}
+          bash run_client.sh 2 $run_id
+        if: ${{ matrix.client-index == '2' }}
diff --git a/.github/workflows/smoke_test_ml_engines_linux.yml b/.github/workflows/smoke_test_ml_engines_linux.yml
@@ -153,7 +153,6 @@ jobs:
 
       - name: server - mxnet - fedavg
         run: |
-          sudo apt-get install libquadmath0
           cd examples/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example
           run_id=mxnet-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }}
           echo ${run_id}
@@ -162,7 +161,6 @@ jobs:
 
       - name: client 1 - mxnet - fedavg
         run: |
-          sudo apt-get install libquadmath0
           cd examples/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example
           run_id=mxnet-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }}
           echo ${run_id}
@@ -171,7 +169,6 @@ jobs:
 
       - name: client 2 - mxnet - fedavg
         run: |
-          sudo apt-get install libquadmath0
           cd examples/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example
           run_id=mxnet-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }}
           echo ${run_id}

diff --git a/python/examples/README.md b/python/examples/README.md
@@ -34,7 +34,7 @@
 | mxnet_mqtt_s3_fedavg_mnist_lr_example      | Octopus (cross-silo) | FedAvg              | MNIST   | Logistic Regression | MQTT_S3               | mxnet                    | [Link](cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example)        | [Link](cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example/README.md)        |
 | mqtt_s3_fedavg_attack_mnist_lr_example     | Octopus (cross-silo) | FedAvg              | MNIST   | Logistic Regression | MQTT_S3               | pytorch                  | [Link](cross_silo/mqtt_s3_fedavg_attack_mnist_lr_example)       | [Link](cross_silo/mqtt_s3_fedavg_attack_mnist_lr_example/README.md)       |
 | mqtt_s3_fedavg_defense_mnist_lr_example    | Octopus (cross-silo) | FedAvg              | MNIST   | Logistic Regression | MQTT_S3               | pytorch                  | [Link](cross_silo/mqtt_s3_fedavg_defense_mnist_lr_example)      | [Link](cross_silo/mqtt_s3_fedavg_defense_mnist_lr_example/README.md)      |
-| mqtt_s3_fedavg_central_dp_mnist_lr_example | Octopus (cross-silo) | FedAvg              | MNIST   | Logistic Regression | MQTT_S3               | pytorch                  | [Link](cross_silo/mqtt_s3_fedavg_central_dp_mnist_lr_example)   | [Link](cross_silo/mqtt_s3_fedavg_central_dp_mnist_lr_example/README.md)   |
+| mqtt_s3_fedavg_cdp_mnist_lr_example        | Octopus (cross-silo) | FedAvg              | MNIST   | Logistic Regression | MQTT_S3               | pytorch                  | [Link](cross_silo/mqtt_s3_fedavg_cdp_mnist_lr_example)          | [Link](cross_silo/mqtt_s3_fedavg_cdp_mnist_lr_example/README.md)          |
 | mqtt_s3_fedavg_local_dp_mnist_lr_example   | Octopus (cross-silo) | FedAvg              | MNIST   | Logistic Regression | MQTT_S3               | pytorch                  | [Link](cross_silo/mqtt_s3_fedavg_local_dp_mnist_lr_example)     | [Link](cross_silo/mqtt_s3_fedavg_local_dp_mnist_lr_example/README.md)     |
 
 

diff --git a/...avg_central_dp_mnist_lr_example/README.md → ..._s3_fedavg_cdp_mnist_lr_example/README.md b/...avg_central_dp_mnist_lr_example/README.md → ..._s3_fedavg_cdp_mnist_lr_example/README.md
diff --git a/...g_central_dp_mnist_lr_example/__init__.py → ...3_fedavg_cdp_mnist_lr_example/__init__.py b/...g_central_dp_mnist_lr_example/__init__.py → ...3_fedavg_cdp_mnist_lr_example/__init__.py
diff --git a/...l_dp_mnist_lr_example/config/bootstrap.sh → ..._cdp_mnist_lr_example/config/bootstrap.sh b/...l_dp_mnist_lr_example/config/bootstrap.sh → ..._cdp_mnist_lr_example/config/bootstrap.sh
diff --git a/...mnist_lr_example/config/fedml_config.yaml → ...mnist_lr_example/config/fedml_config.yaml b/...mnist_lr_example/config/fedml_config.yaml → ...mnist_lr_example/config/fedml_config.yaml
@@ -24,7 +24,7 @@ train_args:
   client_id_list:
   client_num_in_total: 1000
   client_num_per_round: 2
-  comm_round: 50
+  comm_round: 5 # we use 5 for a quick sanity check; please change it to a reasonable value for real training
   epochs: 1
   batch_size: 10
   client_optimizer: sgd
@@ -54,7 +54,7 @@ tracking_args:
 # example:
 dp_args:
   enable_dp: true
-  dp_type: cdp # cdp or ldp
+  dp_solution_type: cdp
   epsilon: 0.5
   delta: 0.1
   sensitivity: 1

diff --git a/...central_dp_mnist_lr_example/run_client.sh → ...fedavg_cdp_mnist_lr_example/run_client.sh b/...central_dp_mnist_lr_example/run_client.sh → ...fedavg_cdp_mnist_lr_example/run_client.sh
diff --git a/...central_dp_mnist_lr_example/run_server.sh → ...fedavg_cdp_mnist_lr_example/run_server.sh b/...central_dp_mnist_lr_example/run_server.sh → ...fedavg_cdp_mnist_lr_example/run_server.sh
diff --git a/...ntral_dp_mnist_lr_example/torch_client.py → ...davg_cdp_mnist_lr_example/torch_client.py b/...ntral_dp_mnist_lr_example/torch_client.py → ...davg_cdp_mnist_lr_example/torch_client.py
diff --git a/...ntral_dp_mnist_lr_example/torch_server.py → ...davg_cdp_mnist_lr_example/torch_server.py b/...ntral_dp_mnist_lr_example/torch_server.py → ...davg_cdp_mnist_lr_example/torch_server.py
diff --git a/.../examples/cross_silo/mqtt_s3_fedavg_defense_mnist_lr_example/config/krum/gpu_mapping.yaml b/.../examples/cross_silo/mqtt_s3_fedavg_defense_mnist_lr_example/config/krum/gpu_mapping.yaml
@@ -0,0 +1,60 @@
+# You can define a cluster containing multiple GPUs within multiple machines by defining `gpu_mapping.yaml` as follows:
+
+# config_cluster0:
+#     host_name_node0: [num_of_processes_on_GPU0, num_of_processes_on_GPU1, num_of_processes_on_GPU2, num_of_processes_on_GPU3, ..., num_of_processes_on_GPU_n]
+#     host_name_node1: [num_of_processes_on_GPU0, num_of_processes_on_GPU1, num_of_processes_on_GPU2, num_of_processes_on_GPU3, ..., num_of_processes_on_GPU_n]
+#     host_name_node_m: [num_of_processes_on_GPU0, num_of_processes_on_GPU1, num_of_processes_on_GPU2, num_of_processes_on_GPU3, ..., num_of_processes_on_GPU_n]
+
+
+# this is used for 10 clients and 1 server training within a single machine which has 4 GPUs
+mapping_default:
+    ChaoyangHe-GPU-RTX2080Tix4: [3, 3, 3, 2]
+
+# this is used for 4 clients and 1 server training within a single machine which has 4 GPUs
+mapping_config1_5:
+    host1: [2, 1, 1, 1]
+
+# this is used for 10 clients and 1 server training within a single machine which has 4 GPUs
+mapping_config2_11:
+    host1: [3, 3, 3, 2]
+
+# this is used for 10 clients and 1 server training within a single machine which has 8 GPUs
+mapping_config3_11:
+    host1: [2, 2, 2, 1, 1, 1, 1, 1]
+
+# this is used for 4 clients and 1 server training within a single machine which has 8 GPUs, where you want to skip some GPU device IDs (an entry of 0 places no process on that GPU).
+mapping_config4_5:
+    host1: [1, 0, 0, 1, 1, 0, 1, 1]
+
+# this is used for 4 clients and 1 server training using 5 machines, each machine has 2 GPUs inside, but you only want to use the second GPU.
+mapping_config5_6:
+    host1: [0, 1]
+    host2: [0, 1]
+    host3: [0, 1]
+    host4: [0, 1]
+    host5: [0, 1]
+# this is used for 4 clients and 1 server training using 2 machines, each machine has 2 GPUs inside, and both GPUs are used.
+mapping_config5_2:
+    gpu-worker2: [1,1]
+    gpu-worker1: [2,1]
+
+# this is used for 10 clients and 1 server training using 4 machines, each machine has 2 GPUs inside, and both GPUs are used.
+mapping_config5_4:
+    gpu-worker2: [1,1]
+    gpu-worker1: [2,1]
+    gpu-worker3: [3,1]
+    gpu-worker4: [1,1]
+
+# for grpc GPU mapping
+mapping_FedML_gRPC:
+    hostname_node_server: [1]
+    hostname_node_1: [1, 0, 0, 0]
+    hostname_node_2: [1, 0, 0, 0]
+
+# for torch RPC GPU mapping
+mapping_FedML_tRPC:
+    lambda-server1: [0, 0, 0, 0, 2, 2, 1, 1]
+    lambda-server2: [2, 1, 1, 1, 0, 0, 0, 0]
+
+#mapping_FedML_tRPC:
+#    lambda-server1: [0, 0, 0, 0, 3, 3, 3, 2]
diff --git a/...r_example/custom_data_and_model/README.md → ..._s3_fedavg_ldp_mnist_lr_example/README.md b/...r_example/custom_data_and_model/README.md → ..._s3_fedavg_ldp_mnist_lr_example/README.md
diff --git a/...avg_local_dp_mnist_lr_example/__init__.py → ...3_fedavg_ldp_mnist_lr_example/__init__.py b/...avg_local_dp_mnist_lr_example/__init__.py → ...3_fedavg_ldp_mnist_lr_example/__init__.py
diff --git a/...custom_data_and_model/config/bootstrap.sh → ..._ldp_mnist_lr_example/config/bootstrap.sh b/...custom_data_and_model/config/bootstrap.sh → ..._ldp_mnist_lr_example/config/bootstrap.sh
diff --git a/...example/one_line/config/fedml_config.yaml → ...mnist_lr_example/config/fedml_config.yaml b/...example/one_line/config/fedml_config.yaml → ...mnist_lr_example/config/fedml_config.yaml
@@ -3,7 +3,6 @@ common_args:
   scenario: "horizontal"
   using_mlops: false
   random_seed: 0
-  config_version: release
 
 environment_args:
   bootstrap: config/bootstrap.sh
@@ -25,15 +24,15 @@ train_args:
   client_id_list:
   client_num_in_total: 1000
   client_num_per_round: 2
-  comm_round: 10
+  comm_round: 5 # we use 5 for a quick sanity check; please change it to a reasonable value for real training
   epochs: 1
   batch_size: 10
   client_optimizer: sgd
   learning_rate: 0.03
   weight_decay: 0.001
 
 validation_args:
-  frequency_of_the_test: 1
+  frequency_of_the_test: 5
 
 device_args:
   using_gpu: false
@@ -42,9 +41,8 @@ device_args:
 
 comm_args:
   backend: "MQTT_S3"
-  mqtt_config_path:
-  s3_config_path:
-  grpc_ipconfig_path: ./config/grpc_ipconfig.csv
+  mqtt_config_path: config/mqtt_config.yaml
+  s3_config_path: config/s3_config.yaml
 
 tracking_args:
   # the default log path is at ~/fedml-client/fedml/logs/ and ~/fedml-server/fedml/logs/
@@ -53,11 +51,10 @@ tracking_args:
   wandb_project: fedml
   wandb_name: fedml_torch_fedavg_mnist_lr
 
-
 # example:
 dp_args:
   enable_dp: true
-  dp_type: ldp # cdp or ldp
+  dp_solution_type: ldp
   epsilon: 0.5
   delta: 0.1
   sensitivity: 1

diff --git a/...ample/custom_data_and_model/run_client.sh → ...fedavg_ldp_mnist_lr_example/run_client.sh b/...ample/custom_data_and_model/run_client.sh → ...fedavg_ldp_mnist_lr_example/run_client.sh
@@ -1,3 +1,4 @@
 #!/usr/bin/env bash
 RANK=$1
-python3 torch_client.py --cf config/fedml_config.yaml --rank $RANK --role client
+RUN_ID=$2
+python3 torch_client.py --cf config/fedml_config.yaml --rank $RANK --role client --run_id $RUN_ID
diff --git a/...ist_lr_example/step_by_step/run_server.sh → ...fedavg_ldp_mnist_lr_example/run_server.sh b/...ist_lr_example/step_by_step/run_server.sh → ...fedavg_ldp_mnist_lr_example/run_server.sh
@@ -1,3 +1,3 @@
 #!/usr/bin/env bash
-
-python3 torch_server.py --cf config/fedml_config.yaml --rank 0 --role server
+RUN_ID=$1
+python3 torch_server.py --cf config/fedml_config.yaml --rank 0 --role server --run_id $RUN_ID
diff --git a/...t_lr_example/step_by_step/torch_client.py → ...davg_ldp_mnist_lr_example/torch_client.py b/...t_lr_example/step_by_step/torch_client.py → ...davg_ldp_mnist_lr_example/torch_client.py
diff --git a/...t_lr_example/step_by_step/torch_server.py → ...davg_ldp_mnist_lr_example/torch_server.py b/...t_lr_example/step_by_step/torch_server.py → ...davg_ldp_mnist_lr_example/torch_server.py
diff --git a/...les/cross_silo/mqtt_s3_fedavg_local_dp_mnist_lr_example/custom_data_and_model/__init__.py b/...les/cross_silo/mqtt_s3_fedavg_local_dp_mnist_lr_example/custom_data_and_model/__init__.py
diff --git a/...ss_silo/mqtt_s3_fedavg_local_dp_mnist_lr_example/custom_data_and_model/build_mlops_pkg.sh b/...ss_silo/mqtt_s3_fedavg_local_dp_mnist_lr_example/custom_data_and_model/build_mlops_pkg.sh
diff --git a/...o/mqtt_s3_fedavg_local_dp_mnist_lr_example/custom_data_and_model/config/fedml_config.yaml b/...o/mqtt_s3_fedavg_local_dp_mnist_lr_example/custom_data_and_model/config/fedml_config.yaml
diff --git a/...s/cross_silo/mqtt_s3_fedavg_local_dp_mnist_lr_example/custom_data_and_model/run_server.sh b/...s/cross_silo/mqtt_s3_fedavg_local_dp_mnist_lr_example/custom_data_and_model/run_server.sh