
Commit 12fe2fc

Fix federated learning demos and tests (dmlc#9488)

1 parent: b2e93d2
10 files changed: +60 -11 lines

demo/nvflare/.gitignore

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+!config

(new file: NVFlare client job configuration; the filename is not preserved in this view)

Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
+{
+  "format_version": 2,
+  "executors": [
+    {
+      "tasks": [
+        "train"
+      ],
+      "executor": {
+        "path": "trainer.XGBoostTrainer",
+        "args": {
+          "server_address": "localhost:9091",
+          "world_size": 2,
+          "server_cert_path": "server-cert.pem",
+          "client_key_path": "client-key.pem",
+          "client_cert_path": "client-cert.pem",
+          "use_gpus": false
+        }
+      }
+    }
+  ],
+  "task_result_filters": [],
+  "task_data_filters": []
+}
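
NVFlare instantiates the executor from this config by importing the class named in "path" and passing the "args" map to its constructor as keyword arguments, so the JSON keys must match the executor's __init__ signature exactly. That is why the new "use_gpus" key is paired with the use_gpus parameter added to demo/nvflare/vertical/custom/trainer.py below. A minimal sketch of the resulting call (illustrative only; NVFlare performs the instantiation itself):

```python
# Illustrative only: NVFlare imports "path" and passes "args" as
# keyword arguments, roughly equivalent to this direct call.
from trainer import XGBoostTrainer  # the demo's custom executor

executor = XGBoostTrainer(
    server_address="localhost:9091",
    world_size=2,                        # two participating sites
    server_cert_path="server-cert.pem",
    client_key_path="client-key.pem",
    client_cert_path="client-cert.pem",
    use_gpus=False,  # new key; omitting it would now raise a TypeError
)
```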

(new file: NVFlare server job configuration; the filename is not preserved in this view)

Lines changed: 22 additions & 0 deletions

@@ -0,0 +1,22 @@
+{
+  "format_version": 2,
+  "server": {
+    "heart_beat_timeout": 600
+  },
+  "task_data_filters": [],
+  "task_result_filters": [],
+  "workflows": [
+    {
+      "id": "server_workflow",
+      "path": "controller.XGBoostController",
+      "args": {
+        "port": 9091,
+        "world_size": 2,
+        "server_key_path": "server-key.pem",
+        "server_cert_path": "server-cert.pem",
+        "client_cert_path": "client-cert.pem"
+      }
+    }
+  ],
+  "components": []
+}
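
The server-side workflow mirrors the client config: "controller.XGBoostController" receives these args and uses them to stand up the federated gRPC server that the trainers connect to on port 9091. A hedged sketch of what that amounts to, assuming the controller delegates to xgboost.federated.run_federated_server (the exact signature may differ between XGBoost versions):

```python
# Hedged sketch: start the federated gRPC server with the values from
# the "args" block above. Assumes xgboost.federated.run_federated_server
# from a build with the federated plugin enabled; verify the exact API
# against your XGBoost version.
import xgboost.federated

xgboost.federated.run_federated_server(
    port=9091,
    world_size=2,                      # number of connecting clients
    server_key_path="server-key.pem",  # TLS material for secure gRPC
    server_cert_path="server-cert.pem",
    client_cert_path="client-cert.pem",
)
```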

demo/nvflare/horizontal/README.md

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@ This directory contains a demo of Horizontal Federated Learning using
 ## Training with CPU only
 
 To run the demo, first build XGBoost with the federated learning plugin enabled (see the
-[README](../../plugin/federated/README.md)).
+[README](../../../plugin/federated/README.md)).
 
 Install NVFlare (note that currently NVFlare only supports Python 3.8):
 ```shell

demo/nvflare/horizontal/prepare_data.sh

Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@ split -n l/${world_size} --numeric-suffixes=1 -a 1 ../../data/agaricus.txt.test
 
 nvflare poc -n 2 --prepare
 mkdir -p /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
-cp -fr config custom /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
+cp -fr ../config custom /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
 cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/
 for (( site=1; site<=world_size; site++ )); do
   cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"$site"/

demo/nvflare/vertical/README.md

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@ This directory contains a demo of Vertical Federated Learning using
 ## Training with CPU only
 
 To run the demo, first build XGBoost with the federated learning plugin enabled (see the
-[README](../../plugin/federated/README.md)).
+[README](../../../plugin/federated/README.md)).
 
 Install NVFlare (note that currently NVFlare only supports Python 3.8):
 ```shell

demo/nvflare/vertical/custom/trainer.py

Lines changed: 4 additions & 1 deletion

@@ -16,7 +16,7 @@ class SupportedTasks(object):
 
 class XGBoostTrainer(Executor):
     def __init__(self, server_address: str, world_size: int, server_cert_path: str,
-                 client_key_path: str, client_cert_path: str):
+                 client_key_path: str, client_cert_path: str, use_gpus: bool):
         """Trainer for federated XGBoost.
 
         Args:
@@ -32,6 +32,7 @@ def __init__(self, server_address: str, world_size: int, server_cert_path: str,
         self._server_cert_path = server_cert_path
         self._client_key_path = client_key_path
         self._client_cert_path = client_cert_path
+        self._use_gpus = use_gpus
 
     def execute(self, task_name: str, shareable: Shareable, fl_ctx: FLContext,
                 abort_signal: Signal) -> Shareable:
@@ -81,6 +82,8 @@ def _do_training(self, fl_ctx: FLContext):
             'objective': 'binary:logistic',
             'eval_metric': 'auc',
         }
+        if self._use_gpus:
+            self.log_info(fl_ctx, 'GPUs are not currently supported by vertical federated XGBoost')
 
         # specify validations set to watch performance
         watchlist = [(dtest, "eval"), (dtrain, "train")]
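
For orientation, these constructor arguments feed the federated communicator environment that each worker passes to xgb.collective.CommunicatorContext, the same mechanism the updated test below uses. A hedged sketch follows; the exact key names are assumptions based on the federated plugin of this period, not text from this commit:

```python
# Hedged sketch of a federated worker's communicator setup; the key
# names are assumed and should be checked against the plugin README.
import xgboost as xgb

communicator_env = {
    'xgboost_communicator': 'federated',
    'federated_server_address': 'localhost:9091',
    'federated_world_size': 2,
    'federated_rank': 0,                        # this site's rank
    'federated_server_cert': 'server-cert.pem',
    'federated_client_key': 'client-key.pem',
    'federated_client_cert': 'client-cert.pem',
}

with xgb.collective.CommunicatorContext(**communicator_env):
    # DMatrix construction and xgb.train(...) belong here; collective
    # communication is only valid inside this context.
    pass
```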

demo/nvflare/vertical/prepare_data.sh

Lines changed: 1 addition & 1 deletion

@@ -56,7 +56,7 @@ fi
 
 nvflare poc -n 2 --prepare
 mkdir -p /tmp/nvflare/poc/admin/transfer/vertical-xgboost
-cp -fr config custom /tmp/nvflare/poc/admin/transfer/vertical-xgboost
+cp -fr ../config custom /tmp/nvflare/poc/admin/transfer/vertical-xgboost
 cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/
 for (( site=1; site<=world_size; site++ )); do
   cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"${site}"/

tests/test_distributed/test_federated/runtests-federated.sh

Lines changed: 2 additions & 2 deletions

@@ -11,7 +11,7 @@ openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout server-key.pem -out se
 openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout client-key.pem -out client-cert.pem -subj "/C=US/CN=localhost"
 
 # Split train and test files manually to simulate a federated environment.
-split -n l/"${world_size}" -d ../../demo/data/agaricus.txt.train agaricus.txt.train-
-split -n l/"${world_size}" -d ../../demo/data/agaricus.txt.test agaricus.txt.test-
+split -n l/"${world_size}" -d ../../../demo/data/agaricus.txt.train agaricus.txt.train-
+split -n l/"${world_size}" -d ../../../demo/data/agaricus.txt.test agaricus.txt.test-
 
 python test_federated.py "${world_size}"

tests/test_distributed/test_federated/test_federated.py

Lines changed: 4 additions & 4 deletions

@@ -35,14 +35,14 @@ def run_worker(port: int, world_size: int, rank: int, with_ssl: bool, with_gpu:
     # Always call this before using distributed module
     with xgb.collective.CommunicatorContext(**communicator_env):
         # Load file, file will not be sharded in federated mode.
-        dtrain = xgb.DMatrix('agaricus.txt.train-%02d' % rank)
-        dtest = xgb.DMatrix('agaricus.txt.test-%02d' % rank)
+        dtrain = xgb.DMatrix('agaricus.txt.train-%02d?format=libsvm' % rank)
+        dtest = xgb.DMatrix('agaricus.txt.test-%02d?format=libsvm' % rank)
 
         # Specify parameters via map, definition are same as c++ version
         param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
         if with_gpu:
-            param['tree_method'] = 'gpu_hist'
-            param['gpu_id'] = rank
+            param['tree_method'] = 'hist'
+            param['device'] = f"cuda:{rank}"
 
         # Specify validations set to watch performance
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
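
Both changes track API updates in newer XGBoost: DMatrix no longer guesses the format of text files, so the URI must name it explicitly, and the gpu_hist tree method plus gpu_id are superseded by the single hist method with a device parameter. A standalone illustration (the file name simply mirrors the split output above):

```python
# Standalone illustration of the two updated idioms used by the test.
import xgboost as xgb

# Text inputs must declare their format in the DMatrix URI.
dtrain = xgb.DMatrix('agaricus.txt.train-00?format=libsvm')

param = {
    'max_depth': 2,
    'eta': 1,
    'objective': 'binary:logistic',
    'tree_method': 'hist',  # one tree method for both CPU and GPU
    'device': 'cuda:0',     # replaces tree_method='gpu_hist', gpu_id=0
}
booster = xgb.train(param, dtrain, num_boost_round=2)
```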
