Skip to content

Commit fc4f9f5

Browse files
authored
[Distributed] Fix ps address list sort by index. (#945)
Signed-off-by: 泊霆 <[email protected]>
1 parent c2e664a commit fc4f9f5

File tree

4 files changed

+14
-8
lines changed

4 files changed

+14
-8
lines changed

tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ limitations under the License.
1717
#include <memory>
1818
#include <vector>
1919

20+
#include <google/protobuf/map.h>
2021
#include "include/json/json.h"
2122
#include "grpc/support/alloc.h"
2223
#include "grpcpp/grpcpp.h"
@@ -89,7 +90,7 @@ Status ElasticGrpcServer::UpdateServerDef(const string& cluster_def_str, int& be
8990
return errors::Internal("PARSE TF_CONFIG/cluster ERROR");
9091
}
9192

92-
std::unordered_set<string> ps_addrs_vec;
93+
std::set<string> ps_addrs_vec; //ordered
9394
after_part_num = cluster_json["cluster"]["ps"].size();
9495
for (auto& value: cluster_json["cluster"]["ps"]) {
9596
ps_addrs_vec.emplace(value.asString());
@@ -111,21 +112,25 @@ Status ElasticGrpcServer::UpdateServerDef(const string& cluster_def_str, int& be
111112
}
112113
for (auto ps_addr: ps_addrs_vec) {
113114
if (target_string_set.find(ps_addr) == target_string_set.end()) {
114-
job->mutable_tasks()->insert({idx, ps_addr});
115+
job->mutable_tasks()->insert({idx++, ps_addr});
115116
tf_config_json["cluster"]["ps"].append(ps_addr);
116117
}
117118
}
118119
break;
119120
} else {
120121
LOG(INFO) << "SCALING DOWN, partition_num is: " << after_part_num;
122+
google::protobuf::Map< google::protobuf::int32, std::string > tasks;
123+
Json::Value arr_value(Json::arrayValue);
124+
int idx = 0;
121125
for (int i = 0; i < before_part_num; ++i) {
122126
string tmp_string = tf_config_json["cluster"]["ps"][i].asString();
123-
if (ps_addrs_vec.find(tmp_string) == ps_addrs_vec.end()) {
124-
Json::Value ps_addr;
125-
tf_config_json["cluster"]["ps"].removeIndex(i, &ps_addr);
126-
job->mutable_tasks()->erase(i);
127+
if (ps_addrs_vec.find(tmp_string) != ps_addrs_vec.end()) {
128+
arr_value.append(tf_config_json["cluster"]["ps"][i]);
129+
tasks[idx++] = tmp_string;
127130
}
128131
}
132+
tf_config_json["cluster"]["ps"].swap(arr_value);
133+
job->mutable_tasks()->swap(tasks);
129134
}
130135
}
131136
}

tensorflow/contrib/elastic_grpc_server/elastic_service.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ limitations under the License.
2424
#include <grpcpp/server.h>
2525
#include "grpcpp/server_builder.h"
2626

27-
using namespace des;
27+
using namespace deeprec;
2828

2929
using grpc::Server;
3030
using grpc::ServerAsyncResponseWriter;

tensorflow/core/protobuf/elastic_training.proto

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
syntax = "proto3";
22

3-
package des;
3+
package deeprec;
44

55
enum Code {
66
OK = 0;

tensorflow/python/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4747,6 +4747,7 @@ py_library(
47474747
":platform",
47484748
":protos_all_py",
47494749
":session_run_hook",
4750+
"//tensorflow/core:elastic_service_pb_py",
47504751
":training_util",
47514752
":util",
47524753
],

0 commit comments

Comments
 (0)