You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
## If not working, investigate why, and relaunch a new worker
1132
-
if (inherits(res, "error") ||!identical(res, truth)) {
1133
-
if (debug) {
1134
-
mdebug("Worker is non-functional")
1135
-
if (inherits(res, "error")) {
1136
-
mdebug("Error received: ", conditionMessage(res))
1137
-
} else {
1138
-
mdebug("Result received: ", sQuote(res))
1124
+
maxTries<-3L
1125
+
for (kkinmaxTries:1) {
1126
+
okay<-TRUE
1127
+
res<- tryCatch({
1128
+
suppressWarnings({
1129
+
clusterCall(cl=cl, identity, truth)[[1]]
1130
+
})
1131
+
}, error=identity)
1132
+
1133
+
## If not working, investigate why, and relaunch a new worker
1134
+
if (inherits(res, "error") ||!identical(res, truth)) {
1135
+
if (debug) {
1136
+
mdebug("Worker is non-functional")
1137
+
if (inherits(res, "error")) {
1138
+
mdebug("Error received: ", conditionMessage(res))
1139
+
} else {
1140
+
mdebug("Result received: ", sQuote(res))
1141
+
}
1139
1142
}
1140
-
}
1141
-
okay<-FALSE
1142
-
1143
-
## Is the connection working?
1144
-
node<-cl[[1]]
1145
-
con<-node[["con"]]
1146
-
connectionOkay<-NA
1147
-
if (inherits(con, "connection")) {
1148
-
connectionOkay<- isConnectionValid(con)
1149
-
if (debug) mdebug("Connection is valid: ", connectionOkay)
1150
-
}
1151
-
1152
-
if (is.na(connectionOkay) ||connectionOkay) {
1153
-
## If the node does not use a connection, or the connection is working,
1154
-
## we can only assume the worker is also alive. If so, we should try to
1155
-
## kill the worker.
1156
-
res<- suppressWarnings(killNode(node))
1157
-
if (debug) mdebugf("Killed %s: %s", class(node)[1], res)
1158
-
} else {
1159
-
## If connection is not working, we could assume the worker is no longer
1160
-
## alive, but it could also be a network issue. In either case, we
1161
-
## should try to kill it, just in case.
1162
-
res<- suppressWarnings(killNode(node))
1163
-
if (debug) mdebugf("Killed %s: %s", class(node)[1], res)
1164
-
}
1165
-
if (kk==1L) {
1166
-
stop(FutureError(sprintf("Failed to find a functional cluster worker, after attempting to relaunch the parallel worker %d times", maxTries)))
1167
-
}
1168
-
} else {
1169
-
if (debug) mdebug("Worker is functional")
1170
-
break
1171
-
}
1143
+
okay<-FALSE
1172
1144
1173
-
## Relaunch worker?
1174
-
if (!okay) {
1175
-
if (debug) mdebugf_push("Restarting non-alive cluster node %d ...", node_idx)
1176
-
node2<- tryCatch({
1177
-
cloneNode(node)
1178
-
}, error=identity)
1179
-
if (inherits(node2, "error")) {
1180
-
msg<- sprintf("One of the future workers of class %s, part of a cluster of class %s, was interrupted and attempts to relaunch it failed", sQuote(class(node)[1]), sQuote(class(cl)[1]))
1181
-
if (inherits(node, c("SOCKnode", "SOCK0node")) &&
1182
-
!inherits(node, c("RichSOCKnode"))) {
1183
-
msg<- sprintf("%s. If you created your cluster with parallel::makeCluster(), try with parallelly::makeClusterPSOCK() instead", msg)
1145
+
## Is the connection working?
1146
+
node<-cl[[1]]
1147
+
con<-node[["con"]]
1148
+
connectionOkay<-NA
1149
+
if (inherits(con, "connection")) {
1150
+
connectionOkay<- isConnectionValid(con)
1151
+
if (debug) mdebug("Connection is valid: ", connectionOkay)
1152
+
}
1153
+
1154
+
if (is.na(connectionOkay) ||connectionOkay) {
1155
+
## If the node does not use a connection, or the connection is working,
1156
+
## we can only assume the worker is also alive. If so, we should try to
1157
+
## kill the worker.
1158
+
res<- suppressWarnings(killNode(node))
1159
+
if (debug) mdebugf("Killed %s: %s", class(node)[1], res)
1160
+
} else {
1161
+
## If connection is not working, we could assume the worker is no longer
1162
+
## alive, but it could also be a network issue. In either case, we
1163
+
## should try to kill it, just in case.
1164
+
res<- suppressWarnings(killNode(node))
1165
+
if (debug) mdebugf("Killed %s: %s", class(node)[1], res)
1166
+
}
1167
+
if (kk==1L) {
1168
+
stop(FutureError(sprintf("Failed to find a functional cluster worker, after attempting to relaunch the parallel worker %d times", maxTries)))
1184
1169
}
1185
-
msg<- sprintf("%s. The reported reason was: %s", msg, conditionMessage(node2))
1186
-
stop(FutureError(msg))
1187
1170
} else {
1188
-
node<-node2
1171
+
if (debug) mdebug("Worker is functional")
1172
+
break
1189
1173
}
1190
1174
1191
-
cl[[1]] <-node
1192
-
1193
-
workers[[node_idx]] <-node
1194
-
backend[["workers"]] <-workers
1195
-
1196
-
if (debug) {
1197
-
mdebug("Re-launched cluster node:")
1198
-
mprint(node)
1199
-
mdebugf_pop()
1175
+
## Relaunch worker?
1176
+
if (!okay) {
1177
+
if (debug) mdebugf_push("Restarting non-alive cluster node %d ...", node_idx)
1178
+
node2<- tryCatch({
1179
+
cloneNode(node)
1180
+
}, error=identity)
1181
+
if (inherits(node2, "error")) {
1182
+
msg<- sprintf("One of the future workers of class %s, part of a cluster of class %s, was interrupted and attempts to relaunch it failed", sQuote(class(node)[1]), sQuote(class(cl)[1]))
1183
+
if (inherits(node, c("SOCKnode", "SOCK0node")) &&
1184
+
!inherits(node, c("RichSOCKnode"))) {
1185
+
msg<- sprintf("%s. If you created your cluster with parallel::makeCluster(), try with parallelly::makeClusterPSOCK() instead", msg)
1186
+
}
1187
+
msg<- sprintf("%s. The reported reason was: %s", msg, conditionMessage(node2))
1188
+
stop(FutureError(msg))
1189
+
} else {
1190
+
node<-node2
1191
+
}
1192
+
1193
+
cl[[1]] <-node
1194
+
1195
+
workers[[node_idx]] <-node
1196
+
backend[["workers"]] <-workers
1197
+
1198
+
if (debug) {
1199
+
mdebug("Re-launched cluster node:")
1200
+
mprint(node)
1201
+
mdebugf_pop()
1202
+
}
1200
1203
}
1201
-
}
1202
-
1203
-
## Try again
1204
-
Sys.sleep(0.1)
1205
-
} ## for (kk in maxTries:1)
1204
+
1205
+
## Try again
1206
+
Sys.sleep(0.1)
1207
+
} ## for (kk in maxTries:1)
1208
+
} ## if (validateWorker)
1206
1209
1207
1210
## Assert that there is no other registered future that is using
0 commit comments