Skip to content

Commit 9fc3c01

Browse files
dcui authored and Sasha Levin committed
Tools: hv: Reopen the devices if read() or write() returns errors
The state machine in the hv_utils driver can run out of order in some corner cases, e.g. if the kvp daemon doesn't call write() fast enough for some reason, kvp_timeout_func() can run first and move the state to HVUTIL_READY; next, when kvp_on_msg() is called it returns -EINVAL since kvp_transaction.state is smaller than HVUTIL_USERSPACE_REQ; later, the daemon's write() gets an error -EINVAL, and the daemon will exit().

We can reproduce the issue by sending a SIGSTOP signal to the daemon, waiting for 1 minute, and then sending a SIGCONT signal to the daemon: the daemon will exit() quickly.

We can fix the issue by forcing a reset of the device (which means the daemon can close() and open() the device again) and doing extra necessary clean-up.

Signed-off-by: Dexuan Cui <[email protected]>
Reviewed-by: Michael Kelley <[email protected]>
Signed-off-by: Sasha Levin <[email protected]>
1 parent 3a6fb6c commit 9fc3c01

File tree

3 files changed

+91
-31
lines changed

3 files changed

+91
-31
lines changed

tools/hv/hv_fcopy_daemon.c

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ static int hv_start_fcopy(struct hv_start_fcopy *smsg)
8080

8181
error = 0;
8282
done:
83+
if (error)
84+
target_fname[0] = '\0';
8385
return error;
8486
}
8587

@@ -108,15 +110,29 @@ static int hv_copy_data(struct hv_do_fcopy *cpmsg)
108110
return ret;
109111
}
110112

113+
/*
114+
* Reset target_fname to "" in the two functions below for hibernation: if
115+
* the fcopy operation is aborted by hibernation, the daemon should remove the
116+
* partially-copied file; to achieve this, the hv_utils driver always fakes a
117+
* CANCEL_FCOPY message upon suspend, and later when the VM resumes back,
118+
* the daemon calls hv_copy_cancel() to remove the file; if a file is copied
119+
* successfully before suspend, hv_copy_finished() must reset target_fname to
120+
prevent the file from being incorrectly removed upon resume, since the faked
121+
* CANCEL_FCOPY message is spurious in this case.
122+
*/
111123
static int hv_copy_finished(void)
112124
{
113125
close(target_fd);
126+
target_fname[0] = '\0';
114127
return 0;
115128
}
116129
static int hv_copy_cancel(void)
117130
{
118131
close(target_fd);
119-
unlink(target_fname);
132+
if (strlen(target_fname) > 0) {
133+
unlink(target_fname);
134+
target_fname[0] = '\0';
135+
}
120136
return 0;
121137

122138
}
@@ -131,7 +147,7 @@ void print_usage(char *argv[])
131147

132148
int main(int argc, char *argv[])
133149
{
134-
int fcopy_fd;
150+
int fcopy_fd = -1;
135151
int error;
136152
int daemonize = 1, long_index = 0, opt;
137153
int version = FCOPY_CURRENT_VERSION;
@@ -141,7 +157,7 @@ int main(int argc, char *argv[])
141157
struct hv_do_fcopy copy;
142158
__u32 kernel_modver;
143159
} buffer = { };
144-
int in_handshake = 1;
160+
int in_handshake;
145161

146162
static struct option long_options[] = {
147163
{"help", no_argument, 0, 'h' },
@@ -170,6 +186,12 @@ int main(int argc, char *argv[])
170186
openlog("HV_FCOPY", 0, LOG_USER);
171187
syslog(LOG_INFO, "starting; pid is:%d", getpid());
172188

189+
reopen_fcopy_fd:
190+
if (fcopy_fd != -1)
191+
close(fcopy_fd);
192+
/* Remove any possible partially-copied file on error */
193+
hv_copy_cancel();
194+
in_handshake = 1;
173195
fcopy_fd = open("/dev/vmbus/hv_fcopy", O_RDWR);
174196

175197
if (fcopy_fd < 0) {
@@ -196,7 +218,7 @@ int main(int argc, char *argv[])
196218
len = pread(fcopy_fd, &buffer, sizeof(buffer), 0);
197219
if (len < 0) {
198220
syslog(LOG_ERR, "pread failed: %s", strerror(errno));
199-
exit(EXIT_FAILURE);
221+
goto reopen_fcopy_fd;
200222
}
201223

202224
if (in_handshake) {
@@ -231,9 +253,14 @@ int main(int argc, char *argv[])
231253

232254
}
233255

256+
/*
257+
* pwrite() may return an error due to the faked CANCEL_FCOPY
258+
* message upon hibernation. Ignore the error by resetting the
259+
* dev file, i.e. closing and re-opening it.
260+
*/
234261
if (pwrite(fcopy_fd, &error, sizeof(int), 0) != sizeof(int)) {
235262
syslog(LOG_ERR, "pwrite failed: %s", strerror(errno));
236-
exit(EXIT_FAILURE);
263+
goto reopen_fcopy_fd;
237264
}
238265
}
239266
}

tools/hv/hv_kvp_daemon.c

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ enum {
7676
DNS
7777
};
7878

79-
static int in_hand_shake = 1;
79+
static int in_hand_shake;
8080

8181
static char *os_name = "";
8282
static char *os_major = "";
@@ -1360,7 +1360,7 @@ void print_usage(char *argv[])
13601360

13611361
int main(int argc, char *argv[])
13621362
{
1363-
int kvp_fd, len;
1363+
int kvp_fd = -1, len;
13641364
int error;
13651365
struct pollfd pfd;
13661366
char *p;
@@ -1400,14 +1400,6 @@ int main(int argc, char *argv[])
14001400
openlog("KVP", 0, LOG_USER);
14011401
syslog(LOG_INFO, "KVP starting; pid is:%d", getpid());
14021402

1403-
kvp_fd = open("/dev/vmbus/hv_kvp", O_RDWR | O_CLOEXEC);
1404-
1405-
if (kvp_fd < 0) {
1406-
syslog(LOG_ERR, "open /dev/vmbus/hv_kvp failed; error: %d %s",
1407-
errno, strerror(errno));
1408-
exit(EXIT_FAILURE);
1409-
}
1410-
14111403
/*
14121404
* Retrieve OS release information.
14131405
*/
@@ -1423,6 +1415,18 @@ int main(int argc, char *argv[])
14231415
exit(EXIT_FAILURE);
14241416
}
14251417

1418+
reopen_kvp_fd:
1419+
if (kvp_fd != -1)
1420+
close(kvp_fd);
1421+
in_hand_shake = 1;
1422+
kvp_fd = open("/dev/vmbus/hv_kvp", O_RDWR | O_CLOEXEC);
1423+
1424+
if (kvp_fd < 0) {
1425+
syslog(LOG_ERR, "open /dev/vmbus/hv_kvp failed; error: %d %s",
1426+
errno, strerror(errno));
1427+
exit(EXIT_FAILURE);
1428+
}
1429+
14261430
/*
14271431
* Register ourselves with the kernel.
14281432
*/
@@ -1456,9 +1460,7 @@ int main(int argc, char *argv[])
14561460
if (len != sizeof(struct hv_kvp_msg)) {
14571461
syslog(LOG_ERR, "read failed; error:%d %s",
14581462
errno, strerror(errno));
1459-
1460-
close(kvp_fd);
1461-
return EXIT_FAILURE;
1463+
goto reopen_kvp_fd;
14621464
}
14631465

14641466
/*
@@ -1617,13 +1619,17 @@ int main(int argc, char *argv[])
16171619
break;
16181620
}
16191621

1620-
/* Send the value back to the kernel. */
1622+
/*
1623+
* Send the value back to the kernel. Note: the write() may
1624+
* return an error due to hibernation; we can ignore the error
1625+
* by resetting the dev file, i.e. closing and re-opening it.
1626+
*/
16211627
kvp_done:
16221628
len = write(kvp_fd, hv_msg, sizeof(struct hv_kvp_msg));
16231629
if (len != sizeof(struct hv_kvp_msg)) {
16241630
syslog(LOG_ERR, "write failed; error: %d %s", errno,
16251631
strerror(errno));
1626-
exit(EXIT_FAILURE);
1632+
goto reopen_kvp_fd;
16271633
}
16281634
}
16291635

tools/hv/hv_vss_daemon.c

Lines changed: 38 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
#include <stdbool.h>
2929
#include <dirent.h>
3030

31+
static bool fs_frozen;
32+
3133
/* Don't use syslog() in the function since that can cause write to disk */
3234
static int vss_do_freeze(char *dir, unsigned int cmd)
3335
{
@@ -155,18 +157,27 @@ static int vss_operate(int operation)
155157
continue;
156158
}
157159
error |= vss_do_freeze(ent->mnt_dir, cmd);
158-
if (error && operation == VSS_OP_FREEZE)
159-
goto err;
160+
if (operation == VSS_OP_FREEZE) {
161+
if (error)
162+
goto err;
163+
fs_frozen = true;
164+
}
160165
}
161166

162167
endmntent(mounts);
163168

164169
if (root_seen) {
165170
error |= vss_do_freeze("/", cmd);
166-
if (error && operation == VSS_OP_FREEZE)
167-
goto err;
171+
if (operation == VSS_OP_FREEZE) {
172+
if (error)
173+
goto err;
174+
fs_frozen = true;
175+
}
168176
}
169177

178+
if (operation == VSS_OP_THAW && !error)
179+
fs_frozen = false;
180+
170181
goto out;
171182
err:
172183
save_errno = errno;
@@ -175,6 +186,7 @@ static int vss_operate(int operation)
175186
endmntent(mounts);
176187
}
177188
vss_operate(VSS_OP_THAW);
189+
fs_frozen = false;
178190
/* Call syslog after we thaw all filesystems */
179191
if (ent)
180192
syslog(LOG_ERR, "FREEZE of %s failed; error:%d %s",
@@ -196,13 +208,13 @@ void print_usage(char *argv[])
196208

197209
int main(int argc, char *argv[])
198210
{
199-
int vss_fd, len;
211+
int vss_fd = -1, len;
200212
int error;
201213
struct pollfd pfd;
202214
int op;
203215
struct hv_vss_msg vss_msg[1];
204216
int daemonize = 1, long_index = 0, opt;
205-
int in_handshake = 1;
217+
int in_handshake;
206218
__u32 kernel_modver;
207219

208220
static struct option long_options[] = {
@@ -232,6 +244,18 @@ int main(int argc, char *argv[])
232244
openlog("Hyper-V VSS", 0, LOG_USER);
233245
syslog(LOG_INFO, "VSS starting; pid is:%d", getpid());
234246

247+
reopen_vss_fd:
248+
if (vss_fd != -1)
249+
close(vss_fd);
250+
if (fs_frozen) {
251+
if (vss_operate(VSS_OP_THAW) || fs_frozen) {
252+
syslog(LOG_ERR, "failed to thaw file system: err=%d",
253+
errno);
254+
exit(EXIT_FAILURE);
255+
}
256+
}
257+
258+
in_handshake = 1;
235259
vss_fd = open("/dev/vmbus/hv_vss", O_RDWR);
236260
if (vss_fd < 0) {
237261
syslog(LOG_ERR, "open /dev/vmbus/hv_vss failed; error: %d %s",
@@ -284,8 +308,7 @@ int main(int argc, char *argv[])
284308
if (len != sizeof(struct hv_vss_msg)) {
285309
syslog(LOG_ERR, "read failed; error:%d %s",
286310
errno, strerror(errno));
287-
close(vss_fd);
288-
return EXIT_FAILURE;
311+
goto reopen_vss_fd;
289312
}
290313

291314
op = vss_msg->vss_hdr.operation;
@@ -312,14 +335,18 @@ int main(int argc, char *argv[])
312335
default:
313336
syslog(LOG_ERR, "Illegal op:%d\n", op);
314337
}
338+
339+
/*
340+
* The write() may return an error due to the faked VSS_OP_THAW
341+
* message upon hibernation. Ignore the error by resetting the
342+
* dev file, i.e. closing and re-opening it.
343+
*/
315344
vss_msg->error = error;
316345
len = write(vss_fd, vss_msg, sizeof(struct hv_vss_msg));
317346
if (len != sizeof(struct hv_vss_msg)) {
318347
syslog(LOG_ERR, "write failed; error: %d %s", errno,
319348
strerror(errno));
320-
321-
if (op == VSS_OP_FREEZE)
322-
vss_operate(VSS_OP_THAW);
349+
goto reopen_vss_fd;
323350
}
324351
}
325352

0 commit comments

Comments
 (0)