Skip to content

Commit a8c816c

Browse files
Fix possible corruption in RDMA partition during migration; Add loadtests (#4580)
#2988: Fixes a problem similar to the one in #3241, but this time affecting only the RDMA path. Also enables the data integrity feature in the local-mirror* loadtests.
1 parent a6599a2 commit a8c816c

17 files changed

+131
-31
lines changed

cloud/blockstore/libs/rdma/fake/client.cpp

Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -305,46 +305,46 @@ class TExecuteRequestActor final
305305
private:
306306
NProto::TError ExecuteRequest(const TActorContext& ctx)
307307
{
308-
auto [proto, error] =
308+
auto [res, error] =
309309
TBlockStoreProtocol::Serializer()->Parse(Request->RequestBuffer);
310310
if (HasError(error)) {
311311
return error;
312312
}
313313

314-
switch (proto.MsgId) {
314+
switch (res.MsgId) {
315315
case TBlockStoreProtocol::ReadDeviceBlocksRequest:
316-
Y_ABORT_IF(proto.Data);
316+
Y_ABORT_IF(res.Data);
317317
return SendReadBlocksRequest(
318318
ctx,
319319
std::move(static_cast<NProto::TReadDeviceBlocksRequest&>(
320-
*proto.Proto)));
320+
*res.Proto)));
321321

322322
case TBlockStoreProtocol::WriteDeviceBlocksRequest:
323323
return SendWriteBlocksRequest(
324324
ctx,
325325
std::move(static_cast<NProto::TWriteDeviceBlocksRequest&>(
326-
*proto.Proto)),
327-
proto.Data);
326+
*res.Proto)),
327+
res.Data);
328328

329329
case TBlockStoreProtocol::ZeroDeviceBlocksRequest:
330-
Y_ABORT_IF(proto.Data);
330+
Y_ABORT_IF(res.Data);
331331
return SendZeroBlocksRequest(
332332
ctx,
333333
std::move(static_cast<NProto::TZeroDeviceBlocksRequest&>(
334-
*proto.Proto)));
334+
*res.Proto)));
335335

336336
case TBlockStoreProtocol::ChecksumDeviceBlocksRequest:
337-
Y_ABORT_IF(proto.Data);
337+
Y_ABORT_IF(res.Data);
338338
return SendChecksumBlocksRequest(
339339
ctx,
340340
std::move(
341341
static_cast<NProto::TChecksumDeviceBlocksRequest&>(
342-
*proto.Proto)));
342+
*res.Proto)));
343343

344344
default:
345345
return MakeError(
346346
E_NOT_IMPLEMENTED,
347-
TStringBuilder() << "MsgId: " << proto.MsgId);
347+
TStringBuilder() << "MsgId: " << res.MsgId);
348348
}
349349
}
350350

cloud/blockstore/libs/storage/partition_nonrepl/part_nonrepl_rdma_actor_writeblocks.cpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -405,7 +405,8 @@ void TNonreplicatedPartitionRdmaActor::HandleWriteBlocksLocal(
405405
}
406406
}
407407

408-
ui64 blocks = 0;
408+
ui64 blocks =
409+
deviceRequests[0].BlockRange.Start - msg->Record.GetStartIndex();
409410
for (const auto& deviceRequest: deviceRequests) {
410411
auto ep = AgentId2Endpoint[deviceRequest.Device.GetAgentId()];
411412
Y_ABORT_UNLESS(ep);

cloud/blockstore/tests/loadtest/local-data-integrity/test.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,8 @@
88
from cloud.blockstore.config.server_pb2 import TServerConfig, TServerAppConfig, \
99
TKikimrServiceConfig, TChecksumFlags
1010
from cloud.blockstore.config.storage_pb2 import TStorageServiceConfig
11-
from cloud.blockstore.public.sdk.python.protos import STORAGE_MEDIA_SSD, STORAGE_MEDIA_SSD_MIRROR2
11+
from cloud.blockstore.public.sdk.python.protos import STORAGE_MEDIA_SSD, \
12+
STORAGE_MEDIA_SSD_MIRROR2, DIVP_ENABLED_FORCED
1213

1314
from cloud.blockstore.tests.python.lib.disk_agent_runner import LocalDiskAgent
1415
from cloud.blockstore.tests.python.lib.nbs_runner import LocalNbs
@@ -364,7 +365,7 @@ def __run_test(test_case):
364365
devices_per_agent,
365366
disk_agent_config_patch=TDiskAgentConfig(
366367
DedicatedDiskAgent=True,
367-
EnableDataIntegrityValidationForDrBasedDisks=True,
368+
DataIntegrityValidationPolicyForDrBasedDisks=DIVP_ENABLED_FORCED,
368369
ChaosConfig=chaos_config),
369370
agent_count=test_case.agent_count,
370371
)

cloud/blockstore/tests/loadtest/local-mirror-lagging/local-mirror2-basic.txt

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,11 @@ Vertices {
55
BlockSize: 4096
66
StorageMediaKind: STORAGE_MEDIA_SSD_MIRROR2
77
}
8+
StartEndpointRequest {
9+
IpcType: IPC_GRPC
10+
VolumeAccessMode: VOLUME_ACCESS_READ_WRITE
11+
VolumeMountMode: VOLUME_MOUNT_LOCAL
12+
}
813
ArtificialLoadSpec {
914
Ranges {
1015
Start: 261144

cloud/blockstore/tests/loadtest/local-mirror-lagging/local-mirror2-fresh-device-migration.txt

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,11 @@ Vertices {
3636
Vertices {
3737
Test {
3838
VolumeName: "vol0"
39-
39+
StartEndpointRequest {
40+
IpcType: IPC_GRPC
41+
VolumeAccessMode: VOLUME_ACCESS_READ_WRITE
42+
VolumeMountMode: VOLUME_MOUNT_LOCAL
43+
}
4044
ArtificialLoadSpec {
4145
Ranges {
4246
Start: 261144

cloud/blockstore/tests/loadtest/local-mirror-lagging/local-mirror2-migration.txt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,11 @@ Vertices {
3535
Vertices {
3636
Test {
3737
VolumeName: "vol0"
38-
38+
StartEndpointRequest {
39+
IpcType: IPC_GRPC
40+
VolumeAccessMode: VOLUME_ACCESS_READ_WRITE
41+
VolumeMountMode: VOLUME_MOUNT_LOCAL
42+
}
3943
ArtificialLoadSpec {
4044
Ranges {
4145
Start: 261144

cloud/blockstore/tests/loadtest/local-mirror-lagging/local-mirror2-small-restart-interval.txt

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,11 @@ Vertices {
55
BlockSize: 4096
66
StorageMediaKind: STORAGE_MEDIA_SSD_MIRROR2
77
}
8+
StartEndpointRequest {
9+
IpcType: IPC_GRPC
10+
VolumeAccessMode: VOLUME_ACCESS_READ_WRITE
11+
VolumeMountMode: VOLUME_MOUNT_LOCAL
12+
}
813
ArtificialLoadSpec {
914
Ranges {
1015
Start: 200000

cloud/blockstore/tests/loadtest/local-mirror-lagging/local-mirror3-basic.txt

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,11 @@ Vertices {
55
BlockSize: 4096
66
StorageMediaKind: STORAGE_MEDIA_SSD_MIRROR3
77
}
8+
StartEndpointRequest {
9+
IpcType: IPC_GRPC
10+
VolumeAccessMode: VOLUME_ACCESS_READ_WRITE
11+
VolumeMountMode: VOLUME_MOUNT_LOCAL
12+
}
813
ArtificialLoadSpec {
914
Ranges {
1015
Start: 261144

cloud/blockstore/tests/loadtest/local-mirror-lagging/test.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,9 @@
66
from cloud.blockstore.config.client_pb2 import TClientAppConfig, TClientConfig
77
from cloud.blockstore.config.disk_pb2 import DEVICE_ERASE_METHOD_NONE, TDiskAgentConfig
88
from cloud.blockstore.config.server_pb2 import TServerConfig, TServerAppConfig, \
9-
TKikimrServiceConfig
9+
TKikimrServiceConfig, TChecksumFlags
1010
from cloud.blockstore.config.storage_pb2 import TStorageServiceConfig
11+
from cloud.blockstore.public.sdk.python.protos import DIVP_ENABLED_FORCED
1112

1213
from cloud.blockstore.tests.python.lib.disk_agent_runner import LocalDiskAgent
1314
from cloud.blockstore.tests.python.lib.nbs_runner import LocalNbs
@@ -248,6 +249,7 @@ def __run_test(test_case, use_rdma):
248249
devices_per_agent,
249250
disk_agent_config_patch=TDiskAgentConfig(
250251
DedicatedDiskAgent=True,
252+
DataIntegrityValidationPolicyForDrBasedDisks=DIVP_ENABLED_FORCED,
251253
# in tests, only one disk is created and it lives until the end,
252254
# so we can set DEVICE_ERASE_METHOD_NONE to speed up testing
253255
DeviceEraseMethod=DEVICE_ERASE_METHOD_NONE),
@@ -266,6 +268,8 @@ def __run_test(test_case, use_rdma):
266268
server_app_config.ServerConfig.RdmaClientEnabled = use_rdma
267269
server_app_config.ServerConfig.EndpointStorageType = EEndpointStorageType.ENDPOINT_STORAGE_FILE
268270
server_app_config.ServerConfig.EndpointStorageDir = endpoint_storage_dir
271+
server_app_config.ServerConfig.ChecksumFlags.CopyFrom(TChecksumFlags())
272+
server_app_config.ServerConfig.ChecksumFlags.EnableDataIntegrityClient = True
269273
server_app_config.KikimrServiceConfig.CopyFrom(TKikimrServiceConfig())
270274

271275
storage = TStorageServiceConfig()

cloud/blockstore/tests/loadtest/local-mirror/local-mirror2-agent-removal.txt

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,11 @@ Vertices {
1515
Test {
1616
Name: "shoot1"
1717
VolumeName: "vol0"
18+
StartEndpointRequest {
19+
IpcType: IPC_GRPC
20+
VolumeAccessMode: VOLUME_ACCESS_READ_WRITE
21+
VolumeMountMode: VOLUME_MOUNT_LOCAL
22+
}
1823
ArtificialLoadSpec {
1924
Ranges {
2025
Start: 0
@@ -36,6 +41,11 @@ Vertices {
3641
Test {
3742
Name: "shoot2"
3843
VolumeName: "vol0"
44+
StartEndpointRequest {
45+
IpcType: IPC_GRPC
46+
VolumeAccessMode: VOLUME_ACCESS_READ_WRITE
47+
VolumeMountMode: VOLUME_MOUNT_LOCAL
48+
}
3949
ArtificialLoadSpec {
4050
Ranges {
4151
Start: 0
@@ -107,6 +117,11 @@ Vertices {
107117
Test {
108118
Name: "shoot3"
109119
VolumeName: "vol0"
120+
StartEndpointRequest {
121+
IpcType: IPC_GRPC
122+
VolumeAccessMode: VOLUME_ACCESS_READ_WRITE
123+
VolumeMountMode: VOLUME_MOUNT_LOCAL
124+
}
110125
ArtificialLoadSpec {
111126
Ranges {
112127
Start: 0

0 commit comments

Comments
 (0)