|
1 | 1 | #include "ddisk_actor.h" |
2 | 2 |
|
3 | 3 | #include <ydb/core/util/stlog.h> |
4 | | - |
5 | 4 | #include <ydb/core/util/pb.h> |
6 | 5 |
|
| 6 | +#define XXH_INLINE_ALL |
| 7 | +#include <contrib/libs/xxhash/xxhash.h> |
| 8 | + |
7 | 9 | namespace NKikimr::NDDisk { |
8 | 10 |
|
9 | | - void TDDiskActor::Handle(TEvWritePersistentBuffer::TPtr ev) { |
10 | | - if (!CheckQuery(*ev, &Counters.Interface.WritePersistentBuffer)) { |
11 | | - return; |
| 11 | + void TDDiskActor::Handle(TEvPrivate::TEvHandlePersistentBufferEventForChunk::TPtr ev) { |
| 12 | + auto chunkIdx = ev->Get()->ChunkIndex; |
| 13 | + Y_ABORT_UNLESS(chunkIdx); |
| 14 | + ProcessPersistentBufferQueue(); |
| 15 | + } |
| 16 | + |
| 17 | + ui64 CalculateChecksum(const TRope::TIterator begin, size_t numBytes) { |
| 18 | + XXH3_state_t state; |
| 19 | + XXH3_64bits_reset(&state); |
| 20 | + |
| 21 | + for (auto it = begin; numBytes && it.Valid(); it.AdvanceToNextContiguousBlock()) { |
| 22 | + const size_t n = Min(numBytes, it.ContiguousSize()); |
| 23 | + XXH3_64bits_update(&state, it.ContiguousData(), n); |
| 24 | + numBytes -= n; |
12 | 25 | } |
13 | 26 |
|
14 | | - const auto& record = ev->Get()->Record; |
| 27 | + return XXH3_64bits_digest(&state); |
| 28 | + } |
| 29 | + |
| 30 | + std::vector<std::tuple<ui32, ui32, TRope>> TDDiskActor::SlicePersistentBuffer(ui64 tabletId, ui64 vchunkIndex, |
| 31 | + ui64 lsn, ui32 offsetInBytes, ui32 sizeInBytes, TRope&& payload, const std::vector<TPersistentBufferSectorInfo>& sectors) { |
| 32 | + auto headerData = TRcBuf::Uninitialized(SectorSize); |
| 33 | + TPersistentBufferHeader *header = (TPersistentBufferHeader*)headerData.GetDataMut(); |
| 34 | + memcpy(header->Signature, TPersistentBufferHeader::PersistentBufferHeaderSignature, 16); |
| 35 | + header->TabletId = tabletId; |
| 36 | + header->VChunkIndex = vchunkIndex; |
| 37 | + header->OffsetInBytes = offsetInBytes; |
| 38 | + header->Size = sizeInBytes; |
| 39 | + header->Lsn = lsn; |
| 40 | + |
| 41 | + for (ui32 i = 1; i < sectors.size(); ++i) { |
| 42 | + auto& loc = header->Locations[i - 1]; |
| 43 | + loc = sectors[i]; |
| 44 | + auto it = payload.Position(SectorSize * (i - 1)); |
| 45 | + if (memcmp((*it).first, header->Signature, 16) != 0) { |
| 46 | + loc.HasSignatureCorrection = true; |
| 47 | + *payload.Position(SectorSize * (i - 1)).ContiguousDataMut() = 0; |
| 48 | + } |
| 49 | + loc.Checksum = CalculateChecksum(payload.Position(SectorSize * (i - 1)), SectorSize); |
| 50 | + } |
| 51 | + header->HeaderChecksum = 0; |
| 52 | + std::vector<std::tuple<ui32, ui32, TRope>> parts; |
| 53 | + parts.reserve(sectors.size()); |
| 54 | + for (ui32 sectorIdx = 0, first = 0; sectorIdx <= sectors.size(); sectorIdx++) { |
| 55 | + if (sectorIdx == sectors.size() |
| 56 | + || sectors[first].ChunkIdx != sectors[sectorIdx].ChunkIdx |
| 57 | + || sectors[first].SectorIdx != sectors[sectorIdx].SectorIdx + sectorIdx - first) { |
| 58 | + TRope data; |
| 59 | + ui32 partSize = (sectorIdx - (first == 0 ? 1 : first)) * SectorSize; |
| 60 | + if (first == 0) { |
| 61 | + data = headerData; |
| 62 | + auto cs = CalculateChecksum(data.Position(TPersistentBufferHeader::HeaderChecksumOffset + TPersistentBufferHeader::HeaderChecksumSize) |
| 63 | + , SectorSize - TPersistentBufferHeader::HeaderChecksumOffset - TPersistentBufferHeader::HeaderChecksumSize); |
| 64 | + memcpy(data.Position(TPersistentBufferHeader::HeaderChecksumOffset).ContiguousDataMut(), &cs, TPersistentBufferHeader::HeaderChecksumSize); |
| 65 | + |
| 66 | + } |
| 67 | + payload.ExtractFront(partSize, &data); |
| 68 | + parts.emplace_back(sectors[first].ChunkIdx, sectors[first].SectorIdx * SectorSize, std::move(data)); |
| 69 | + first = sectorIdx; |
| 70 | + } |
| 71 | + } |
| 72 | + return parts; |
| 73 | + } |
| 74 | + |
| 75 | + void TDDiskActor::ProcessPersistentBufferQueue() { |
| 76 | + Y_ABORT_UNLESS(!PendingPersistentBufferEvents.empty()); |
| 77 | + auto& temp = PendingPersistentBufferEvents.front().Ev; |
| 78 | + const auto& record = temp->Get<TEvWritePersistentBuffer>()->Record; |
15 | 79 | const TQueryCredentials creds(record.GetCredentials()); |
16 | 80 | const TBlockSelector selector(record.GetSelector()); |
17 | 81 | const ui64 lsn = record.GetLsn(); |
| 82 | + ui32 sectorsCnt = selector.Size / SectorSize; |
| 83 | + const auto sectors = PersistentBufferSpaceAllocator.Occupy(sectorsCnt); |
| 84 | + if (sectors.size() == 0) { |
| 85 | + IssuePersistentBufferChunkAllocation(); |
| 86 | + return; |
| 87 | + } |
| 88 | + Y_ABORT_UNLESS(sectors.size() == sectorsCnt && sectorsCnt <= MaxSectorsPerBuffer); |
18 | 89 |
|
19 | | - Counters.Interface.WritePersistentBuffer.Request(selector.Size); |
20 | | - |
21 | | - auto span = std::move(NWilson::TSpan(TWilson::DDiskTopLevel, std::move(ev->TraceId), "DDisk.WritePersistentBuffer", |
| 90 | + const TWriteInstruction instr(record.GetInstruction()); |
| 91 | + TRope payload; |
| 92 | + if (instr.PayloadId) { |
| 93 | + payload = temp->Get<TEvWritePersistentBuffer>()->GetPayload(*instr.PayloadId); |
| 94 | + } |
| 95 | + auto span = std::move(NWilson::TSpan(TWilson::DDiskTopLevel, std::move(temp->TraceId), "DDisk.WritePersistentBuffer", |
22 | 96 | NWilson::EFlags::NONE, TActivationContext::ActorSystem()) |
23 | 97 | .Attribute("tablet_id", static_cast<long>(creds.TabletId)) |
24 | 98 | .Attribute("vchunk_index", static_cast<long>(selector.VChunkIndex)) |
25 | 99 | .Attribute("offset_in_bytes", selector.OffsetInBytes) |
26 | 100 | .Attribute("size", selector.Size) |
27 | 101 | .Attribute("lsn", static_cast<long>(lsn))); |
| 102 | + Counters.Interface.WritePersistentBuffer.Request(selector.Size); |
28 | 103 |
|
29 | | - const TWriteInstruction instr(record.GetInstruction()); |
30 | | - TRope data; |
31 | | - if (instr.PayloadId) { |
32 | | - data = ev->Get()->GetPayload(*instr.PayloadId); |
| 104 | + auto parts = SlicePersistentBuffer(creds.TabletId, |
| 105 | + selector.VChunkIndex, lsn, selector.OffsetInBytes, selector.Size, TRope(payload), sectors); |
| 106 | + |
| 107 | + auto& inflightRecord = PersistentBufferWriteInflight[{creds.TabletId, selector.VChunkIndex, lsn}]; |
| 108 | + inflightRecord = { |
| 109 | + .Sender = temp->Sender, |
| 110 | + .Cookie = temp->Cookie, |
| 111 | + .Session = temp->InterconnectSession, |
| 112 | + .OffsetInBytes = selector.OffsetInBytes, |
| 113 | + .Size = selector.Size, |
| 114 | + .Sectors = std::move(sectors), |
| 115 | + .Data = std::move(payload), |
| 116 | + .Span = std::move(span), |
| 117 | + }; |
| 118 | + |
| 119 | + for(auto& [chunkIdx, offset, data] : parts) { |
| 120 | + const ui64 cookie = NextCookie++; |
| 121 | + inflightRecord.WriteCookies.insert(cookie); |
| 122 | + |
| 123 | + Send(BaseInfo.PDiskActorID, new NPDisk::TEvChunkWriteRaw( |
| 124 | + PDiskParams->Owner, |
| 125 | + PDiskParams->OwnerRound, |
| 126 | + chunkIdx, |
| 127 | + offset, |
| 128 | + std::move(data)), 0, cookie); |
| 129 | + |
| 130 | + WriteCallbacks.try_emplace(cookie, [this, writeCookie = cookie, tabletId = creds.TabletId, |
| 131 | + vchunkIndex = selector.VChunkIndex, lsn = lsn](NPDisk::TEvChunkWriteRawResult& /*ev*/) { |
| 132 | + auto itInflight = PersistentBufferWriteInflight.find({tabletId, vchunkIndex, lsn}); |
| 133 | + Y_ABORT_UNLESS(itInflight != PersistentBufferWriteInflight.end()); |
| 134 | + auto& inflight = itInflight->second; |
| 135 | + auto eraseCnt = inflight.WriteCookies.erase(writeCookie); |
| 136 | + Y_ABORT_UNLESS(eraseCnt == 1); |
| 137 | + if (inflight.WriteCookies.empty()) { |
| 138 | + Counters.Interface.WritePersistentBuffer.Reply(true); |
| 139 | + inflight.Span.End(); |
| 140 | + auto& buffer = PersistentBuffers[{tabletId, vchunkIndex}]; |
| 141 | + auto [it, inserted] = buffer.Records.try_emplace(lsn); |
| 142 | + TPersistentBuffer::TRecord& pr = it->second; |
| 143 | + if (inserted) { |
| 144 | + pr = { |
| 145 | + .OffsetInBytes = inflight.OffsetInBytes, |
| 146 | + .Size = inflight.Size, |
| 147 | + .Sectors = std::move(inflight.Sectors), |
| 148 | + .Data = std::move(inflight.Data), |
| 149 | + }; |
| 150 | + } else { |
| 151 | + Y_ABORT_UNLESS(pr.OffsetInBytes == inflight.OffsetInBytes); |
| 152 | + Y_ABORT_UNLESS(pr.Size == inflight.Size); |
| 153 | + Y_ABORT_UNLESS(pr.Data == inflight.Data); |
| 154 | + } |
| 155 | + auto replyEv = std::make_unique<TEvWritePersistentBufferResult>(NKikimrBlobStorage::NDDisk::TReplyStatus::OK); |
| 156 | + auto h = std::make_unique<IEventHandle>(inflight.Sender, SelfId(), replyEv.release(), 0, inflight.Cookie); |
| 157 | + if (inflight.Session) { |
| 158 | + h->Rewrite(TEvInterconnect::EvForward, inflight.Session); |
| 159 | + } |
| 160 | + TActivationContext::Send(h.release()); |
| 161 | + PersistentBufferWriteInflight.erase(itInflight); |
| 162 | + } |
| 163 | + }); |
33 | 164 | } |
34 | 165 |
|
35 | | - auto& buffer = PersistentBuffers[{creds.TabletId, selector.VChunkIndex}]; |
36 | | - auto [it, inserted] = buffer.Records.try_emplace(lsn); |
37 | | - TPersistentBuffer::TRecord& pr = it->second; |
38 | | - if (inserted) { |
39 | | - pr = { |
40 | | - .OffsetInBytes = selector.OffsetInBytes, |
41 | | - .Size = selector.Size, |
42 | | - .Data = std::move(data), |
43 | | - }; |
44 | | - } else { |
45 | | - Y_ABORT_UNLESS(pr.OffsetInBytes == selector.OffsetInBytes); |
46 | | - Y_ABORT_UNLESS(pr.Size == selector.Size); |
47 | | - Y_ABORT_UNLESS(pr.Data == data); |
| 166 | + PendingPersistentBufferEvents.pop(); |
| 167 | + if (!PendingPersistentBufferEvents.empty()) { |
| 168 | + ProcessPersistentBufferQueue(); |
48 | 169 | } |
| 170 | + } |
49 | 171 |
|
50 | | - Counters.Interface.WritePersistentBuffer.Reply(true); |
51 | | - span.End(); |
52 | | - SendReply(*ev, std::make_unique<TEvWritePersistentBufferResult>(NKikimrBlobStorage::NDDisk::TReplyStatus::OK)); |
| 172 | + void TDDiskActor::Handle(TEvWritePersistentBuffer::TPtr ev) { |
| 173 | + if (!CheckQuery(*ev, &Counters.Interface.WritePersistentBuffer)) { |
| 174 | + return; |
| 175 | + } |
| 176 | + const auto& record = ev->Get()->Record; |
| 177 | + const TBlockSelector selector(record.GetSelector()); |
| 178 | + if (selector.Size > MaxSectorsPerBuffer * SectorSize) { |
| 179 | + Counters.Interface.WritePersistentBuffer.Request(selector.Size); |
| 180 | + Counters.Interface.WritePersistentBuffer.Reply(false); |
| 181 | + SendReply(*ev, std::make_unique<TEvWritePersistentBufferResult>( |
| 182 | + NKikimrBlobStorage::NDDisk::TReplyStatus::INCORRECT_REQUEST, |
| 183 | + TStringBuilder() << "persistent buffer write limit " |
| 184 | + << (MaxSectorsPerBuffer * SectorSize) << " bytes, received " << selector.Size << " bytes")); |
| 185 | + return; |
| 186 | + } |
| 187 | + PendingPersistentBufferEvents.emplace(ev, "WaitingPersistentBufferWrite"); |
| 188 | + ProcessPersistentBufferQueue(); |
53 | 189 | } |
54 | 190 |
|
55 | 191 | void TDDiskActor::Handle(TEvReadPersistentBuffer::TPtr ev) { |
@@ -200,6 +336,8 @@ namespace NKikimr::NDDisk { |
200 | 336 | Y_ABORT_UNLESS(pr.OffsetInBytes == selector.OffsetInBytes); |
201 | 337 | Y_ABORT_UNLESS(pr.Size == selector.Size); |
202 | 338 |
|
| 339 | + PersistentBufferSpaceAllocator.Free(pr.Sectors); |
| 340 | + |
203 | 341 | buffer.Records.erase(jt); |
204 | 342 | if (buffer.Records.empty()) { |
205 | 343 | PersistentBuffers.erase(it); |
|
0 commit comments