diff --git a/CMakeLists.txt b/CMakeLists.txt index f3bb015d5c..8d98f803be 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -200,6 +200,7 @@ ExternalProject_Add(gflags -DGFLAGS_NAMESPACE=gflags -DBUILD_STATIC_LIBS=ON -DBUILD_SHARED_LIBS=OFF + -DREGISTER_INSTALL_PREFIX=OFF BUILD_COMMAND make -j${CPU_CORE} ) @@ -273,10 +274,10 @@ ExternalProject_Add(glog -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_INSTALL_PREFIX=${STAGED_INSTALL_PREFIX} -DCMAKE_BUILD_TYPE=${LIB_BUILD_TYPE} - -DWITH_GFLAGS=ON + -DWITH_GFLAGS=OFF -DBUILD_TESTING=OFF -DBUILD_SHARED_LIBS=OFF - -DWITH_UNWIND=${LIBUNWIND_ON} + -DWITH_UNWIND=OFF -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH} BUILD_COMMAND make -j${CPU_CORE} @@ -457,9 +458,9 @@ set(LZ4_INCLUDE_DIR ${INSTALL_INCLUDEDIR}) ExternalProject_Add(zlib DEPENDS URL - https://github.com/madler/zlib/releases/download/v1.2.13/zlib-1.2.13.tar.gz + https://github.com/madler/zlib/releases/download/v1.3.1/zlib-1.3.1.tar.gz URL_HASH - MD5=9b8aa094c4e5765dabf4da391f00d15c + MD5=9855b6d802d7fe5b7bd5b196a2271655 DOWNLOAD_NO_PROGRESS 1 UPDATE_COMMAND @@ -571,6 +572,8 @@ ExternalProject_Add(protobuf 1 LOG_INSTALL 1 + LOG_OUTPUT_ON_FAILURE + ON SOURCE_SUBDIR cmake CMAKE_ARGS @@ -595,9 +598,159 @@ else() endif() set(PROTOBUF_INCLUDE_DIR ${INSTALL_INCLUDEDIR}) -set(PROTOBUF_LIBRARY ${INSTALL_LIBDIR}/${LIB_PROTOBUF}) +if(${OS_VERSION} MATCHES "Rocky" OR ${OS_VERSION} MATCHES "CentOS") + set(PROTOBUF_LIBRARY ${INSTALL_LIBDIR_64}/${LIB_PROTOBUF}) +else() + set(PROTOBUF_LIBRARY ${INSTALL_LIBDIR}/${LIB_PROTOBUF}) +endif() set(PROTOBUF_PROTOC ${STAGED_INSTALL_PREFIX}/bin/protoc) +ExternalProject_Add(leveldb + DEPENDS + snappy + URL + https://github.com/google/leveldb/archive/refs/tags/1.23.tar.gz + URL_HASH + MD5=afbde776fb8760312009963f09a586c7 + DOWNLOAD_NO_PROGRESS + 1 + UPDATE_COMMAND + "" + LOG_CONFIGURE + 1 + LOG_BUILD + 1 + LOG_INSTALL + 1 + CMAKE_ARGS + -DCMAKE_POLICY_VERSION_MINIMUM=3.5 + -DCMAKE_INSTALL_PREFIX=${STAGED_INSTALL_PREFIX} + 
-DCMAKE_BUILD_TYPE=${LIB_BUILD_TYPE} + -DLEVELDB_BUILD_TESTS=OFF + -DLEVELDB_BUILD_BENCHMARKS=OFF + -DBUILD_SHARED_LIBS=OFF + -DHAVE_SNAPPY=ON + -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH} + -DCMAKE_CXX_FLAGS=-I${INSTALL_INCLUDEDIR} + -DCMAKE_EXE_LINKER_FLAGS=-L${INSTALL_LIBDIR}\ -L${INSTALL_LIBDIR_64} + -DCMAKE_SHARED_LINKER_FLAGS=-L${INSTALL_LIBDIR}\ -L${INSTALL_LIBDIR_64} + -DCMAKE_MODULE_LINKER_FLAGS=-L${INSTALL_LIBDIR}\ -L${INSTALL_LIBDIR_64} + BUILD_ALWAYS + 1 + BUILD_COMMAND + make -j${CPU_CORE} +) + +if(${OS_VERSION} MATCHES "Rocky" OR ${OS_VERSION} MATCHES "CentOS") + set(LEVELDB_LIBRARY ${INSTALL_LIBDIR_64}/libleveldb.a) +else() + set(LEVELDB_LIBRARY ${INSTALL_LIBDIR}/libleveldb.a) +endif() +set(LEVELDB_INCLUDE_DIR ${INSTALL_INCLUDEDIR}) + +ExternalProject_Add(brpc + DEPENDS + gflags + protobuf + leveldb + glog + snappy + zlib + URL + https://github.com/apache/brpc/archive/refs/tags/1.6.0.tar.gz + URL_HASH + MD5=0d37cea25bd006e89806f461ef7e39ba + DOWNLOAD_NO_PROGRESS + 1 + UPDATE_COMMAND + "" + LOG_CONFIGURE + 1 + LOG_BUILD + 1 + LOG_INSTALL + 1 + LOG_OUTPUT_ON_FAILURE + ON + CMAKE_ARGS + -DCMAKE_POLICY_VERSION_MINIMUM=3.5 + -DCMAKE_INSTALL_PREFIX=${STAGED_INSTALL_PREFIX} + -DCMAKE_BUILD_TYPE=${LIB_BUILD_TYPE} + -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH} + -DWITH_GLOG=ON + -DWITH_SNAPPY=ON + -DBUILD_SHARED_LIBS=OFF + -DDOWNLOAD_GTEST=OFF + BUILD_ALWAYS + 1 + BUILD_COMMAND + make -j${CPU_CORE} +) + +if(${OS_VERSION} MATCHES "Rocky" OR ${OS_VERSION} MATCHES "CentOS") + set(BRPC_LIBRARY ${INSTALL_LIBDIR_64}/libbrpc.a) +else() + set(BRPC_LIBRARY ${INSTALL_LIBDIR}/libbrpc.a) +endif() +set(BRPC_INCLUDE_DIR ${INSTALL_INCLUDEDIR}) + +ExternalProject_Add(braft + DEPENDS + gflags + protobuf + leveldb + brpc + glog + snappy + zlib + URL + https://github.com/baidu/braft/archive/refs/tags/v1.1.2.tar.gz + URL_HASH + MD5=f1d0307cf45449bbec9b64ca81b5f808 + DOWNLOAD_NO_PROGRESS + 1 + UPDATE_COMMAND + "" + LOG_CONFIGURE + 1 + LOG_BUILD + 1 + LOG_INSTALL + 1 + 
LOG_OUTPUT_ON_FAILURE + ON + CMAKE_ARGS + -DCMAKE_POLICY_VERSION_MINIMUM=3.5 + -DCMAKE_INSTALL_PREFIX=${STAGED_INSTALL_PREFIX} + -DCMAKE_BUILD_TYPE=${LIB_BUILD_TYPE} + -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH} + -DWITH_GLOG=ON + -DBUILD_SHARED_LIBS=OFF + -DBUILD_UNIT_TESTS=OFF + -DDOWNLOAD_GTEST=OFF + BUILD_ALWAYS + 1 + BUILD_COMMAND + make -j${CPU_CORE} braft-static + INSTALL_COMMAND + ${CMAKE_COMMAND} -E copy_directory /output/include/braft ${INSTALL_INCLUDEDIR}/braft + COMMAND ${CMAKE_COMMAND} -E copy /output/lib/libbraft.a ${INSTALL_LIBDIR}/libbraft.a +) + +# For CentOS/Rocky, also copy to lib64 +if(${OS_VERSION} MATCHES "Rocky" OR ${OS_VERSION} MATCHES "CentOS") + ExternalProject_Add_Step(braft copy_to_lib64 + COMMAND ${CMAKE_COMMAND} -E make_directory ${INSTALL_LIBDIR_64} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${INSTALL_LIBDIR}/libbraft.a ${INSTALL_LIBDIR_64}/libbraft.a + DEPENDEES install + COMMENT "Copying braft to lib64 for CentOS/Rocky" + ) + set(BRAFT_LIBRARY ${INSTALL_LIBDIR_64}/libbraft.a) +else() + set(BRAFT_LIBRARY ${INSTALL_LIBDIR}/libbraft.a) +endif() +set(BRAFT_INCLUDE_DIR ${INSTALL_INCLUDEDIR}) + ExternalProject_Add(rocksdb DEPENDS gflags @@ -750,8 +903,21 @@ endif() set(ROCKSDB_INCLUDE_DIR ${INSTALL_INCLUDEDIR}) set(ROCKSDB_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/${EP_BASE_SUFFIX}/Source/rocksdb) +# Generate protobuf files before compiling praft (which depends on them) +set(PROTO_FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/pika_inner_message.proto ${CMAKE_CURRENT_SOURCE_DIR}/src/rsync_service.proto) +custom_protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS ${PROTO_FILES}) +message("pika PROTO_SRCS = ${PROTO_SRCS}") +message("pika PROTO_HDRS = ${PROTO_HDRS}") + +# Create a custom target for generated proto files +add_custom_target(pika_proto_gen + DEPENDS ${PROTO_SRCS} ${PROTO_HDRS} protobuf + COMMENT "Generating Pika protobuf files" +) + add_subdirectory(src/pstd) add_subdirectory(src/net) +add_subdirectory(src/praft) # praft 必须在 storage 之前,因为 
storage 需要 binlog.pb.h add_subdirectory(src/storage) add_subdirectory(src/cache) if (USE_PIKA_TOOLS) @@ -794,10 +960,7 @@ set(PIKA_BUILD_VERSION_CC ${CMAKE_BINARY_DIR}/pika_build_version.cc message("PIKA_BUILD_VERSION_CC : " ${PIKA_BUILD_VERSION_CC}) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/src/build_version.cc.in ${PIKA_BUILD_VERSION_CC} @ONLY) -set(PROTO_FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/pika_inner_message.proto ${CMAKE_CURRENT_SOURCE_DIR}/src/rsync_service.proto) -custom_protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS ${PROTO_FILES}) -message("pika PROTO_SRCS = ${PROTO_SRCS}") -message("pika PROTO_HDRS = ${PROTO_HDRS}") +# PROTO_SRCS and PROTO_HDRS are already generated above add_executable(${PROJECT_NAME} ${DIR_SRCS} @@ -821,6 +984,9 @@ add_dependencies(${PROJECT_NAME} zlib ${LIBGPERF_NAME} ${LIBJEMALLOC_NAME} + leveldb + brpc + braft rocksdb protobuf pstd @@ -839,10 +1005,14 @@ target_include_directories(${PROJECT_NAME} target_link_libraries(${PROJECT_NAME} cache storage + praft net pstd ${GLOG_LIBRARY} librocksdb.a + ${BRAFT_LIBRARY} + ${BRPC_LIBRARY} + ${LEVELDB_LIBRARY} ${LIB_PROTOBUF} ${LIB_GFLAGS} ${LIB_FMT} @@ -854,6 +1024,49 @@ target_link_libraries(${PROJECT_NAME} ${LIBUNWIND_LIBRARY} ${JEMALLOC_LIBRARY}) +# Add platform-specific libraries for brpc and braft +if(CMAKE_SYSTEM_NAME MATCHES "Darwin") + # macOS frameworks + target_link_libraries(${PROJECT_NAME} + "-framework CoreFoundation" + "-framework CoreGraphics" + "-framework CoreData" + "-framework CoreText" + "-framework Security" + "-framework Foundation" + "-framework ApplicationServices" + "-framework SystemConfiguration" + "-framework AppKit" + "-Wl,-undefined,dynamic_lookup" # Allow undefined symbols (gperftools optional) + ) + find_library(OPENSSL_CRYPTO_LIBRARY NAMES crypto libcrypto) + find_library(OPENSSL_SSL_LIBRARY NAMES ssl libssl) + if(OPENSSL_CRYPTO_LIBRARY) + target_link_libraries(${PROJECT_NAME} ${OPENSSL_CRYPTO_LIBRARY}) + endif() + if(OPENSSL_SSL_LIBRARY) + 
target_link_libraries(${PROJECT_NAME} ${OPENSSL_SSL_LIBRARY}) + endif() +elseif(CMAKE_SYSTEM_NAME MATCHES "Linux") + # Linux system libraries for brpc and braft + target_link_libraries(${PROJECT_NAME} + rt # Real-time extensions + dl # Dynamic linking + ) + # Find and link OpenSSL for Linux + find_package(OpenSSL) + if(OPENSSL_FOUND) + target_link_libraries(${PROJECT_NAME} OpenSSL::SSL OpenSSL::Crypto) + else() + # Fallback: try to find libraries directly + find_library(OPENSSL_CRYPTO_LIBRARY NAMES crypto libcrypto) + find_library(OPENSSL_SSL_LIBRARY NAMES ssl libssl) + if(OPENSSL_CRYPTO_LIBRARY AND OPENSSL_SSL_LIBRARY) + target_link_libraries(${PROJECT_NAME} ${OPENSSL_SSL_LIBRARY} ${OPENSSL_CRYPTO_LIBRARY}) + endif() + endif() +endif() + option(USE_SSL "Enable SSL support" OFF) add_custom_target( clang-tidy diff --git a/conf/pika.conf b/conf/pika.conf index 5317fcf452..338f657402 100644 --- a/conf/pika.conf +++ b/conf/pika.conf @@ -632,3 +632,29 @@ cache-lfu-decay-time: 1 # which serves for the scenario of codis-pika cluster reelection # You'd better [DO NOT MODIFY IT UNLESS YOU KNOW WHAT YOU ARE DOING] internal-used-unfinished-full-sync : + +################### +## Raft Configuration +################### + +# Enable Raft consensus protocol for distributed consensus +# When enabled, Pika will use Raft to ensure data consistency across nodes +# Default value is no +raft-enabled : no + +# Raft group identifier +# This is used to identify the Raft cluster +# All nodes in the same Raft cluster should have the same group-id +raft-group-id : pika_raft_group + +# Raft election timeout in milliseconds +# This is the time to wait before starting a new election if no heartbeat is received +# A larger value reduces the chance of unnecessary elections but increases failover time +# Default value is 1000ms (1 second) +raft-election-timeout-ms : 1000 + +# Raft snapshot interval in seconds +# This determines how often Raft takes snapshots of the state machine +# Snapshots are 
used to compact the log and speed up node recovery +# Default value is 3600 seconds (1 hour) +raft-snapshot-interval-s : 3600 diff --git a/include/pika_bit.h b/include/pika_bit.h index 94e7767b16..189c9530d9 100644 --- a/include/pika_bit.h +++ b/include/pika_bit.h @@ -63,6 +63,7 @@ class BitSetCmd : public Cmd { std::string key_; int64_t bit_offset_; int64_t on_; + int32_t bit_val_ = 0; // For async mode rocksdb::Status s_; void Clear() override { key_ = ""; @@ -169,6 +170,7 @@ class BitOpCmd : public Cmd { rocksdb::Status s_; std::vector src_keys_; storage::BitOpType op_; + int64_t result_length_ = 0; // For async mode void Clear() override { dest_key_ = ""; src_keys_.clear(); diff --git a/include/pika_command.h b/include/pika_command.h index 669f58a7a9..059fbdfafb 100644 --- a/include/pika_command.h +++ b/include/pika_command.h @@ -67,6 +67,10 @@ const std::string kCmdNameLastSave = "lastsave"; const std::string kCmdNameCache = "cache"; const std::string kCmdNameClearCache = "clearcache"; +// Raft commands +const std::string kCmdNameRaftCluster = "raft.cluster"; +const std::string kCmdNameRaftNode = "raft.node"; + // Migrate slot const std::string kCmdNameSlotsMgrtSlot = "slotsmgrtslot"; const std::string kCmdNameSlotsMgrtTagSlot = "slotsmgrttagslot"; @@ -617,6 +621,11 @@ class Cmd : public std::enable_shared_from_this { uint32_t aclCategory_ = 0; bool cache_missed_in_rtc_{false}; + // Raft async mode helper functions + bool IsRaftLeader() const; + bool IsRaftEnabled() const; + bool ShouldUseAsyncMode() const; + private: virtual void DoInitial() = 0; virtual void Clear(){}; diff --git a/include/pika_conf.h b/include/pika_conf.h index 80d5abe8f0..77828a675b 100644 --- a/include/pika_conf.h +++ b/include/pika_conf.h @@ -883,6 +883,25 @@ class PikaConf : public pstd::BaseConf { int cache_maxmemory_policy() { return cache_maxmemory_policy_; } int cache_maxmemory_samples() { return cache_maxmemory_samples_; } int cache_lfu_decay_time() { return cache_lfu_decay_time_; 
} + + // Raft configuration getters + bool raft_enabled() { + std::shared_lock l(rwlock_); + return raft_enabled_; + } + std::string raft_group_id() { + std::shared_lock l(rwlock_); + return raft_group_id_; + } + int raft_election_timeout_ms() { + std::shared_lock l(rwlock_); + return raft_election_timeout_ms_; + } + int raft_snapshot_interval_s() { + std::shared_lock l(rwlock_); + return raft_snapshot_interval_s_; + } + int Load(); int ConfigRewrite(); int ConfigRewriteSlaveOf(); @@ -1065,6 +1084,12 @@ class PikaConf : public pstd::BaseConf { //Internal used metrics Persisted by pika.conf std::unordered_set internal_used_unfinished_full_sync_; + + // Raft configuration + bool raft_enabled_ = false; + std::string raft_group_id_; + int raft_election_timeout_ms_ = 1000; + int raft_snapshot_interval_s_ = 3600; }; #endif diff --git a/include/pika_db.h b/include/pika_db.h index bcaf3f8b16..890786a69b 100644 --- a/include/pika_db.h +++ b/include/pika_db.h @@ -94,6 +94,8 @@ class DB : public std::enable_shared_from_this, public pstd::noncopyable { std::shared_ptr storage() const; void GetBgSaveMetaData(std::vector* fileNames, std::string* snapshot_uuid); void BgSaveDB(); + pstd::Status CreateCheckpoint(const std::string& checkpoint_dir); + pstd::Status LoadDBFromCheckpoint(const std::string& checkpoint_dir); void SetBinlogIoError(); void SetBinlogIoErrorrelieve(); bool IsBinlogIoError(); diff --git a/include/pika_geo.h b/include/pika_geo.h index 70b287da03..a6db94fcc6 100644 --- a/include/pika_geo.h +++ b/include/pika_geo.h @@ -67,6 +67,8 @@ class GeoAddCmd : public Cmd { private: std::string key_; std::vector pos_; + rocksdb::Status s_; + int32_t count_ = 0; void DoInitial() override; }; diff --git a/include/pika_hash.h b/include/pika_hash.h index 7e1c29e241..d723c781c4 100644 --- a/include/pika_hash.h +++ b/include/pika_hash.h @@ -108,6 +108,7 @@ class HSetCmd : public Cmd { std::string key_, field_, value_; std::vector fields_; std::vector fields_values_; + int32_t 
ret_ = 0; // For async mode: 1 if field is new, 0 if updated void DoInitial() override; rocksdb::Status s_; }; @@ -154,6 +155,7 @@ class HIncrbyCmd : public Cmd { private: std::string key_, field_; int64_t by_ = 0; + int64_t new_value_ = 0; // For async mode: result after increment void DoInitial() override; rocksdb::Status s_; }; @@ -176,6 +178,7 @@ class HIncrbyfloatCmd : public Cmd { private: std::string key_, field_, by_; + std::string new_value_; // For async mode: result after increment void DoInitial() override; rocksdb::Status s_; }; @@ -291,6 +294,7 @@ class HSetnxCmd : public Cmd { private: std::string key_, field_, value_; + int32_t ret_ = 0; // For async mode: 1 if field was set, 0 if already exists void DoInitial() override; rocksdb::Status s_; }; diff --git a/include/pika_hyperloglog.h b/include/pika_hyperloglog.h index 77c374642f..ca291cde22 100644 --- a/include/pika_hyperloglog.h +++ b/include/pika_hyperloglog.h @@ -27,6 +27,7 @@ class PfAddCmd : public Cmd { private: std::string key_; std::vector values_; + bool update_ = false; void DoInitial() override; void Clear() override { values_.clear(); } }; diff --git a/include/pika_kv.h b/include/pika_kv.h index 82939d29d9..45987e15ec 100644 --- a/include/pika_kv.h +++ b/include/pika_kv.h @@ -92,6 +92,7 @@ class DelCmd : public Cmd { private: std::vector keys_; int64_t split_res_ = 0; + int64_t deleted_count_ = 0; // For async mode: number of deleted keys void DoInitial() override; rocksdb::Status s_; }; @@ -236,6 +237,7 @@ class GetsetCmd : public Cmd { private: std::string key_; std::string new_value_; + std::string old_value_; // For async mode void DoInitial() override; rocksdb::Status s_; }; @@ -260,6 +262,7 @@ class AppendCmd : public Cmd { std::string key_; std::string value_; std::string new_value_; + int32_t new_len_ = 0; // For async mode void DoInitial() override; rocksdb::Status s_; int32_t expired_timestamp_sec_ = 0; @@ -519,6 +522,7 @@ class SetrangeCmd : public Cmd { std::string key_; 
int64_t offset_ = 0; std::string value_; + int32_t new_len_ = 0; // For async mode void DoInitial() override; rocksdb::Status s_; }; diff --git a/include/pika_list.h b/include/pika_list.h index 49031b074e..6995d8cb83 100644 --- a/include/pika_list.h +++ b/include/pika_list.h @@ -59,6 +59,7 @@ class LInsertCmd : public Cmd { storage::BeforeOrAfter dir_{storage::After}; std::string pivot_; std::string value_; + int64_t llen_ = 0; // For async mode: list length after insert void DoInitial() override; rocksdb::Status s_; }; @@ -149,6 +150,7 @@ class LPopCmd : public Cmd { private: std::string key_; std::int64_t count_ = 1; + std::vector elements_; // For async mode: popped elements void DoInitial() override; rocksdb::Status s_; }; @@ -171,6 +173,7 @@ class LPushCmd : public BlockingBaseCmd { private: std::string key_; std::vector values_; + uint64_t llen_ = 0; // For async mode: list length after push rocksdb::Status s_; void DoInitial() override; void Clear() override { values_.clear(); } @@ -194,6 +197,7 @@ class LPushxCmd : public Cmd { private: std::string key_; + uint64_t llen_ = 0; // For async mode: list length after push rocksdb::Status s_; std::vector values_; void DoInitial() override; @@ -244,6 +248,7 @@ class LRemCmd : public Cmd { std::string key_; int64_t count_ = 0; std::string value_; + uint64_t removed_count_ = 0; // For async mode: number of elements removed rocksdb::Status s_; void DoInitial() override; }; @@ -334,6 +339,7 @@ class RPopCmd : public Cmd { private: std::string key_; std::int64_t count_ = 1; + std::vector elements_; // For async mode: popped elements void DoInitial() override; rocksdb::Status s_; }; @@ -400,6 +406,7 @@ class RPushCmd : public BlockingBaseCmd { private: std::string key_; std::vector values_; + uint64_t llen_ = 0; // For async mode: list length after push rocksdb::Status s_; void DoInitial() override; void Clear() override { values_.clear(); } @@ -425,6 +432,7 @@ class RPushxCmd : public Cmd { std::string key_; 
std::string value_; std::vector values_; + uint64_t llen_ = 0; // For async mode: list length after push rocksdb::Status s_; void DoInitial() override; }; diff --git a/include/pika_raft.h b/include/pika_raft.h new file mode 100644 index 0000000000..4ab4cc1614 --- /dev/null +++ b/include/pika_raft.h @@ -0,0 +1,79 @@ +// Copyright (c) 2015-present, Qihoo, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef PIKA_RAFT_H_ +#define PIKA_RAFT_H_ + +#include +#include +#include + +// Include praft.h to get complete RaftManager definition +// This must come before pika_server.h to avoid incomplete type errors +#include "praft/praft.h" + +#include "include/acl.h" +#include "include/pika_command.h" + +/* + * Raft Commands + */ + +// RAFT.CLUSTER [INIT|INFO] [args...] +// INIT: RAFT.CLUSTER INIT [peer1,peer2,peer3] (peers optional, no peers = prepare for cluster expansion) +// INFO: RAFT.CLUSTER INFO [db_name] +class RaftClusterCmd : public Cmd { + public: + RaftClusterCmd(const std::string& name, int arity, uint32_t flag) + : Cmd(name, arity, flag, static_cast(AclCategory::ADMIN)) {} + + void Do() override; + void Split(const HintKeys& hint_keys) override {}; + void Merge() override {}; + Cmd* Clone() override { return new RaftClusterCmd(*this); } + + private: + enum class Operation { INIT, INFO, UNKNOWN }; + + void DoInitial() override; + void Clear() override { + operation_ = Operation::UNKNOWN; + db_name_.clear(); + args_.clear(); + } + + Operation operation_; + std::string db_name_; + std::vector args_; +}; + +// RAFT.NODE ADD|REMOVE peer_address [db_name] +class RaftNodeCmd : public Cmd { + public: + RaftNodeCmd(const std::string& name, int arity, uint32_t flag) + : Cmd(name, arity, flag, static_cast(AclCategory::ADMIN)) {} + + void Do() override; + void 
Split(const HintKeys& hint_keys) override {}; + void Merge() override {}; + Cmd* Clone() override { return new RaftNodeCmd(*this); } + + private: + enum class Operation { ADD, REMOVE, UNKNOWN }; + + void DoInitial() override; + void Clear() override { + operation_ = Operation::UNKNOWN; + peer_addr_.clear(); + db_name_.clear(); + } + + Operation operation_; + std::string peer_addr_; + std::string db_name_; +}; + +#endif // PIKA_RAFT_H_ + diff --git a/include/pika_server.h b/include/pika_server.h index 81cda87b04..fc684f6fc9 100644 --- a/include/pika_server.h +++ b/include/pika_server.h @@ -47,6 +47,11 @@ #include "include/pika_transaction.h" #include "include/rsync_server.h" +// Forward declare RaftManager to avoid circular dependency +namespace pika_raft { +class RaftManager; +} + extern std::unique_ptr g_pika_conf; enum TaskType { @@ -66,6 +71,8 @@ enum TaskType { kCompactRangeSets, kCompactRangeZSets, kCompactRangeList, + kLoadDBFromCheckpoint, + kCreateCheckpoint, }; struct TaskArg { @@ -516,6 +523,12 @@ class PikaServer : public pstd::noncopyable { exec_stat_map.insert(std::make_pair(cmd_name, 0)); } } + + /* + * Raft used + */ + pika_raft::RaftManager* GetRaftManager() { return raft_manager_.get(); } + private: /* * TimingTask use @@ -659,6 +672,11 @@ class PikaServer : public pstd::noncopyable { * fast and slow thread pools */ bool slow_cmd_thread_pool_flag_; + + /* + * Raft used + */ + std::unique_ptr raft_manager_; }; #endif diff --git a/include/pika_set.h b/include/pika_set.h index c4b8eb2031..de8b5afd9e 100644 --- a/include/pika_set.h +++ b/include/pika_set.h @@ -32,6 +32,7 @@ class SAddCmd : public Cmd { private: std::string key_; std::vector members_; + int32_t added_count_ = 0; // For async mode: number of members added rocksdb::Status s_; void DoInitial() override; }; @@ -220,6 +221,7 @@ class SUnionstoreCmd : public SetOperationCmd { private: void DoInitial() override; + int32_t result_count_ = 0; // For async mode: number of members in result 
rocksdb::Status s_; }; @@ -249,6 +251,7 @@ class SInterstoreCmd : public SetOperationCmd { private: void DoInitial() override; + int32_t result_count_ = 0; // For async mode: number of members in result rocksdb::Status s_; }; @@ -301,6 +304,7 @@ class SDiffstoreCmd : public SetOperationCmd { Cmd* Clone() override { return new SDiffstoreCmd(*this); } private: + int32_t result_count_ = 0; // For async mode: number of members in result rocksdb::Status s_; void DoInitial() override; }; @@ -337,6 +341,7 @@ class SMoveCmd : public Cmd { std::shared_ptr srem_cmd_; std::shared_ptr sadd_cmd_; int32_t move_success_{0}; + rocksdb::Status s_; }; class SRandmemberCmd : public Cmd { diff --git a/include/pika_stream.h b/include/pika_stream.h index bf61a96c6b..f24fe2f88f 100644 --- a/include/pika_stream.h +++ b/include/pika_stream.h @@ -40,6 +40,8 @@ class XAddCmd : public Cmd { std::string key_; storage::StreamAddTrimArgs args_; int field_pos_{0}; + std::string serialized_message_; + rocksdb::Status s_; void DoInitial() override; }; @@ -57,6 +59,7 @@ class XDelCmd : public Cmd { private: std::string key_; std::vector ids_; + rocksdb::Status s_; void DoInitial() override; void Clear() override { ids_.clear(); } @@ -133,6 +136,7 @@ class XTrimCmd : public Cmd { private: std::string key_; storage::StreamAddTrimArgs args_; + rocksdb::Status s_; void DoInitial() override; }; diff --git a/include/pika_zset.h b/include/pika_zset.h index b4e5726233..3f2146fb43 100644 --- a/include/pika_zset.h +++ b/include/pika_zset.h @@ -33,6 +33,7 @@ class ZAddCmd : public Cmd { private: std::string key_; std::vector score_members; + int32_t added_count_ = 0; // For async mode rocksdb::Status s_; void DoInitial() override; }; @@ -103,7 +104,8 @@ class ZIncrbyCmd : public Cmd { private: std::string key_, member_; double by_ = .0f; - double score_ = .0f; + double score_ = .0f; // For async mode: result after increment + rocksdb::Status s_; void DoInitial() override; }; @@ -333,6 +335,7 @@ class 
ZUnionstoreCmd : public ZsetUIstoreParentCmd { private: void DoInitial() override; + int32_t result_count_ = 0; // For async mode // used for write binlog std::map value_to_dest_; rocksdb::Status s_; @@ -352,6 +355,7 @@ class ZInterstoreCmd : public ZsetUIstoreParentCmd { private: void DoInitial() override; + int32_t result_count_ = 0; // For async mode rocksdb::Status s_; // used for write binlog std::vector value_to_dest_; @@ -561,6 +565,7 @@ class ZRemrangebyscoreCmd : public Cmd { std::string key_, min_, max_; double min_score_ = 0, max_score_ = 0; bool left_close_ = true, right_close_ = true; + int32_t deleted_count_ = 0; // For async mode rocksdb::Status s_; void DoInitial() override; void Clear() override { left_close_ = right_close_ = true; } @@ -586,6 +591,7 @@ class ZRemrangebylexCmd : public Cmd { std::string key_, min_, max_; std::string min_member_, max_member_; bool left_close_ = true, right_close_ = true; + int32_t deleted_count_ = 0; // For async mode rocksdb::Status s_; void DoInitial() override; void Clear() override { left_close_ = right_close_ = true; } @@ -611,6 +617,8 @@ class ZPopmaxCmd : public Cmd { void DoInitial() override; std::string key_; int64_t count_ = 0; + std::vector score_members_; // For async mode: popped members + rocksdb::Status s_; }; class ZPopminCmd : public Cmd { @@ -633,6 +641,8 @@ class ZPopminCmd : public Cmd { void DoInitial() override; std::string key_; int64_t count_ = 0; + std::vector score_members_; // For async mode: popped members + rocksdb::Status s_; }; #endif diff --git a/protogen.cmake b/protogen.cmake index 895a15b175..f67cb2ea72 100644 --- a/protogen.cmake +++ b/protogen.cmake @@ -30,7 +30,7 @@ function(CUSTOM_PROTOBUF_GENERATE_CPP SRCS HDRS) "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h" COMMAND ${PROTOBUF_PROTOC} ARGS --cpp_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} - DEPENDS ${ABS_FIL} + DEPENDS ${ABS_FIL} protobuf COMMENT "Running C++ protocol buffer compiler on ${FIL}" 
VERBATIM) endforeach () diff --git a/src/pika_bit.cc b/src/pika_bit.cc index 1bbbdfd2cd..48c94a08ca 100644 --- a/src/pika_bit.cc +++ b/src/pika_bit.cc @@ -43,11 +43,41 @@ void BitSetCmd::DoInitial() { } void BitSetCmd::Do() { - std::string value; - int32_t bit_val = 0; - s_ = db_->storage()->SetBit(key_, bit_offset_, static_cast(on_), &bit_val); + bit_val_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.AppendInteger(static_cast(self->bit_val_)); + AddSlotKey("k", self->key_, self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + + s_ = db_->storage()->SetBit(key_, bit_offset_, static_cast(on_), &bit_val_, callback); + + if (callback) { + return; + } + if (s_.ok()) { - res_.AppendInteger(static_cast(bit_val)); + res_.AppendInteger(static_cast(bit_val_)); AddSlotKey("k", key_, db_); } else { res_.SetRes(CmdRes::kErrOther, s_.ToString()); diff --git a/src/pika_command.cc b/src/pika_command.cc index 7392ab7ddf..e6af1680bc 100644 --- a/src/pika_command.cc +++ b/src/pika_command.cc @@ -15,6 +15,7 @@ #include "include/pika_geo.h" #include "include/pika_hash.h" #include "include/pika_hyperloglog.h" +#include "include/pika_raft.h" #include "include/pika_kv.h" #include "include/pika_list.h" #include "include/pika_pubsub.h" @@ -158,6 +159,15 @@ void InitCmdTable(CmdTable* cmd_table) { std::unique_ptr lastsaveptr = std::make_unique(kCmdNameLastSave, 1, kCmdFlagsAdmin | kCmdFlagsRead | kCmdFlagsFast); 
cmd_table->insert(std::pair>(kCmdNameLastSave, std::move(lastsaveptr))); + // Raft Commands + std::unique_ptr raftclusterptr = + std::make_unique(kCmdNameRaftCluster, -2, kCmdFlagsRead | kCmdFlagsAdmin | kCmdFlagsSlow); + cmd_table->insert(std::pair>(kCmdNameRaftCluster, std::move(raftclusterptr))); + + std::unique_ptr raftnodeptr = + std::make_unique(kCmdNameRaftNode, -3, kCmdFlagsWrite | kCmdFlagsAdmin | kCmdFlagsSlow); + cmd_table->insert(std::pair>(kCmdNameRaftNode, std::move(raftnodeptr))); + #ifdef WITH_COMMAND_DOCS std::unique_ptr commandptr = std::make_unique(kCmdNameCommand, -1, kCmdFlagsRead | kCmdFlagsAdmin | kCmdFlagsSlow); @@ -881,7 +891,8 @@ void Cmd::InternalProcessCommand(const HintKeys& hint_keys) { uint64_t before_do_binlog_us = pstd::NowMicros(); this->command_duration_ms = (before_do_binlog_us - before_do_command_us) / 1000; - DoBinlog(); + DoBinlog(); + if (!IsSuspend()) { db_->DBUnlockShared(); @@ -889,6 +900,7 @@ void Cmd::InternalProcessCommand(const HintKeys& hint_keys) { if (is_write()) { record_lock.Unlock(current_key()); } + uint64_t end_us = pstd::NowMicros(); this->binlog_duration_ms = (end_us - before_do_binlog_us) / 1000; @@ -944,6 +956,12 @@ bool Cmd::DoReadCommandInCache() { void Cmd::DoBinlog() { + // 如果是 Raft 模式,跳过写 binlog(改用 Protobuf binlog) + if (g_pika_server->GetRaftManager()) { + return; + } + + // Traditional binlog path (non-Raft mode) if (res().ok() && is_write() && g_pika_conf->write_binlog()) { std::shared_ptr conn_ptr = GetConn(); std::shared_ptr resp_ptr = GetResp(); @@ -1092,3 +1110,20 @@ std::shared_ptr Cmd::GetResp() { return resp_.lock(); } void Cmd::SetStage(CmdStage stage) { stage_ = stage; } bool Cmd::IsCacheMissedInRtc() const { return cache_missed_in_rtc_; } void Cmd::SetCacheMissedInRtc(bool value) { cache_missed_in_rtc_ = value; } + +// Raft async mode helper functions +bool Cmd::IsRaftLeader() const { + if (!g_pika_server || !g_pika_server->GetRaftManager()) { + return false; + } + auto node = 
g_pika_server->GetRaftManager()->GetRaftNode(db_->GetDBName()); + return node && node->IsLeader(); +} + +bool Cmd::IsRaftEnabled() const { + return db_ && db_->storage() && db_->storage()->IsRaftEnabled(); +} + +bool Cmd::ShouldUseAsyncMode() const { + return IsRaftLeader() && IsRaftEnabled(); +} diff --git a/src/pika_conf.cc b/src/pika_conf.cc index 80116aa847..4b698219bf 100644 --- a/src/pika_conf.cc +++ b/src/pika_conf.cc @@ -703,6 +703,26 @@ int PikaConf::Load() { rsync_timeout_ms_.store(tmp_rsync_timeout_ms); } + // Raft configuration + std::string raft_enabled_str; + GetConfStr("raft-enabled", &raft_enabled_str); + raft_enabled_ = (raft_enabled_str == "yes"); + + GetConfStr("raft-group-id", &raft_group_id_); + if (raft_group_id_.empty()) { + raft_group_id_ = "pika_raft_group"; + } + + GetConfInt("raft-election-timeout-ms", &raft_election_timeout_ms_); + if (raft_election_timeout_ms_ <= 0) { + raft_election_timeout_ms_ = 1000; + } + + GetConfInt("raft-snapshot-interval-s", &raft_snapshot_interval_s_); + if (raft_snapshot_interval_s_ <= 0) { + raft_snapshot_interval_s_ = 3600; + } + return ret; } diff --git a/src/pika_db.cc b/src/pika_db.cc index 58c4f3bf77..80499092d1 100644 --- a/src/pika_db.cc +++ b/src/pika_db.cc @@ -67,6 +67,75 @@ void DB::BgSaveDB() { g_pika_server->BGSaveTaskSchedule(&DoBgSave, static_cast(bg_task_arg)); } +pstd::Status DB::CreateCheckpoint(const std::string& checkpoint_dir) { + std::string checkpoint_sub_path = checkpoint_dir; + if (!checkpoint_sub_path.empty() && checkpoint_sub_path.back() != '/') { + checkpoint_sub_path.push_back('/'); + } + checkpoint_sub_path += db_name_; + + if (!pstd::FileExists(checkpoint_sub_path)) { + if (pstd::CreatePath(checkpoint_sub_path, 0755) != 0) { + return Status::IOError("Failed to create checkpoint path", checkpoint_sub_path); + } + } + + std::shared_lock guard(dbs_rw_); + auto tasks = storage_->CreateCheckpoint(checkpoint_sub_path); + for (auto& task : tasks) { + auto status = task.get(); + if 
(!status.ok()) { + return Status::Corruption("Create checkpoint failed: " + status.ToString()); + } + } + return Status::OK(); +} + +pstd::Status DB::LoadDBFromCheckpoint(const std::string& checkpoint_dir) { + std::string checkpoint_sub_path = checkpoint_dir; + if (!checkpoint_sub_path.empty() && checkpoint_sub_path.back() != '/') { + checkpoint_sub_path.push_back('/'); + } + checkpoint_sub_path += db_name_; + + if (!pstd::FileExists(checkpoint_sub_path)) { + return Status::NotFound("Checkpoint dir does not exist: " + checkpoint_sub_path); + } + + std::lock_guard guard(dbs_rw_); + opened_ = false; + + auto old_storage = storage_; + storage_.reset(); + if (old_storage) { + old_storage->Close(); + } + + storage_ = std::make_shared(); + auto checkpoint_tasks = storage_->LoadCheckpoint(checkpoint_sub_path, db_path_); + for (auto& task : checkpoint_tasks) { + auto status = task.get(); + if (!status.ok()) { + storage_.reset(); + return Status::Corruption("Load checkpoint failed: " + status.ToString()); + } + } + + storage::StorageOptions storage_options = g_pika_server->storage_options(); + auto open_status = storage_->Open(storage_options, db_path_); + if (!open_status.ok()) { + storage_.reset(); + return Status::Corruption("Storage open failed: " + open_status.ToString()); + } + + if (!g_pika_conf->raft_enabled()) { + storage_->DisableWal(false); + } + + opened_ = true; + return Status::OK(); +} + void DB::SetBinlogIoError() { return binlog_io_error_.store(true); } void DB::SetBinlogIoErrorrelieve() { return binlog_io_error_.store(false); } bool DB::IsBinlogIoError() { return binlog_io_error_.load(); } diff --git a/src/pika_geo.cc b/src/pika_geo.cc index 0649b5f008..3439fa8044 100644 --- a/src/pika_geo.cc +++ b/src/pika_geo.cc @@ -10,6 +10,9 @@ #include "pstd/include/pstd_string.h" #include "include/pika_geohash_helper.h" +#include "include/pika_client_conn.h" +#include "include/pika_slot_command.h" +#include "src/storage/include/storage/batch.h" #include 
"rocksdb/status.h" void GeoAddCmd::DoInitial() { @@ -56,12 +59,46 @@ void GeoAddCmd::Do() { pstd::string2d(str_bits.data(), str_bits.size(), &score); score_members.push_back({score, geo_point.member}); } + int32_t count = 0; - rocksdb::Status s = db_->storage()->ZAdd(key_, score_members, &count); - if (s.ok()) { + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.AppendInteger(self->count_); + AddSlotKey("g", self->key_, self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + + s_ = db_->storage()->ZAdd(key_, score_members, &count, callback); + count_ = count; // Store count for async callback + + if (callback) { + return; + } + + if (s_.ok()) { res_.AppendInteger(count); + AddSlotKey("g", key_, db_); } else { - res_.SetRes(CmdRes::kErrOther, s.ToString()); + res_.SetRes(CmdRes::kErrOther, s_.ToString()); } } @@ -229,14 +266,13 @@ void GeoHashCmd::Do() { buf[i] = geoalphabet[idx]; } buf[11] = '\0'; - res_.AppendStringLen(11); - res_.AppendContent(buf); + res_.AppendString(std::string(buf)); continue; } else if (s.IsNotFound()) { res_.AppendStringLen(-1); continue; } else { - res_.SetRes(CmdRes::kErrOther, s.ToString()); + res_.AppendStringLen(-1); // Changed to append nil instead of setting error for the whole response continue; } } diff --git a/src/pika_hash.cc b/src/pika_hash.cc index 62694d3932..9c599beb53 100644 --- a/src/pika_hash.cc +++ b/src/pika_hash.cc @@ -26,8 +26,39 @@ void HDelCmd::DoInitial() { } void HDelCmd::Do() { + deleted_ = 
0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok() || status.IsNotFound()) { + self->res_.AppendInteger(self->deleted_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->HDel(key_, fields_, &deleted_); + s_ = db_->storage()->HDel(key_, fields_, &deleted_, callback); + + if (callback) { + return; + } + if (s_.ok() || s_.IsNotFound()) { res_.AppendInteger(deleted_); } else { @@ -73,18 +104,55 @@ void HSetCmd::DoInitial() { } void HSetCmd::Do() { + ret_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.AppendContent(":" + std::to_string(self->ret_)); + AddSlotKey("h", self->key_, self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); if (argv_.size() == 4) { - int32_t count = 0; - s_ = db_->storage()->HSet(key_, field_, value_, &count); + s_ = db_->storage()->HSet(key_, field_, value_, &ret_, callback); + + if 
(callback) { + return; + } + if (s_.ok()) { - res_.AppendContent(":" + std::to_string(count)); + res_.AppendContent(":" + std::to_string(ret_)); AddSlotKey("h", key_, db_); } else { res_.SetRes(CmdRes::kErrOther, s_.ToString()); } } else if (argv_.size() > 4 && argv_.size() % 2 == 0) { - s_ = db_->storage()->HMSet(key_, fields_values_); + s_ = db_->storage()->HMSet(key_, fields_values_, callback); + + if (callback) { + ret_ = static_cast(fields_values_.size()); + return; + } + if (s_.ok()) { res_.AppendContent(":" + std::to_string(fields_values_.size())); AddSlotKey("h", key_, db_); @@ -359,11 +427,47 @@ void HIncrbyCmd::DoInitial() { } void HIncrbyCmd::Do() { - int64_t new_value = 0; + new_value_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok() || status.IsNotFound()) { + self->res_.AppendContent(":" + std::to_string(self->new_value_)); + AddSlotKey("h", self->key_, self->db_); + } else if (status.IsCorruption() && status.ToString() == "Corruption: hash value is not an integer") { + self->res_.SetRes(CmdRes::kInvalidInt); + } else if (status.IsInvalidArgument()) { + self->res_.SetRes(CmdRes::kOverFlow); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->HIncrby(key_, field_, by_, &new_value); + s_ = db_->storage()->HIncrby(key_, field_, by_, &new_value_, callback); + + if (callback) { + return; // Async mode, response will be sent in callback + } + + // Sync mode fallback if (s_.ok() || 
s_.IsNotFound()) { - res_.AppendContent(":" + std::to_string(new_value)); + res_.AppendContent(":" + std::to_string(new_value_)); AddSlotKey("h", key_, db_); } else if (s_.IsCorruption() && s_.ToString() == "Corruption: hash value is not an integer") { res_.SetRes(CmdRes::kInvalidInt); @@ -397,12 +501,49 @@ void HIncrbyfloatCmd::DoInitial() { } void HIncrbyfloatCmd::Do() { - std::string new_value; + new_value_.clear(); + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.AppendStringLenUint64(self->new_value_.size()); + self->res_.AppendContent(self->new_value_); + AddSlotKey("h", self->key_, self->db_); + } else if (status.IsCorruption() && status.ToString() == "Corruption: value is not a vaild float") { + self->res_.SetRes(CmdRes::kInvalidFloat); + } else if (status.IsInvalidArgument()) { + self->res_.SetRes(CmdRes::kOverFlow); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->HIncrbyfloat(key_, field_, by_, &new_value); + s_ = db_->storage()->HIncrbyfloat(key_, field_, by_, &new_value_, callback); + + if (callback) { + return; // Async mode, response will be sent in callback + } + + // Sync mode fallback if (s_.ok()) { - res_.AppendStringLenUint64(new_value.size()); - res_.AppendContent(new_value); + res_.AppendStringLenUint64(new_value_.size()); + res_.AppendContent(new_value_); AddSlotKey("h", key_, db_); } else if (s_.IsCorruption() && s_.ToString() == "Corruption: value is 
not a vaild float") { res_.SetRes(CmdRes::kInvalidFloat); @@ -611,8 +752,39 @@ void HMsetCmd::DoInitial() { } void HMsetCmd::Do() { + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.SetRes(CmdRes::kOk); + AddSlotKey("h", self->key_, self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->HMSet(key_, fvs_); + s_ = db_->storage()->HMSet(key_, fvs_, callback); + + if (callback) { + return; + } + if (s_.ok()) { res_.SetRes(CmdRes::kOk); AddSlotKey("h", key_, db_); @@ -644,11 +816,42 @@ void HSetnxCmd::DoInitial() { } void HSetnxCmd::Do() { - int32_t ret = 0; + ret_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.AppendContent(":" + std::to_string(self->ret_)); + AddSlotKey("h", self->key_, self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->HSetnx(key_, field_, value_, 
&ret); + s_ = db_->storage()->HSetnx(key_, field_, value_, &ret_, callback); + + if (callback) { + return; + } + if (s_.ok()) { - res_.AppendContent(":" + std::to_string(ret)); + res_.AppendContent(":" + std::to_string(ret_)); AddSlotKey("h", key_, db_); } else { res_.SetRes(CmdRes::kErrOther, s_.ToString()); diff --git a/src/pika_hyperloglog.cc b/src/pika_hyperloglog.cc index ac3c0c12aa..9b64673d9e 100644 --- a/src/pika_hyperloglog.cc +++ b/src/pika_hyperloglog.cc @@ -4,6 +4,10 @@ // of patent rights can be found in the PATENTS file in the same directory. #include "include/pika_hyperloglog.h" +#include "include/pika_client_conn.h" +#include "include/pika_slot_command.h" +#include "storage/include/storage/storage.h" +#include "storage/include/storage/batch.h" void PfAddCmd::DoInitial() { if (!CheckArg(argv_.size())) { @@ -20,14 +24,48 @@ void PfAddCmd::DoInitial() { } void PfAddCmd::Do() { - bool update = false; - rocksdb::Status s = db_->storage()->PfAdd(key_, values_, &update); - if (s.ok() && update) { + update_ = false; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok() && self->update_) { + self->res_.AppendInteger(1); + AddSlotKey("h", self->key_, self->db_); + } else if (status.ok() && !self->update_) { + self->res_.AppendInteger(0); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + + s_ = db_->storage()->PfAdd(key_, values_, &update_, callback); + + if (callback) { + return; + } + + if (s_.ok() && update_) { res_.AppendInteger(1); - } else if (s.ok() && 
!update) { + AddSlotKey("h", key_, db_); + } else if (s_.ok() && !update_) { res_.AppendInteger(0); } else { - res_.SetRes(CmdRes::kErrOther, s.ToString()); + res_.SetRes(CmdRes::kErrOther, s_.ToString()); } } @@ -64,11 +102,43 @@ void PfMergeCmd::DoInitial() { } void PfMergeCmd::Do() { - rocksdb::Status s = db_->storage()->PfMerge(keys_, value_to_dest_); - if (s.ok()) { + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.SetRes(CmdRes::kOk); + AddSlotKey("h", self->keys_[0], self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + + s_ = db_->storage()->PfMerge(keys_, value_to_dest_, callback); + + if (callback) { + return; + } + + if (s_.ok()) { res_.SetRes(CmdRes::kOk); + AddSlotKey("h", keys_[0], db_); } else { - res_.SetRes(CmdRes::kErrOther, s.ToString()); + res_.SetRes(CmdRes::kErrOther, s_.ToString()); } } void PfMergeCmd::DoBinlog() { diff --git a/src/pika_kv.cc b/src/pika_kv.cc index 1c1abdd4cf..9d4cdedf4b 100644 --- a/src/pika_kv.cc +++ b/src/pika_kv.cc @@ -12,8 +12,11 @@ #include "include/pika_cache.h" #include "include/pika_conf.h" #include "include/pika_slot_command.h" +#include "include/pika_server.h" +#include "praft/praft.h" extern std::unique_ptr g_pika_conf; +extern PikaServer* g_pika_server; /* SET key value [NX] [XX] [EX ] [PX ] */ void SetCmd::DoInitial() { if (!CheckArg(argv_.size())) { @@ -67,24 +70,68 @@ void SetCmd::DoInitial() { void SetCmd::Do() { int32_t res = 1; STAGE_TIMER_GUARD(storage_duration_ms, true); + + 
storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + int32_t result = (status.ok() || status.IsNotFound()) ? 1 : 0; + + if (status.ok() || status.IsNotFound()) { + if (self->condition_ == SetCmd::kVX) { + self->res_.AppendInteger(self->success_); + } else { + if (result == 1) { + self->res_.SetRes(CmdRes::kOk); + AddSlotKey("k", self->key_, self->db_); + } else { + self->res_.AppendStringLen(-1); + } + } + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + + // Call storage layer with optional callback switch (condition_) { case SetCmd::kXX: - s_ = db_->storage()->Setxx(key_, value_, &res, static_cast(sec_)); + s_ = db_->storage()->Setxx(key_, value_, &res, static_cast(sec_), callback); break; case SetCmd::kNX: - s_ = db_->storage()->Setnx(key_, value_, &res, static_cast(sec_)); + s_ = db_->storage()->Setnx(key_, value_, &res, static_cast(sec_), callback); break; case SetCmd::kVX: - s_ = db_->storage()->Setvx(key_, target_, value_, &success_, static_cast(sec_)); + s_ = db_->storage()->Setvx(key_, value_, target_, &success_, static_cast(sec_), callback); break; case SetCmd::kEXORPX: - s_ = db_->storage()->Setex(key_, value_, static_cast(sec_)); + s_ = db_->storage()->Setex(key_, value_, static_cast(sec_), callback); break; default: - s_ = db_->storage()->Set(key_, value_); + s_ = db_->storage()->Set(key_, value_, callback); break; } + // For async mode, response is handled by callback + if (callback) { + return; + } + + // For sync mode, set response immediately if (s_.ok() || 
s_.IsNotFound()) { if (condition_ == SetCmd::kVX) { res_.AppendInteger(success_); @@ -277,8 +324,45 @@ void IncrCmd::DoInitial() { } void IncrCmd::Do() { + new_value_ = 0; + expired_timestamp_sec_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.AppendContent(":" + std::to_string(self->new_value_)); + AddSlotKey("k", self->key_, self->db_); + } else if (status.IsCorruption() && status.ToString() == "Corruption: Value is not a integer") { + self->res_.SetRes(CmdRes::kInvalidInt); + } else if (status.IsInvalidArgument()) { + self->res_.SetRes(CmdRes::kOverFlow); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->Incrby(key_, 1, &new_value_, &expired_timestamp_sec_); + s_ = db_->storage()->Incrby(key_, 1, &new_value_, &expired_timestamp_sec_, callback); + + if (callback) { + return; + } + if (s_.ok()) { res_.AppendContent(":" + std::to_string(new_value_)); AddSlotKey("k", key_, db_); @@ -342,8 +426,45 @@ void IncrbyCmd::DoInitial() { } void IncrbyCmd::Do() { + new_value_ = 0; + expired_timestamp_sec_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, 
pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.AppendContent(":" + std::to_string(self->new_value_)); + AddSlotKey("k", self->key_, self->db_); + } else if (status.IsCorruption() && status.ToString() == "Corruption: Value is not a integer") { + self->res_.SetRes(CmdRes::kInvalidInt); + } else if (status.IsInvalidArgument()) { + self->res_.SetRes(CmdRes::kOverFlow); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->Incrby(key_, by_, &new_value_, &expired_timestamp_sec_); + s_ = db_->storage()->Incrby(key_, by_, &new_value_, &expired_timestamp_sec_, callback); + + if (callback) { + return; + } + if (s_.ok()) { res_.AppendContent(":" + std::to_string(new_value_)); AddSlotKey("k", key_, db_); @@ -408,8 +529,46 @@ void IncrbyfloatCmd::DoInitial() { } void IncrbyfloatCmd::Do() { + new_value_.clear(); + expired_timestamp_sec_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.AppendStringLenUint64(self->new_value_.size()); + self->res_.AppendContent(self->new_value_); + AddSlotKey("k", self->key_, self->db_); + } else if (status.IsCorruption() && status.ToString() == "Corruption: Value is not a vaild float") { + self->res_.SetRes(CmdRes::kInvalidFloat); + } else if (status.IsInvalidArgument()) { + self->res_.SetRes(CmdRes::KIncrByOverFlow); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = 
std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->Incrbyfloat(key_, value_, &new_value_, &expired_timestamp_sec_); + s_ = db_->storage()->Incrbyfloat(key_, value_, &new_value_, &expired_timestamp_sec_, callback); + + if (callback) { + return; + } + if (s_.ok()) { res_.AppendStringLenUint64(new_value_.size()); res_.AppendContent(new_value_); @@ -473,8 +632,44 @@ void DecrCmd::DoInitial() { } void DecrCmd::Do() { + new_value_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.AppendContent(":" + std::to_string(self->new_value_)); + } else if (status.IsCorruption() && status.ToString() == "Corruption: Value is not a integer") { + self->res_.SetRes(CmdRes::kInvalidInt); + } else if (status.IsInvalidArgument()) { + self->res_.SetRes(CmdRes::kOverFlow); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_= db_->storage()->Decrby(key_, 1, &new_value_); + s_ = db_->storage()->Decrby(key_, 1, &new_value_, callback); + + if (callback) { + return; + } + + // For sync mode, handle response immediately if (s_.ok()) { res_.AppendContent(":" + std::to_string(new_value_)); } else if (s_.IsCorruption() && s_.ToString() == "Corruption: Value is not a integer") { @@ -511,8 +706,44 @@ void DecrbyCmd::DoInitial() { } void DecrbyCmd::Do() { + new_value_ = 0; + storage::CommitCallback 
callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + AddSlotKey("k", self->key_, self->db_); + self->res_.AppendContent(":" + std::to_string(self->new_value_)); + } else if (status.IsCorruption() && status.ToString() == "Corruption: Value is not a integer") { + self->res_.SetRes(CmdRes::kInvalidInt); + } else if (status.IsInvalidArgument()) { + self->res_.SetRes(CmdRes::kOverFlow); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->Decrby(key_, by_, &new_value_); + s_ = db_->storage()->Decrby(key_, by_, &new_value_, callback); + + if (callback) { + return; + } + if (s_.ok()) { AddSlotKey("k", key_, db_); res_.AppendContent(":" + std::to_string(new_value_)); @@ -547,15 +778,53 @@ void GetsetCmd::DoInitial() { } void GetsetCmd::Do() { - std::string old_value; + old_value_.clear(); + + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok() || status.IsNotFound()) { + if (self->old_value_.empty()) { + self->res_.AppendStringLen(-1); + } else { + self->res_.AppendStringLenUint64(self->old_value_.size()); + self->res_.AppendContent(self->old_value_); + } + AddSlotKey("k", 
self->key_, self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->GetSet(key_, new_value_, &old_value); - if (s_.ok()) { - if (old_value.empty()) { - res_.AppendContent("$-1"); + s_ = db_->storage()->GetSet(key_, new_value_, &old_value_, callback); + + if (callback) { + return; + } + + // For sync mode, handle response immediately + if (s_.ok() || s_.IsNotFound()) { + if (old_value_.empty()) { + res_.AppendStringLen(-1); } else { - res_.AppendStringLenUint64(old_value.size()); - res_.AppendContent(old_value); + res_.AppendStringLenUint64(old_value_.size()); + res_.AppendContent(old_value_); } AddSlotKey("k", key_, db_); } else { @@ -585,11 +854,46 @@ void AppendCmd::DoInitial() { } void AppendCmd::Do() { - int32_t new_len = 0; + new_len_ = 0; + expired_timestamp_sec_ = 0; + new_value_.clear(); + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok() || status.IsNotFound()) { + self->res_.AppendInteger(self->new_len_); + AddSlotKey("k", self->key_, self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->Append(key_, value_, &new_len, &expired_timestamp_sec_, new_value_); + s_ = db_->storage()->Append(key_, value_, &new_len_, &expired_timestamp_sec_, new_value_, callback); 
+ + if (callback) { + // For async mode, response will be handled by callback + return; + } + + // For sync mode, handle response immediately if (s_.ok() || s_.IsNotFound()) { - res_.AppendInteger(new_len); + res_.AppendInteger(new_len_); AddSlotKey("k", key_, db_); } else { res_.SetRes(CmdRes::kErrOther, s_.ToString()); @@ -833,8 +1137,39 @@ void SetnxCmd::DoInitial() { void SetnxCmd::Do() { success_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.AppendInteger(self->success_); + AddSlotKey("k", self->key_, self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->Setnx(key_, value_, &success_); + s_ = db_->storage()->Setnx(key_, value_, &success_, 0, callback); + + if (callback) { + return; + } + if (s_.ok()) { res_.AppendInteger(success_); AddSlotKey("k", key_, db_); @@ -876,8 +1211,39 @@ void SetexCmd::DoInitial() { } void SetexCmd::Do() { + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.SetRes(CmdRes::kOk); + AddSlotKey("k", self->key_, self->db_); + } else { + 
self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->Setex(key_, value_, static_cast(sec_)); + s_ = db_->storage()->Setex(key_, value_, static_cast(sec_), callback); + + if (callback) { + return; + } + if (s_.ok()) { res_.SetRes(CmdRes::kOk); AddSlotKey("k", key_, db_); @@ -937,10 +1303,42 @@ void PsetexCmd::DoInitial() { } void PsetexCmd::Do() { + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.SetRes(CmdRes::kOk); + AddSlotKey("k", self->key_, self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->Setex(key_, value_, static_cast(usec_ / 1000)); + s_ = db_->storage()->Setex(key_, value_, static_cast(usec_ / 1000), callback); + + if (callback) { + return; + } + if (s_.ok()) { res_.SetRes(CmdRes::kOk); + AddSlotKey("k", key_, db_); } else { res_.SetRes(CmdRes::kErrOther, s_.ToString()); } @@ -1019,13 +1417,45 @@ void MsetCmd::DoInitial() { } void MsetCmd::Do() { + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + 
return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.SetRes(CmdRes::kOk); + for (const auto& it : self->kvs_) { + AddSlotKey("k", it.key, self->db_); + } + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->MSet(kvs_); + s_ = db_->storage()->MSet(kvs_, callback); + + if (callback) { + return; + } + if (s_.ok()) { res_.SetRes(CmdRes::kOk); - std::vector::const_iterator it; - for (it = kvs_.begin(); it != kvs_.end(); it++) { - AddSlotKey("k", it->key, db_); + for (const auto& it : kvs_) { + AddSlotKey("k", it.key, db_); } } else { res_.SetRes(CmdRes::kErrOther, s_.ToString()); @@ -1107,13 +1537,45 @@ void MsetnxCmd::DoInitial() { void MsetnxCmd::Do() { success_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.AppendInteger(self->success_); + for (const auto& it : self->kvs_) { + AddSlotKey("k", it.key, self->db_); + } + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - rocksdb::Status s = db_->storage()->MSetnx(kvs_, &success_); + rocksdb::Status s = db_->storage()->MSetnx(kvs_, &success_, callback); + + if (callback) { + return; + } + if (s.ok()) { res_.AppendInteger(success_); - std::vector::const_iterator it; - for (it = 
kvs_.begin(); it != kvs_.end(); it++) { - AddSlotKey("k", it->key, db_); + for (const auto& it : kvs_) { + AddSlotKey("k", it.key, db_); } } else { res_.SetRes(CmdRes::kErrOther, s.ToString()); @@ -1229,11 +1691,42 @@ void SetrangeCmd::DoInitial() { } void SetrangeCmd::Do() { - int32_t new_len = 0; + new_len_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.AppendInteger(self->new_len_); + AddSlotKey("k", self->key_, self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->Setrange(key_, offset_, value_, &new_len); + s_ = db_->storage()->Setrange(key_, offset_, value_, &new_len_, callback); + + if (callback) { + return; + } + if (s_.ok()) { - res_.AppendInteger(new_len); + res_.AppendInteger(new_len_); AddSlotKey("k", key_, db_); } else { res_.SetRes(CmdRes::kErrOther, s_.ToString()); diff --git a/src/pika_list.cc b/src/pika_list.cc index 02b358ae87..f2538278e9 100644 --- a/src/pika_list.cc +++ b/src/pika_list.cc @@ -88,11 +88,42 @@ void LInsertCmd::DoInitial() { } void LInsertCmd::Do() { - int64_t llen = 0; + llen_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, 
resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok() || status.IsNotFound()) { + self->res_.AppendInteger(self->llen_); + AddSlotKey("l", self->key_, self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->LInsert(key_, dir_, pivot_, value_, &llen); + s_ = db_->storage()->LInsert(key_, dir_, pivot_, value_, &llen_, callback); + + if (callback) { + return; + } + if (s_.ok() || s_.IsNotFound()) { - res_.AppendInteger(llen); + res_.AppendInteger(llen_); AddSlotKey("l", key_, db_); } else { res_.SetRes(CmdRes::kErrOther, s_.ToString()); @@ -271,11 +302,47 @@ void LPushCmd::DoInitial() { } void LPushCmd::Do() { - uint64_t llen = 0; + llen_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.AppendInteger(static_cast(self->llen_)); + AddSlotKey("l", self->key_, self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + + // Handle blocking list pop operations + if (!pika_conn->IsInTxn()) { + self->TryToServeBLrPopWithThisKey(self->key_, self->db_); + } + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->LPush(key_, values_, &llen); + s_ = db_->storage()->LPush(key_, values_, &llen_, callback); + + if (callback) { + return; + } + if (s_.ok()) { - res_.AppendInteger(static_cast(llen)); + 
res_.AppendInteger(static_cast(llen_)); AddSlotKey("l", key_, db_); } else { res_.SetRes(CmdRes::kErrOther, s_.ToString()); @@ -427,17 +494,56 @@ void LPopCmd::DoInitial() { } void LPopCmd::Do() { - std::vector elements; + elements_.clear(); + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + if (self->elements_.size() > 1) { + self->res_.AppendArrayLenUint64(self->elements_.size()); + } + for (const auto& element : self->elements_) { + self->res_.AppendString(element); + } + AddSlotKey("l", self->key_, self->db_); + } else if (status.IsNotFound()) { + self->res_.AppendStringLen(-1); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->LPop(key_, count_, &elements); + s_ = db_->storage()->LPop(key_, count_, &elements_, callback); + + if (callback) { + return; // Async mode, response will be sent in callback + } + // Sync mode fallback if (s_.ok()) { - if (elements.size() > 1) { - res_.AppendArrayLenUint64(elements.size()); + if (elements_.size() > 1) { + res_.AppendArrayLenUint64(elements_.size()); } - for (const auto& element : elements) { + for (const auto& element : elements_) { res_.AppendString(element); } + AddSlotKey("l", key_, db_); } else if (s_.IsNotFound()) { res_.AppendStringLen(-1); } else { @@ -471,11 +577,42 @@ void LPushxCmd::DoInitial() { } void LPushxCmd::Do() { - uint64_t llen = 0; + llen_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + 
auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok() || status.IsNotFound()) { + self->res_.AppendInteger(static_cast(self->llen_)); + AddSlotKey("l", self->key_, self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->LPushx(key_, values_, &llen); + s_ = db_->storage()->LPushx(key_, values_, &llen_, callback); + + if (callback) { + return; + } + if (s_.ok() || s_.IsNotFound()) { - res_.AppendInteger(static_cast(llen)); + res_.AppendInteger(static_cast(llen_)); AddSlotKey("l", key_, db_); } else { res_.SetRes(CmdRes::kErrOther, s_.ToString()); @@ -572,11 +709,41 @@ void LRemCmd::DoInitial() { } void LRemCmd::Do() { - uint64_t res = 0; + removed_count_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok() || status.IsNotFound()) { + self->res_.AppendInteger(static_cast(self->removed_count_)); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->LRem(key_, count_, value_, &res); + s_ = 
db_->storage()->LRem(key_, count_, value_, &removed_count_, callback); + + if (callback) { + return; + } + if (s_.ok() || s_.IsNotFound()) { - res_.AppendInteger(static_cast(res)); + res_.AppendInteger(static_cast(removed_count_)); } else { res_.SetRes(CmdRes::kErrOther, s_.ToString()); } @@ -609,8 +776,43 @@ void LSetCmd::DoInitial() { } void LSetCmd::Do() { + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.SetRes(CmdRes::kOk); + AddSlotKey("l", self->key_, self->db_); + } else if (status.IsNotFound()) { + self->res_.SetRes(CmdRes::kNotFound); + } else if (status.IsCorruption() && status.ToString() == "Corruption: index out of range") { + self->res_.SetRes(CmdRes::kOutOfRange); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->LSet(key_, index_, value_); + s_ = db_->storage()->LSet(key_, index_, value_, callback); + + if (callback) { + return; + } + if (s_.ok()) { res_.SetRes(CmdRes::kOk); AddSlotKey("l", key_, db_); @@ -654,8 +856,38 @@ void LTrimCmd::DoInitial() { } void LTrimCmd::Do() { + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + 
if (status.ok() || status.IsNotFound()) { + self->res_.SetRes(CmdRes::kOk); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->LTrim(key_, start_, stop_); + s_ = db_->storage()->LTrim(key_, start_, stop_, callback); + + if (callback) { + return; + } + if (s_.ok() || s_.IsNotFound()) { res_.SetRes(CmdRes::kOk); } else { @@ -768,16 +1000,56 @@ void RPopCmd::DoInitial() { } void RPopCmd::Do() { - std::vector elements; + elements_.clear(); + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + if (self->elements_.size() > 1) { + self->res_.AppendArrayLenUint64(self->elements_.size()); + } + for (const auto& element : self->elements_) { + self->res_.AppendString(element); + } + AddSlotKey("l", self->key_, self->db_); + } else if (status.IsNotFound()) { + self->res_.AppendStringLen(-1); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->RPop(key_, count_, &elements); + s_ = db_->storage()->RPop(key_, count_, &elements_, callback); + + if (callback) { + return; // Async mode, response will be sent in callback + } + + // Sync mode fallback if (s_.ok()) { - if (elements.size() > 1) { - res_.AppendArrayLenUint64(elements.size()); + if (elements_.size() > 1) { + 
res_.AppendArrayLenUint64(elements_.size()); } - for (const auto &element: elements) { + for (const auto& element : elements_) { res_.AppendString(element); } + AddSlotKey("l", key_, db_); } else if (s_.IsNotFound()) { res_.AppendStringLen(-1); } else { @@ -811,24 +1083,57 @@ void RPopLPushCmd::DoInitial() { } void RPopLPushCmd::Do() { - std::string value; + value_poped_from_source_.clear(); + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + AddSlotKey("k", self->receiver_, self->db_); + self->res_.AppendString(self->value_poped_from_source_); + self->is_write_binlog_ = true; + self->TryToServeBLrPopWithThisKey(self->receiver_, self->db_); + } else if (status.IsNotFound()) { + self->res_.AppendStringLen(-1); + self->is_write_binlog_ = false; + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->RPoplpush(source_, receiver_, &value); + s_ = db_->storage()->RPoplpush(source_, receiver_, &value_poped_from_source_, callback); + + if (callback) { + return; // Async mode, response will be sent in callback + } + + // Sync mode fallback if (s_.ok()) { AddSlotKey("k", receiver_, db_); - res_.AppendString(value); - value_poped_from_source_ = value; + res_.AppendString(value_poped_from_source_); is_write_binlog_ = true; + TryToServeBLrPopWithThisKey(receiver_, db_); } else if (s_.IsNotFound()) { - // no actual write operation happened, will not write binlog res_.AppendStringLen(-1); is_write_binlog_ = 
false; - return; } else { res_.SetRes(CmdRes::kErrOther, s_.ToString()); - return; } - TryToServeBLrPopWithThisKey(receiver_, db_); } void RPopLPushCmd::ReadCache() { @@ -886,11 +1191,46 @@ void RPushCmd::DoInitial() { } void RPushCmd::Do() { - uint64_t llen = 0; + llen_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.AppendInteger(static_cast(self->llen_)); + AddSlotKey("l", self->key_, self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + + if (!pika_conn->IsInTxn()) { + self->TryToServeBLrPopWithThisKey(self->key_, self->db_); + } + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->RPush(key_, values_, &llen); + s_ = db_->storage()->RPush(key_, values_, &llen_, callback); + + if (callback) { + return; + } + if (s_.ok()) { - res_.AppendInteger(static_cast(llen)); + res_.AppendInteger(static_cast(llen_)); AddSlotKey("l", key_, db_); } else { res_.SetRes(CmdRes::kErrOther, s_.ToString()); @@ -928,11 +1268,42 @@ void RPushxCmd::DoInitial() { } void RPushxCmd::Do() { - uint64_t llen = 0; + llen_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok() || 
status.IsNotFound()) { + self->res_.AppendInteger(static_cast(self->llen_)); + AddSlotKey("l", self->key_, self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->RPushx(key_, values_, &llen); + s_ = db_->storage()->RPushx(key_, values_, &llen_, callback); + + if (callback) { + return; + } + if (s_.ok() || s_.IsNotFound()) { - res_.AppendInteger(static_cast(llen)); + res_.AppendInteger(static_cast(llen_)); AddSlotKey("l", key_, db_); } else { res_.SetRes(CmdRes::kErrOther, s_.ToString()); diff --git a/src/pika_raft.cc b/src/pika_raft.cc new file mode 100644 index 0000000000..3be2de4dfb --- /dev/null +++ b/src/pika_raft.cc @@ -0,0 +1,213 @@ +// Copyright (c) 2015-present, Qihoo, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "include/pika_raft.h" + +#include +#include +#include + +// Include praft.h before pika_server.h to get complete RaftManager definition +#include "praft/praft.h" +#include "include/pika_conf.h" +#include "include/pika_server.h" +#include "pstd/include/pstd_string.h" + +extern std::unique_ptr g_pika_conf; +extern std::unique_ptr g_pika_server; + +// Helper function to split string by delimiter +static std::vector SplitString(const std::string& str, char delimiter) { + std::vector result; + std::stringstream ss(str); + std::string item; + while (std::getline(ss, item, delimiter)) { + if (!item.empty()) { + result.push_back(item); + } + } + return result; +} + +// RaftClusterCmd implementation +void RaftClusterCmd::DoInitial() { + if (!CheckArg(argv_.size())) { + res_.SetRes(CmdRes::kWrongNum, name()); + return; + } + + // Parse operation + std::string op = argv_[1]; + std::transform(op.begin(), op.end(), op.begin(), ::toupper); + + if (op == "INIT") { + operation_ = Operation::INIT; + + if (argv_.size() >= 3 && !argv_[2].empty()) { + // Peers specified - multi-node cluster initialization + args_ = SplitString(argv_[2], ','); + if (args_.empty()) { + res_.SetRes(CmdRes::kInvalidParameter, "Invalid peers list"); + return; + } + db_name_ = (argv_.size() >= 4) ? argv_[3] : "db0"; + } else { + // No peers specified - prepare node for being added to cluster + // args_ remains empty, brpc server will start but no cluster config + // The node will wait to be added via RAFT.NODE ADD from another cluster's leader + db_name_ = (argv_.size() >= 3) ? argv_[2] : "db0"; + + LOG(INFO) << "Preparing Raft node (no initial cluster config), waiting to be added to an existing cluster"; + } + } else if (op == "INFO") { + operation_ = Operation::INFO; + db_name_ = (argv_.size() >= 3) ? 
argv_[2] : "db0"; + } else { + res_.SetRes(CmdRes::kInvalidParameter, "Unknown operation: " + op); + return; + } +} + +void RaftClusterCmd::Do() { + // Check if Raft is enabled + if (!g_pika_conf->raft_enabled()) { + res_.SetRes(CmdRes::kErrOther, "Raft is not enabled in configuration"); + return; + } + + auto raft_mgr = g_pika_server->GetRaftManager(); + if (!raft_mgr) { + res_.SetRes(CmdRes::kErrOther, "Raft manager not initialized"); + return; + } + + pstd::Status status; + + switch (operation_) { + case Operation::INIT: { + LOG(INFO) << "Initializing Raft cluster for DB: " << db_name_ + << " with peers: " << argv_[2]; + status = raft_mgr->InitCluster(db_name_, args_); + if (status.ok()) { + res_.AppendStringRaw("+OK\r\n"); + } else { + res_.SetRes(CmdRes::kErrOther, "Failed to initialize cluster: " + status.ToString()); + } + break; + } + + case Operation::INFO: { + std::string info; + status = raft_mgr->GetClusterInfo(db_name_, &info); + if (status.ok()) { + std::vector lines; + std::stringstream ss(info); + std::string line; + while (std::getline(ss, line)) { + if (!line.empty()) { + lines.push_back(line); + } + } + + res_.AppendArrayLen(lines.size()); + for (const auto& l : lines) { + res_.AppendStringLen(l.size()); + res_.AppendContent(l); + } + } else { + res_.SetRes(CmdRes::kErrOther, "Failed to get cluster info: " + status.ToString()); + } + break; + } + + default: + res_.SetRes(CmdRes::kErrOther, "Unknown operation"); + break; + } +} + +// RaftNodeCmd implementation +void RaftNodeCmd::DoInitial() { + if (!CheckArg(argv_.size())) { + res_.SetRes(CmdRes::kWrongNum, name()); + return; + } + + // Parse operation + std::string op = argv_[1]; + std::transform(op.begin(), op.end(), op.begin(), ::toupper); + + if (op == "ADD") { + operation_ = Operation::ADD; + } else if (op == "REMOVE") { + operation_ = Operation::REMOVE; + } else { + res_.SetRes(CmdRes::kInvalidParameter, "Unknown operation: " + op); + return; + } + + if (argv_.size() < 3) { + 
res_.SetRes(CmdRes::kWrongNum, "RAFT.NODE requires peer address"); + return; + } + + peer_addr_ = argv_[2]; + db_name_ = (argv_.size() >= 4) ? argv_[3] : "db0"; +} + +void RaftNodeCmd::Do() { + // Check if Raft is enabled + if (!g_pika_conf->raft_enabled()) { + res_.SetRes(CmdRes::kErrOther, "Raft is not enabled in configuration"); + return; + } + + auto raft_mgr = g_pika_server->GetRaftManager(); + if (!raft_mgr) { + res_.SetRes(CmdRes::kErrOther, "Raft manager not initialized"); + return; + } + + pstd::Status status; + + switch (operation_) { + case Operation::ADD: { + LOG(INFO) << "Adding node to Raft cluster, DB: " << db_name_ + << ", peer: " << peer_addr_; + status = raft_mgr->AddNode(db_name_, peer_addr_); + if (status.ok()) { + // Don't modify config file - braft manages cluster membership in raft_meta + // The raft-peers in config file is only used for initial bootstrap + LOG(INFO) << "Node added successfully to Raft cluster (managed by braft raft_meta)"; + + res_.AppendStringRaw("+OK\r\n"); + } else { + res_.SetRes(CmdRes::kErrOther, "Failed to add node: " + status.ToString()); + } + break; + } + + case Operation::REMOVE: { + LOG(INFO) << "Removing node from Raft cluster, DB: " << db_name_ + << ", peer: " << peer_addr_; + status = raft_mgr->RemoveNode(db_name_, peer_addr_); + if (status.ok()) { + // Don't modify config file - braft manages cluster membership in raft_meta + // The raft-peers in config file is only used for initial bootstrap + LOG(INFO) << "Node removed successfully from Raft cluster (managed by braft raft_meta)"; + + res_.AppendStringRaw("+OK\r\n"); + } else { + res_.SetRes(CmdRes::kErrOther, "Failed to remove node: " + status.ToString()); + } + break; + } + + default: + res_.SetRes(CmdRes::kErrOther, "Unknown operation"); + break; + } +} + diff --git a/src/pika_server.cc b/src/pika_server.cc index bbf444191d..e8ad6d2409 100644 --- a/src/pika_server.cc +++ b/src/pika_server.cc @@ -22,8 +22,10 @@ #include "include/pika_dispatch_thread.h" 
#include "include/pika_instant.h" #include "include/pika_monotonic_time.h" +#include "praft/praft.h" #include "include/pika_rm.h" #include "include/pika_server.h" +#include "binlog.pb.h" using pstd::Status; extern PikaServer* g_pika_server; @@ -103,6 +105,27 @@ PikaServer::PikaServer() acl_ = std::make_unique<::Acl>(); SetSlowCmdThreadPoolFlag(g_pika_conf->slow_cmd_pool()); + + // Initialize Raft if enabled + if (g_pika_conf->raft_enabled()) { + raft_manager_ = std::make_unique(); + auto status = raft_manager_->Init(); + if (!status.ok()) { + LOG(FATAL) << "Failed to initialize Raft manager: " << status.ToString(); + } + LOG(INFO) << "Raft manager initialized successfully"; + + std::lock_guard rwl(storage_options_rw_); + storage_options_.append_log_function = + [this](const ::pikiwidb::Binlog& binlog, std::promise&& promise, + storage::CommitCallback callback) { + std::string db_name = "db0"; + + raft_manager_->AppendLog(db_name, binlog, std::move(promise), callback); + }; + LOG(INFO) << "Raft append_log_function registered in storage_options"; + } + bgsave_thread_.set_thread_name("PikaServer::bgsave_thread_"); purge_thread_.set_thread_name("PikaServer::purge_thread_"); bgslots_cleanup_thread_.set_thread_name("PikaServer::bgslots_cleanup_thread_"); @@ -130,6 +153,12 @@ PikaServer::~PikaServer() { key_scan_thread_.StopThread(); pika_migrate_thread_->StopThread(); + // Shutdown Raft if running + if (raft_manager_) { + raft_manager_->Shutdown(); + LOG(INFO) << "Raft manager shutdown complete"; + } + dbs_.clear(); LOG(INFO) << "PikaServer " << pthread_self() << " exit!!!"; @@ -210,6 +239,16 @@ void PikaServer::Start() { << (ret == net::kCreateThreadError ? 
": create thread error " : ": other error"); } + // Start Raft if enabled + if (raft_manager_) { + auto status = raft_manager_->Start(); + if (!status.ok()) { + LOG(WARNING) << "Failed to start Raft manager: " << status.ToString(); + } else { + LOG(INFO) << "Raft manager started successfully"; + } + } + time(&start_time_s_); LOG(INFO) << "Pika Server going to start"; rsync_server_->Start(); @@ -458,6 +497,37 @@ Status PikaServer::DoSameThingSpecificDB(const std::set& dbs, const case TaskType::kCompactRangeList: db_item.second->CompactRange(storage::DataType::kLists, arg.argv[0], arg.argv[1]); break; + case TaskType::kLoadDBFromCheckpoint: { + // arg.argv[0] should contain checkpoint_path + if (arg.argv.empty()) { + LOG(ERROR) << "LoadDBFromCheckpoint requires checkpoint_path argument"; + return Status::InvalidArgument("Missing checkpoint_path"); + } + std::string checkpoint_path = arg.argv[0]; + auto s = db_item.second->LoadDBFromCheckpoint(checkpoint_path); + if (!s.ok()) { + LOG(ERROR) << "Failed to load DB from checkpoint: " << s.ToString(); + return s; + } + LOG(INFO) << "Successfully loaded DB " << db_item.first << " from checkpoint: " << checkpoint_path; + break; + } + case TaskType::kCreateCheckpoint: { + // arg.argv[0] should contain checkpoint_path + if (arg.argv.empty()) { + LOG(ERROR) << "CreateCheckpoint requires checkpoint_path argument"; + return Status::InvalidArgument("Missing checkpoint_path"); + } + std::string checkpoint_path = arg.argv[0]; + auto s = db_item.second->CreateCheckpoint(checkpoint_path); + if (!s.ok()) { + LOG(ERROR) << "Failed to create checkpoint: " << s.ToString(); + return s; + } + LOG(INFO) << "Successfully created checkpoint for DB " << db_item.first << " at: " << checkpoint_path; + break; + } + default: break; } diff --git a/src/pika_set.cc b/src/pika_set.cc index 6e764149e0..e0ac65258c 100644 --- a/src/pika_set.cc +++ b/src/pika_set.cc @@ -23,15 +23,46 @@ void SAddCmd::DoInitial() { } void SAddCmd::Do() { - int32_t count = 
0; + added_count_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.AppendInteger(self->added_count_); + AddSlotKey("s", self->key_, self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->SAdd(key_, members_, &count); + s_ = db_->storage()->SAdd(key_, members_, &added_count_, callback); + + if (callback) { + return; + } + if (!s_.ok()) { res_.SetRes(CmdRes::kErrOther, s_.ToString()); return; } AddSlotKey("s", key_, db_); - res_.AppendInteger(count); + res_.AppendInteger(added_count_); } void SAddCmd::DoThroughDB() { @@ -69,8 +100,52 @@ void SPopCmd::DoInitial() { } void SPopCmd::Do() { + members_.clear(); + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + if (self->argv_.size() == 2) { + self->res_.AppendStringLen(self->members_[0].size()); + self->res_.AppendContent(self->members_[0]); + } else { + self->res_.AppendArrayLenUint64(self->members_.size()); + for (const auto& member : self->members_) { + self->res_.AppendStringLenUint64(member.size()); + self->res_.AppendContent(member); + } + } + 
AddSlotKey("s", self->key_, self->db_); + } else if (status.IsNotFound()) { + self->res_.AppendContent("$-1"); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->SPop(key_, &members_, count_); + s_ = db_->storage()->SPop(key_, &members_, count_, callback); + + if (callback) { + return; // Async mode, response will be sent in callback + } + + // Sync mode fallback if (s_.ok()) { if (argv_.size() == 2) { res_.AppendStringLen(members_[0].size()); @@ -82,6 +157,7 @@ void SPopCmd::Do() { res_.AppendContent(member); } } + AddSlotKey("s", key_, db_); } else if (s_.IsNotFound()) { res_.AppendContent("$-1"); } else { @@ -291,8 +367,39 @@ void SRemCmd::DoInitial() { } void SRemCmd::Do() { + deleted_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok() || status.IsNotFound()) { + self->res_.AppendInteger(self->deleted_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->SRem(key_, members_, &deleted_); + s_ = db_->storage()->SRem(key_, members_, &deleted_, callback); + + if (callback) { + return; + } + if (s_.ok() || s_.IsNotFound()) { res_.AppendInteger(deleted_); } else { @@ -348,11 +455,43 @@ void SUnionstoreCmd::DoInitial() { } void SUnionstoreCmd::Do() { - int32_t count = 0; + 
result_count_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.AppendInteger(self->result_count_); + AddSlotKey("s", self->dest_key_, self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->SUnionstore(dest_key_, keys_, value_to_dest_, &count); + s_ = db_->storage()->SUnionstore(dest_key_, keys_, value_to_dest_, &result_count_, callback); + + if (callback) { + return; + } + if (s_.ok()) { - res_.AppendInteger(count); + res_.AppendInteger(result_count_); + AddSlotKey("s", dest_key_, db_); } else { res_.SetRes(CmdRes::kErrOther, s_.ToString()); } @@ -447,11 +586,43 @@ void SInterstoreCmd::DoInitial() { } void SInterstoreCmd::Do() { - int32_t count = 0; + result_count_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.AppendInteger(self->result_count_); + AddSlotKey("s", self->dest_key_, self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } 
+ STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->SInterstore(dest_key_, keys_, value_to_dest_, &count); + s_ = db_->storage()->SInterstore(dest_key_, keys_, value_to_dest_, &result_count_, callback); + + if (callback) { + return; + } + if (s_.ok()) { - res_.AppendInteger(count); + res_.AppendInteger(result_count_); + AddSlotKey("s", dest_key_, db_); } else { res_.SetRes(CmdRes::kErrOther, s_.ToString()); } @@ -553,11 +724,43 @@ void SDiffstoreCmd::DoInitial() { } void SDiffstoreCmd::Do() { - int32_t count = 0; + result_count_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.AppendInteger(self->result_count_); + AddSlotKey("s", self->dest_key_, self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->SDiffstore(dest_key_, keys_, value_to_dest_, &count); + s_ = db_->storage()->SDiffstore(dest_key_, keys_, value_to_dest_, &result_count_, callback); + + if (callback) { + return; + } + if (s_.ok()) { - res_.AppendInteger(count); + res_.AppendInteger(result_count_); + AddSlotKey("s", dest_key_, db_); } else { res_.SetRes(CmdRes::kErrOther, s_.ToString()); } @@ -587,12 +790,48 @@ void SMoveCmd::DoInitial() { } void SMoveCmd::Do() { - int32_t res = 0; + move_success_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto 
pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok() || status.IsNotFound()) { + self->res_.AppendInteger(self->move_success_); + if (self->move_success_) { + AddSlotKey("s", self->dest_key_, self->db_); + } + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->SMove(src_key_, dest_key_, member_, &res); + s_ = db_->storage()->SMove(src_key_, dest_key_, member_, &move_success_, callback); + + if (callback) { + return; // Async mode, response will be sent in callback + } + + // Sync mode fallback if (s_.ok() || s_.IsNotFound()) { - res_.AppendInteger(res); - move_success_ = res; + res_.AppendInteger(move_success_); + if (move_success_) { + AddSlotKey("s", dest_key_, db_); + } } else { res_.SetRes(CmdRes::kErrOther, s_.ToString()); } diff --git a/src/pika_zset.cc b/src/pika_zset.cc index 58d012d2d0..889addc72f 100644 --- a/src/pika_zset.cc +++ b/src/pika_zset.cc @@ -35,11 +35,42 @@ void ZAddCmd::DoInitial() { } void ZAddCmd::Do() { - int32_t count = 0; + added_count_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.AppendInteger(self->added_count_); + AddSlotKey("z", self->key_, self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = 
std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->ZAdd(key_, score_members, &count); + s_ = db_->storage()->ZAdd(key_, score_members, &added_count_, callback); + + if (callback) { + return; + } + if (s_.ok()) { - res_.AppendInteger(count); + res_.AppendInteger(added_count_); AddSlotKey("z", key_, db_); } else { res_.SetRes(CmdRes::kErrOther, s_.ToString()); @@ -167,18 +198,52 @@ void ZIncrbyCmd::DoInitial() { } void ZIncrbyCmd::Do() { - double score = 0.0; + score_ = 0.0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + char buf[32]; + int64_t len = pstd::d2string(buf, sizeof(buf), self->score_); + self->res_.AppendStringLen(len); + self->res_.AppendContent(buf); + AddSlotKey("z", self->key_, self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - rocksdb::Status s = db_->storage()->ZIncrby(key_, member_, by_, &score); - if (s.ok()) { - score_ = score; + s_ = db_->storage()->ZIncrby(key_, member_, by_, &score_, callback); + + if (callback) { + return; // Async mode, response will be sent in callback + } + + // Sync mode fallback + if (s_.ok()) { char buf[32]; - int64_t len = pstd::d2string(buf, sizeof(buf), score); + int64_t len = pstd::d2string(buf, sizeof(buf), score_); res_.AppendStringLen(len); res_.AppendContent(buf); AddSlotKey("z", key_, db_); } else { - 
res_.SetRes(CmdRes::kErrOther, s.ToString()); + res_.SetRes(CmdRes::kErrOther, s_.ToString()); } } @@ -714,8 +779,39 @@ void ZRemCmd::DoInitial() { } void ZRemCmd::Do() { + deleted_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok() || status.IsNotFound()) { + self->res_.AppendInteger(self->deleted_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->ZRem(key_, members_, &deleted_); + s_ = db_->storage()->ZRem(key_, members_, &deleted_, callback); + + if (callback) { + return; + } + if (s_.ok() || s_.IsNotFound()) { res_.AppendInteger(deleted_); } else { @@ -802,11 +898,42 @@ void ZUnionstoreCmd::DoInitial() { } void ZUnionstoreCmd::Do() { - int32_t count = 0; + result_count_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.AppendInteger(self->result_count_); + AddSlotKey("z", self->dest_key_, self->db_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + 
STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->ZUnionstore(dest_key_, keys_, weights_, aggregate_, value_to_dest_, &count); + s_ = db_->storage()->ZUnionstore(dest_key_, keys_, weights_, aggregate_, value_to_dest_, &result_count_, callback); + + if (callback) { + return; + } + if (s_.ok()) { - res_.AppendInteger(count); + res_.AppendInteger(result_count_); AddSlotKey("z", dest_key_, db_); } else { res_.SetRes(CmdRes::kErrOther, s_.ToString()); @@ -883,11 +1010,41 @@ void ZInterstoreCmd::DoInitial() { } void ZInterstoreCmd::Do() { - int32_t count = 0; + result_count_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok()) { + self->res_.AppendInteger(self->result_count_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->ZInterstore(dest_key_, keys_, weights_, aggregate_, value_to_dest_, &count); + s_ = db_->storage()->ZInterstore(dest_key_, keys_, weights_, aggregate_, value_to_dest_, &result_count_, callback); + + if (callback) { + return; + } + if (s_.ok()) { - res_.AppendInteger(count); + res_.AppendInteger(result_count_); } else { res_.SetRes(CmdRes::kErrOther, s_.ToString()); } @@ -1385,11 +1542,41 @@ void ZRemrangebyrankCmd::DoInitial() { } void ZRemrangebyrankCmd::Do() { - int32_t count = 0; + ele_deleted_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = 
std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok() || status.IsNotFound()) { + self->res_.AppendInteger(self->ele_deleted_); + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->ZRemrangebyrank(key_, static_cast(start_rank_), static_cast(stop_rank_), &count); + s_ = db_->storage()->ZRemrangebyrank(key_, static_cast(start_rank_), static_cast(stop_rank_), &ele_deleted_, callback); + + if (callback) { + return; + } + if (s_.ok() || s_.IsNotFound()) { - res_.AppendInteger(count); + res_.AppendInteger(ele_deleted_); } else { res_.SetRes(CmdRes::kErrOther, s_.ToString()); } @@ -1427,14 +1614,45 @@ void ZRemrangebyscoreCmd::Do() { res_.AppendContent(":0"); return; } - int32_t count = 0; + + deleted_count_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (!status.ok() && !status.IsNotFound()) { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } else { + self->res_.AppendInteger(self->deleted_count_); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->ZRemrangebyscore(key_, min_score_, max_score_, left_close_, right_close_, &count); + s_ = 
db_->storage()->ZRemrangebyscore(key_, min_score_, max_score_, left_close_, right_close_, &deleted_count_, callback); + + if (callback) { + return; + } + if (!s_.ok() && !s_.IsNotFound()) { res_.SetRes(CmdRes::kErrOther, s_.ToString()); return; } - res_.AppendInteger(count); + res_.AppendInteger(deleted_count_); } void ZRemrangebyscoreCmd::DoThroughDB() { @@ -1469,15 +1687,45 @@ void ZRemrangebylexCmd::Do() { res_.AppendContent("*0"); return; } - int32_t count = 0; + + deleted_count_ = 0; + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (!status.ok() && !status.IsNotFound()) { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } else { + self->res_.AppendInteger(self->deleted_count_); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } STAGE_TIMER_GUARD(storage_duration_ms, true); - s_ = db_->storage()->ZRemrangebylex(key_, min_member_, max_member_, left_close_, right_close_, &count); + s_ = db_->storage()->ZRemrangebylex(key_, min_member_, max_member_, left_close_, right_close_, &deleted_count_, callback); + + if (callback) { + return; + } + if (!s_.ok() && !s_.IsNotFound()) { res_.SetRes(CmdRes::kErrOther, s_.ToString()); return; } - res_.AppendInteger(count); + res_.AppendInteger(deleted_count_); } void ZRemrangebylexCmd::DoThroughDB() { @@ -1509,21 +1757,66 @@ void ZPopmaxCmd::DoInitial() { } void ZPopmaxCmd::Do() { + score_members_.clear(); + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = 
std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok() || status.IsNotFound()) { + char buf[32]; + int64_t len = 0; + self->res_.AppendArrayLenUint64(self->score_members_.size() * 2); + for (const auto& sm : self->score_members_) { + self->res_.AppendString(sm.member); + len = pstd::d2string(buf, sizeof(buf), sm.score); + self->res_.AppendStringLen(len); + self->res_.AppendContent(buf); + } + if (!self->score_members_.empty()) { + AddSlotKey("z", self->key_, self->db_); + } + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - std::vector score_members; - rocksdb::Status s = db_->storage()->ZPopMax(key_, count_, &score_members); - if (s.ok() || s.IsNotFound()) { + s_ = db_->storage()->ZPopMax(key_, count_, &score_members_, callback); + + if (callback) { + return; // Async mode, response will be sent in callback + } + + // Sync mode fallback + if (s_.ok() || s_.IsNotFound()) { char buf[32]; int64_t len = 0; - res_.AppendArrayLenUint64(score_members.size() * 2); - for (const auto& sm : score_members) { + res_.AppendArrayLenUint64(score_members_.size() * 2); + for (const auto& sm : score_members_) { res_.AppendString(sm.member); len = pstd::d2string(buf, sizeof(buf), sm.score); res_.AppendStringLen(len); res_.AppendContent(buf); } + if (!score_members_.empty()) { + AddSlotKey("z", key_, db_); + } } else { - res_.SetRes(CmdRes::kErrOther, s.ToString()); + res_.SetRes(CmdRes::kErrOther, s_.ToString()); } } @@ -1568,20 +1861,65 @@ void ZPopminCmd::DoUpdateCache(){ } void ZPopminCmd::Do() { + score_members_.clear(); + storage::CommitCallback callback = nullptr; + + if (ShouldUseAsyncMode()) { + auto self = 
std::static_pointer_cast(shared_from_this()); + auto resp_ptr = std::make_shared(); + auto pika_conn = std::dynamic_pointer_cast(GetConn()); + + if (!pika_conn) { + res_.SetRes(CmdRes::kErrOther, "Invalid connection"); + return; + } + + callback = [self, resp_ptr, pika_conn](rocksdb::Status status) { + if (status.ok() || status.IsNotFound()) { + char buf[32]; + int64_t len = 0; + self->res_.AppendArrayLenUint64(self->score_members_.size() * 2); + for (const auto& sm : self->score_members_) { + self->res_.AppendString(sm.member); + len = pstd::d2string(buf, sizeof(buf), sm.score); + self->res_.AppendStringLen(len); + self->res_.AppendContent(buf); + } + if (!self->score_members_.empty()) { + AddSlotKey("z", self->key_, self->db_); + } + } else { + self->res_.SetRes(CmdRes::kErrOther, status.ToString()); + } + + *resp_ptr = std::move(self->res_.message()); + pika_conn->WriteResp(*resp_ptr); + pika_conn->NotifyEpoll(true); + }; + } + STAGE_TIMER_GUARD(storage_duration_ms, true); - std::vector score_members; - rocksdb::Status s = db_->storage()->ZPopMin(key_, count_, &score_members); - if (s.ok() || s.IsNotFound()) { + s_ = db_->storage()->ZPopMin(key_, count_, &score_members_, callback); + + if (callback) { + return; // Async mode, response will be sent in callback + } + + // Sync mode fallback + if (s_.ok() || s_.IsNotFound()) { char buf[32]; int64_t len = 0; - res_.AppendArrayLenUint64(score_members.size() * 2); - for (const auto& sm : score_members) { + res_.AppendArrayLenUint64(score_members_.size() * 2); + for (const auto& sm : score_members_) { res_.AppendString(sm.member); len = pstd::d2string(buf, sizeof(buf), sm.score); res_.AppendStringLen(len); res_.AppendContent(buf); } + if (!score_members_.empty()) { + AddSlotKey("z", key_, db_); + } } else { - res_.SetRes(CmdRes::kErrOther, s.ToString()); + res_.SetRes(CmdRes::kErrOther, s_.ToString()); } } diff --git a/src/praft/CMakeLists.txt b/src/praft/CMakeLists.txt new file mode 100644 index 0000000000..5b60d23e26 
--- /dev/null +++ b/src/praft/CMakeLists.txt @@ -0,0 +1,116 @@ +cmake_minimum_required(VERSION 3.18) + +set(CMAKE_CXX_STANDARD 17) +project(praft) + +# Generate binlog protobuf files +set(BINLOG_PROTO_FILE ${CMAKE_CURRENT_SOURCE_DIR}/src/binlog.proto) +custom_protobuf_generate_cpp(BINLOG_PROTO_SRCS BINLOG_PROTO_HDRS ${BINLOG_PROTO_FILE}) +message("praft BINLOG_PROTO_SRCS = ${BINLOG_PROTO_SRCS}") +message("praft BINLOG_PROTO_HDRS = ${BINLOG_PROTO_HDRS}") + +# Create a separate binlog_pb library for reuse +add_library(binlog_pb STATIC ${BINLOG_PROTO_SRCS}) +target_include_directories(binlog_pb + PUBLIC ${CMAKE_CURRENT_BINARY_DIR} + PUBLIC ${INSTALL_INCLUDEDIR} +) +target_link_libraries(binlog_pb PUBLIC ${PROTOBUF_LIBRARY}) +add_dependencies(binlog_pb protobuf) + +# Collect all source files (excluding binlog.proto which is in binlog_pb) +aux_source_directory(./src DIR_SRCS) + +# Create static library (WITHOUT binlog protobuf sources, link binlog_pb instead) +add_library(praft STATIC ${DIR_SRCS}) + +# Dependencies +add_dependencies(praft + gflags + protobuf + leveldb + brpc + braft + glog + pstd + storage + binlog_pb + pika_proto_gen +) + +# Include directories +target_include_directories(praft + PUBLIC ${PROJECT_SOURCE_DIR} + PUBLIC ${PROJECT_SOURCE_DIR}/include + PUBLIC ${CMAKE_SOURCE_DIR} # Project root for include/pika_*.h + PUBLIC ${CMAKE_SOURCE_DIR}/src # For other modules + PUBLIC ${CMAKE_SOURCE_DIR}/src/storage/include # For storage module + PUBLIC ${CMAKE_SOURCE_DIR}/src/pstd/include # For pstd module + PUBLIC ${CMAKE_BINARY_DIR} # For generated protobuf files (pika_inner_message.pb.h) + PUBLIC ${CMAKE_CURRENT_BINARY_DIR} # For generated binlog.pb.h + PUBLIC ${INSTALL_INCLUDEDIR} +) + +# Link libraries +target_link_libraries(praft + PUBLIC binlog_pb # Link the binlog protobuf library + PUBLIC ${BRAFT_LIBRARY} + PUBLIC ${BRPC_LIBRARY} + PUBLIC ${LEVELDB_LIBRARY} + PUBLIC ${PROTOBUF_LIBRARY} + PUBLIC ${GFLAGS_LIBRARY} + PUBLIC ${GLOG_LIBRARY} + PUBLIC pstd + 
PUBLIC storage +) + +# Platform-specific libraries +if(CMAKE_SYSTEM_NAME MATCHES "Darwin") + # macOS frameworks + target_link_libraries(praft + PUBLIC "-framework CoreFoundation" + PUBLIC "-framework CoreGraphics" + PUBLIC "-framework CoreData" + PUBLIC "-framework CoreText" + PUBLIC "-framework Security" + PUBLIC "-framework Foundation" + PUBLIC "-framework ApplicationServices" + PUBLIC "-framework SystemConfiguration" + PUBLIC "-framework AppKit" + ) + + # OpenSSL + find_package(OpenSSL REQUIRED) + if(OPENSSL_FOUND) + target_include_directories(praft PUBLIC ${OPENSSL_INCLUDE_DIR}) + target_link_libraries(praft PUBLIC ${OPENSSL_LIBRARIES}) + else() + find_library(OPENSSL_CRYPTO_LIBRARY NAMES crypto libcrypto) + find_library(OPENSSL_SSL_LIBRARY NAMES ssl libssl) + if(OPENSSL_CRYPTO_LIBRARY) + target_link_libraries(praft PUBLIC ${OPENSSL_CRYPTO_LIBRARY}) + endif() + if(OPENSSL_SSL_LIBRARY) + target_link_libraries(praft PUBLIC ${OPENSSL_SSL_LIBRARY}) + endif() + endif() +elseif(CMAKE_SYSTEM_NAME MATCHES "Linux") + # Linux system libraries + target_link_libraries(praft + PUBLIC rt + PUBLIC dl + ) + + # OpenSSL + find_package(OpenSSL) + if(OPENSSL_FOUND) + target_link_libraries(praft PUBLIC OpenSSL::SSL OpenSSL::Crypto) + else() + find_library(OPENSSL_CRYPTO_LIBRARY NAMES crypto libcrypto) + find_library(OPENSSL_SSL_LIBRARY NAMES ssl libssl) + if(OPENSSL_CRYPTO_LIBRARY AND OPENSSL_SSL_LIBRARY) + target_link_libraries(praft PUBLIC ${OPENSSL_SSL_LIBRARY} ${OPENSSL_CRYPTO_LIBRARY}) + endif() + endif() +endif() + diff --git a/src/praft/include/praft/praft.h b/src/praft/include/praft/praft.h new file mode 100644 index 0000000000..a637a69dbb --- /dev/null +++ b/src/praft/include/praft/praft.h @@ -0,0 +1,223 @@ +// Copyright (c) 2015-present, Qihoo, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef PRAFT_PRAFT_H_ +#define PRAFT_PRAFT_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "braft/raft.h" +#include "braft/storage.h" +#include "braft/util.h" +#include "braft/file_system_adaptor.h" +#include "pstd/include/pstd_mutex.h" +#include "pstd/include/pstd_status.h" +#include "rocksdb/status.h" +#include "storage/batch.h" + +class PikaServer; +class Cmd; + +// Forward declarations +namespace storage { +class Storage; +} + +// Forward declarations +namespace pikiwidb { +class Binlog; +} + +namespace net { +class NetConn; +} + +namespace pika_raft { + +// Raft log entry data structure +// Write done closure for asynchronous Raft callback +class WriteDoneClosure : public braft::Closure { + public: + WriteDoneClosure() = default; + ~WriteDoneClosure() override = default; + + void Run() override; + + // Set promise for synchronous Raft apply (used in Follower on_apply) + void SetPromise(std::shared_ptr> p) { + promise_ = p; + } + + // Set callback for async response (used in Leader) + void SetCallback(storage::CommitCallback callback) { + callback_ = callback; + } + + private: + // For synchronous mode (Follower) + std::shared_ptr> promise_; + + // For asynchronous mode (Leader) + storage::CommitCallback callback_; +}; + +// Pika state machine implementation +class PikaStateMachine : public braft::StateMachine { + public: + PikaStateMachine(); + ~PikaStateMachine() override = default; + + void SetLeaderTerm(std::atomic* leader_term); + + // Apply committed log entry + void on_apply(braft::Iterator& iter) override; + + // Save snapshot + void on_snapshot_save(braft::SnapshotWriter* writer, braft::Closure* done) override; + + // Load snapshot + int on_snapshot_load(braft::SnapshotReader* reader) override; + + // Leadership changed callback + void on_leader_start(int64_t term) override; + void on_leader_stop(const 
butil::Status& status) override; + + // Error callback + void on_error(const ::braft::Error& e) override; + + // Configuration changed callback + void on_configuration_committed(const ::braft::Configuration& conf) override; + + void on_start_following(const ::braft::LeaderChangeContext& ctx) override; + void on_stop_following(const ::braft::LeaderChangeContext& ctx) override; + + private: + std::atomic is_node_first_start_up_{true}; // 标记节点是否首次启动 + std::atomic* leader_term_{nullptr}; +}; + +// Raft node wrapper +class PikaRaftNode { + public: + PikaRaftNode(const std::string& group_id, const braft::PeerId& peer_id); + ~PikaRaftNode(); + + // Initialize the Raft node + pstd::Status Init(const std::vector& peers); + + // Start the Raft node + pstd::Status Start(); + + // Shutdown the Raft node + void Shutdown(); + + // Check if this node is leader + bool IsLeader() const; + + // Get leader peer ID + braft::PeerId GetLeaderId(); + + // Add peer to the cluster + pstd::Status AddPeer(const braft::PeerId& peer); + + // Remove peer from the cluster + pstd::Status RemovePeer(const braft::PeerId& peer); + + // Get cluster status information + void GetStatus(std::string* status_str); + + void GetLeaderLeaseStatus(braft::LeaderLeaseStatus* status) const; + + // Trigger a snapshot creation + pstd::Status DoSnapshot(int64_t self_snapshot_index = 0, bool is_sync = true); + + braft::Node* GetRaftNode() { return node_.get(); } + + private: + std::string group_id_; + braft::PeerId peer_id_; + std::unique_ptr server_; // brpc server for Raft RPC + std::unique_ptr node_; + std::unique_ptr state_machine_; + + // Raft data paths + std::string raft_data_dir_; + std::string raft_log_uri_; + std::string raft_meta_uri_; + std::string raft_snapshot_uri_; + + // Snapshot adaptor + scoped_refptr snapshot_adaptor_; + std::atomic leader_term_{-1}; +}; + +// Raft cluster manager +class RaftManager { + public: + RaftManager(); + ~RaftManager(); + + // Initialize the Raft manager + pstd::Status 
Init(); + + // Start the Raft manager + pstd::Status Start(); + + // Shutdown the Raft manager + void Shutdown(); + + // Initialize a new Raft cluster + pstd::Status InitCluster(const std::string& db_name, const std::vector& peers); + + // Add a node to the cluster + pstd::Status AddNode(const std::string& db_name, const std::string& peer_addr); + + // Remove a node from the cluster + pstd::Status RemoveNode(const std::string& db_name, const std::string& peer_addr); + + // Get cluster information + pstd::Status GetClusterInfo(const std::string& db_name, std::string* info); + + // Append binlog (supports both sync and async modes) + // Sync mode: pass promise, async mode: pass callback + void AppendLog(const std::string& db_name, + const ::pikiwidb::Binlog& log, + std::promise&& promise, + storage::CommitCallback callback = nullptr); + + // Get Raft node for a specific DB + std::shared_ptr GetRaftNode(const std::string& db_name); + + // Apply binlog entry to storage (public for PikaStateMachine to call) + rocksdb::Status ApplyBinlogEntry(const ::pikiwidb::Binlog& binlog, uint64_t log_index = 0); + + private: + std::atomic initialized_; + std::atomic running_; + + // Configuration + int election_timeout_ms_; + int snapshot_interval_s_; + std::string group_id_; + + // Raft nodes for each database + mutable std::shared_mutex nodes_mutex_; + std::unordered_map> raft_nodes_; + + // Helper methods + pstd::Status CreateRaftNode(const std::string& db_name, const std::vector& peers); + braft::PeerId ParsePeerId(const std::string& peer_str); +}; + +} // namespace pika_raft + +#endif // PRAFT_PRAFT_H_ diff --git a/src/praft/include/praft/psnapshot.h b/src/praft/include/praft/psnapshot.h new file mode 100644 index 0000000000..ecd6da0530 --- /dev/null +++ b/src/praft/include/praft/psnapshot.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2024-present, Qihoo, Inc. All rights reserved. 
+ * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. An additional grant + * of patent rights can be found in the PATENTS file in the same directory. + */ + +#pragma once + +#include + +#include "braft/file_system_adaptor.h" +#include "braft/macros.h" +#include "braft/snapshot.h" + +#define PRAFT_SNAPSHOT_META_FILE "__raft_snapshot_meta" +#define PRAFT_SNAPSHOT_PATH "snapshot/snapshot_" +#define IS_RDONLY 0x01 + +// 自定义文件系统适配器,用于Braft快照生成 +class PPosixFileSystemAdaptor : public braft::PosixFileSystemAdaptor { +public: + PPosixFileSystemAdaptor() {} + ~PPosixFileSystemAdaptor() {} + + braft::FileAdaptor* open(const std::string& path, int oflag, const ::google::protobuf::Message* file_meta, + butil::File::Error* e) override; + + void AddAllFiles(const std::string& dir, braft::LocalSnapshotMetaTable* snapshot_meta_memtable, + const std::string& base_path); + +private: + braft::raft_mutex_t mutex_; +}; diff --git a/src/praft/src/binlog.proto b/src/praft/src/binlog.proto new file mode 100644 index 0000000000..3cb2d9a6bc --- /dev/null +++ b/src/praft/src/binlog.proto @@ -0,0 +1,42 @@ +// Copyright (c) 2024-present, Qihoo, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +syntax = "proto3"; +package pikiwidb; + +option optimize_for = LITE_RUNTIME; + +// 操作类型 +enum OperateType { + kNoOperate = 0; + kPut = 1; + kDelete = 2; +} + +// 数据类型(与 storage::DataType 对应) +enum DataType { + kAll = 0; + kStrings = 1; + kHashes = 2; + kLists = 3; + kZSets = 4; + kSets = 5; + kStreams = 6; +} + +// Binlog 条目(对应单个 RocksDB 操作) +message BinlogEntry { + DataType data_type = 1; // 数据类型(用于定位对应的 RocksDB 实例) + uint32 cf_idx = 2; // 列族索引 (column family index, 0=meta/default, 1=data) + OperateType op_type = 3; // 操作类型 + bytes key = 4; // 已编码的 key + optional bytes value = 5; // 已编码的 value(包含 TTL、version 等) +} + +// Binlog(对应一次 Raft 日志提交) +message Binlog { + uint32 db_id = 1; // 数据库 ID + uint32 slot_idx = 2; // 槽位索引(预留) + repeated BinlogEntry entries = 3; // 批量操作条目 +} diff --git a/src/praft/src/praft.cc b/src/praft/src/praft.cc new file mode 100644 index 0000000000..75e116f995 --- /dev/null +++ b/src/praft/src/praft.cc @@ -0,0 +1,802 @@ +// Copyright (c) 2015-present, Qihoo, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "praft/praft.h" + +#include +#include +#include +#include +#include + +#include + +#include "braft/configuration.h" +#include "braft/raft.h" +#include "braft/repeated_timer_task.h" +#include "brpc/server.h" +#include "brpc/closure_guard.h" +#include "include/pika_conf.h" +#include "include/pika_server.h" +#include "include/pika_command.h" +#include "include/pika_client_conn.h" +#include "binlog.pb.h" +#include "storage/storage.h" +#include "storage/batch.h" +#include "pstd/include/env.h" +#include "praft/psnapshot.h" +#include "binlog.pb.h" + +DECLARE_bool(raft_enable_leader_lease); + +extern std::unique_ptr g_pika_conf; +extern std::unique_ptr g_pika_server; + +namespace pika_raft { + +// PikaStateMachine implementation +PikaStateMachine::PikaStateMachine() {} + +void PikaStateMachine::SetLeaderTerm(std::atomic* leader_term) { + leader_term_ = leader_term +} + +void PikaStateMachine::on_apply(braft::Iterator& iter) { + for (; iter.valid(); iter.next()) { + auto done = iter.done(); + brpc::ClosureGuard done_guard(done); + + int64_t index = iter.index(); + + if (!g_pika_server || !g_pika_server->GetRaftManager()) { + // Run closure asynchronously in bthread to avoid blocking on_apply + if (done) { + braft::run_closure_in_bthread(done_guard.release()); + } + continue; + } + + pikiwidb::Binlog binlog; + butil::IOBufAsZeroCopyInputStream wrapper(iter.data()); + if (!binlog.ParseFromZeroCopyStream(&wrapper)) { + if (done) { + done->status().set_error(EINVAL, "Failed to parse binlog"); + } + if (done) { + braft::run_closure_in_bthread(done_guard.release()); + } + continue; + } + + // Apply binlog with log index for tracking + rocksdb::Status apply_status = g_pika_server->GetRaftManager()->ApplyBinlogEntry(binlog, index); + + if (done) { + if (apply_status.ok()) { + done->status().set_error(0, "OK"); + } else { + done->status().set_error(-1, "%s", apply_status.ToString().c_str()); + LOG(ERROR) << "Apply binlog failed: " << apply_status.ToString(); + } + } + + 
+ // Run closure asynchronously in bthread to avoid blocking on_apply + if (done) { + braft::run_closure_in_bthread(done_guard.release()); + } + } +} + +void PikaStateMachine::on_snapshot_save(braft::SnapshotWriter* writer, braft::Closure* done) { + brpc::ClosureGuard done_guard(done); +} + +int PikaStateMachine::on_snapshot_load(braft::SnapshotReader* reader) { + if (!reader) { + LOG(ERROR) << "SnapshotReader is null"; + return -1; + } + + if (!g_pika_server || !g_pika_server->GetRaftManager()) { + LOG(ERROR) << "PikaServer or RaftManager is not initialized"; + return -1; + } + + // 首次启动时的处理 + if (is_node_first_start_up_.load()) { + /* + * 场景分析: + * 1. 正常关机后重启:所有内存数据已刷盘,快照已截断到最新位置, + * 此时 flush-index 和 apply-index 相同,应该获取最大日志索引 + * 2. 异常宕机后重启:部分数据未刷盘,应该获取最小 flush-index 作为 + * 故障恢复的起点 + */ + std::string db_name = "db0"; + auto db = g_pika_server->GetDB(db_name); + if (db && db->storage()) { + uint64_t replay_point = db->storage()->GetSmallestFlushedLogIndex(); + LOG(INFO) << "Node first start, detected replay_point: " << replay_point; + } + + is_node_first_start_up_.store(false); + + /* + * 如果节点刚加入集群且没有任何数据,启动时不会加载本地快照。 + * 因此需要在从 Leader 加载快照后,执行数据恢复。 + * + * 如果有本地日志数据(不是空节点),直接返回,从 replay_point 开始回放日志 + */ + uint64_t last_log_index = 0; + if (auto raft_mgr = g_pika_server->GetRaftManager()) { + auto raft_node = raft_mgr->GetRaftNode(db_name); + if (raft_node && raft_node->GetRaftNode()) { + braft::NodeStatus status; + raft_node->GetRaftNode()->get_status(&status); + last_log_index = status.last_index; + } + } + if (last_log_index != 0) { + LOG(INFO) << "Node has local data, last_log_index: " << last_log_index + << ", will replay from existing data"; + return 0; + } + } + + /* + * 安装快照: + * 1. 新节点加入集群(无本地数据) + * 2. 
Follower 落后太多,Leader 主动推送快照 + */ + std::string reader_path = reader->get_path(); + LOG(INFO) << "Loading snapshot from: " << reader_path; + + std::string db_name = "db0"; + auto db = g_pika_server->GetDB(db_name); + if (!db) { + LOG(ERROR) << "Failed to get DB: " << db_name; + return -1; + } + + auto storage = db->storage(); + if (!storage) { + LOG(ERROR) << "Storage is null for DB: " << db_name; + return -1; + } + + std::set dbs{db_name}; + TaskArg task(TaskType::kLoadDBFromCheckpoint, {reader_path}); + auto status = g_pika_server->DoSameThingSpecificDB(dbs, task); + if (!status.ok()) { + LOG(ERROR) << "Failed to load snapshot into DB: " << status.ToString(); + return -1; + } + + LOG(INFO) << "Snapshot load completed from: " << reader_path; + return 0; +} + +void PikaStateMachine::on_leader_start(int64_t term) { + if (leader_term_) { + leader_term_->store(term, std::memory_order_release); + } +} + +void PikaStateMachine::on_leader_stop(const butil::Status& status) { + if (leader_term_) { + leader_term_->store(-1, std::memory_order_release); + } +} + +void PikaStateMachine::on_error(const ::braft::Error& e) { + // Error occurred +} + +void PikaStateMachine::on_configuration_committed(const ::braft::Configuration& conf) { + // Configuration committed +} + +void PikaStateMachine::on_start_following(const ::braft::LeaderChangeContext& ctx) { + // Start following +} + +void PikaStateMachine::on_stop_following(const ::braft::LeaderChangeContext& ctx) { + // Stop following +} + +// PikaRaftNode implementation +PikaRaftNode::PikaRaftNode(const std::string& group_id, const braft::PeerId& peer_id) + : group_id_(group_id), peer_id_(peer_id) { + // Setup Raft data directories + raft_data_dir_ = g_pika_conf->db_path() + "/raft/" + group_id; + raft_log_uri_ = "local://" + raft_data_dir_ + "/log"; + raft_meta_uri_ = "local://" + raft_data_dir_ + "/raft_meta"; + raft_snapshot_uri_ = "local://" + raft_data_dir_ + "/snapshot"; +} + +PikaRaftNode::~PikaRaftNode() { + Shutdown(); +} 
+ +pstd::Status PikaRaftNode::Init(const std::vector& peers) { + // Create state machine + state_machine_ = std::make_unique(); + state_machine_->SetLeaderTerm(&leader_term_); + + // Create and start brpc server for Raft RPC + server_ = std::make_unique(); + + // Add Raft service to brpc server + if (braft::add_service(server_.get(), peer_id_.addr) != 0) { + LOG(ERROR) << "Failed to add Raft service to brpc server"; + return pstd::Status::Corruption("Failed to add Raft service"); + } + + // Start brpc server + if (server_->Start(peer_id_.addr.port, nullptr) != 0) { + LOG(ERROR) << "Failed to start brpc server on " << peer_id_.addr; + return pstd::Status::Corruption("Failed to start brpc server"); + } + + LOG(INFO) << "brpc server started on " << peer_id_.addr; + + // Setup Raft node options + braft::NodeOptions node_options; + + // Set initial configuration + for (const auto& peer : peers) { + node_options.initial_conf.add_peer(peer); + } + + // Set file system paths + node_options.log_uri = raft_log_uri_; + node_options.raft_meta_uri = raft_meta_uri_; + node_options.snapshot_uri = raft_snapshot_uri_; + + // Set state machine + node_options.fsm = state_machine_.get(); + + // Set election timeout + node_options.election_timeout_ms = g_pika_conf ? g_pika_conf->raft_election_timeout_ms() : 1000; + + // Set snapshot interval + node_options.snapshot_interval_s = g_pika_conf ? 
g_pika_conf->raft_snapshot_interval_s() : 3600; + + // Initialize custom snapshot adaptor + snapshot_adaptor_ = new PPosixFileSystemAdaptor(); + node_options.snapshot_file_system_adaptor = &snapshot_adaptor_; + + // Create and initialize Raft node + node_ = std::make_unique(group_id_, peer_id_); + + if (node_->init(node_options) != 0) { + LOG(ERROR) << "Failed to init Raft node"; + return pstd::Status::Corruption("Failed to init Raft node"); + } + + FLAGS_raft_enable_leader_lease = true; + + return pstd::Status::OK(); +} + +pstd::Status PikaRaftNode::Start() { + if (!node_) { + return pstd::Status::Corruption("Raft node not initialized"); + } + + return pstd::Status::OK(); +} + +void PikaRaftNode::Shutdown() { + if (node_) { + node_->shutdown(nullptr); + node_->join(); + node_.reset(); + } + + if (server_) { + server_->Stop(0); + server_->Join(); + server_.reset(); + } +} + +bool PikaRaftNode::IsLeader() const { + if (!node_) return false; + + braft::LeaderLeaseStatus lease_status; + node_->get_leader_lease_status(&lease_status); + + auto current_term = leader_term_.load(std::memory_order_acquire); + return current_term > 0 && current_term == lease_status.term && + lease_status.state == braft::LeaseState::LEASE_VALID; +} + +braft::PeerId PikaRaftNode::GetLeaderId() { + if (!node_) return braft::PeerId(); + return node_->leader_id(); +} + +void PikaRaftNode::GetLeaderLeaseStatus(braft::LeaderLeaseStatus* status) const { + if (!node_ || !status) { + return; + } + node_->get_leader_lease_status(status); +} + +pstd::Status PikaRaftNode::AddPeer(const braft::PeerId& peer) { + if (!node_) { + return pstd::Status::Corruption("Raft node not initialized"); + } + + // Check if current node is leader + // Member changes must be initiated by the leader + if (!IsLeader()) { + braft::PeerId leader = GetLeaderId(); + std::string error_msg = "Not leader. 
Current leader is: " + leader.to_string(); + LOG(WARNING) << "AddPeer failed: " << error_msg; + return pstd::Status::Corruption(error_msg); + } + + braft::SynchronizedClosure done; + node_->add_peer(peer, &done); + done.wait(); + + if (!done.status().ok()) { + return pstd::Status::Corruption("Failed to add peer: " + done.status().error_str()); + } + + return pstd::Status::OK(); +} + +pstd::Status PikaRaftNode::RemovePeer(const braft::PeerId& peer) { + if (!node_) { + return pstd::Status::Corruption("Raft node not initialized"); + } + + // Check if current node is leader + // Member changes must be initiated by the leader + if (!IsLeader()) { + braft::PeerId leader = GetLeaderId(); + std::string error_msg = "Not leader. Current leader is: " + leader.to_string(); + LOG(WARNING) << "RemovePeer failed: " << error_msg; + return pstd::Status::Corruption(error_msg); + } + + braft::SynchronizedClosure done; + node_->remove_peer(peer, &done); + done.wait(); + + if (!done.status().ok()) { + return pstd::Status::Corruption("Failed to remove peer: " + done.status().error_str()); + } + + return pstd::Status::OK(); +} + +void PikaRaftNode::GetStatus(std::string* status_str) { + if (!node_) { + *status_str = "Raft node not initialized"; + return; + } + + braft::NodeStatus status; + node_->get_status(&status); + + std::ostringstream oss; + oss << "Group: " << group_id_ << "\n"; + oss << "PeerId: " << peer_id_.to_string() << "\n"; + oss << "State: " << (IsLeader() ? 
"LEADER" : "FOLLOWER") << "\n"; + oss << "Leader: " << status.leader_id.to_string() << "\n"; + oss << "Term: " << status.term << "\n"; + + // Try to list cluster members + std::vector peers; + butil::Status st = node_->list_peers(&peers); + + if (st.ok() && !peers.empty()) { + oss << "Cluster Members (" << peers.size() << "): "; + for (size_t i = 0; i < peers.size(); i++) { + oss << peers[i].to_string(); + if (i < peers.size() - 1) { + oss << ", "; + } + } + oss << "\n"; + } else { + // For Follower nodes, list_peers() may not work, suggest querying Leader + if (!IsLeader()) { + oss << "Cluster Members: Query leader at " << status.leader_id.to_string() + << " for full member list\n"; + } else { + oss << "Cluster Members: Unable to retrieve\n"; + } + } + + oss << "Committed Index: " << status.committed_index << "\n"; + oss << "Known Applied Index: " << status.known_applied_index << "\n"; + oss << "Pending Index: " << status.pending_index << "\n"; + oss << "Pending Queue Size: " << status.pending_queue_size << "\n"; + oss << "Applying Index: " << status.applying_index << "\n"; + oss << "First Index: " << status.first_index << "\n"; + oss << "Last Index: " << status.last_index << "\n"; + + *status_str = oss.str(); +} + +pstd::Status PikaRaftNode::DoSnapshot(int64_t self_snapshot_index, bool is_sync) { + if (!node_) { + return pstd::Status::Corruption("Raft node not initialized"); + } + + if (is_sync) { + braft::SynchronizedClosure done; + node_->snapshot(&done); + done.wait(); + + if (!done.status().ok()) { + return pstd::Status::Corruption("Failed to create snapshot: " + done.status().error_str()); + } + } else { + node_->snapshot(nullptr); + } + + return pstd::Status::OK(); +} + +// RaftManager implementation +RaftManager::RaftManager() + : initialized_(false), + running_(false), + election_timeout_ms_(1000), + snapshot_interval_s_(3600) { +} + +RaftManager::~RaftManager() { + Shutdown(); +} + +pstd::Status RaftManager::Init() { + if (initialized_.load()) { + return 
pstd::Status::OK(); + } + + // Load configuration + election_timeout_ms_ = g_pika_conf->raft_election_timeout_ms(); + snapshot_interval_s_ = g_pika_conf->raft_snapshot_interval_s(); + group_id_ = g_pika_conf->raft_group_id(); + + LOG(INFO) << "Initializing Raft manager with group_id: " << group_id_ + << ", election_timeout: " << election_timeout_ms_ << "ms" + << ", snapshot_interval: " << snapshot_interval_s_ << "s"; + + // Check if Raft metadata directory exists (node was previously in a cluster) + std::string raft_meta_dir = g_pika_conf->db_path() + "/raft/" + group_id_ + "_db0/raft_meta"; + bool raft_meta_exists = pstd::FileExists(raft_meta_dir); + + if (raft_meta_exists) { + // Node was previously in a cluster, restore from persisted metadata + LOG(INFO) << "Raft metadata directory exists, node was previously in cluster. Restoring from persisted configuration..."; + + // When raft_meta exists, braft will ignore initial_conf and load configuration from raft_meta + // We pass an empty peer list as it will be ignored anyway + std::vector empty_peer_list; + pstd::Status status = InitCluster("db0", empty_peer_list); + if (!status.ok()) { + LOG(ERROR) << "Failed to restore Raft node from metadata: " << status.ToString(); + } else { + LOG(INFO) << "Raft node restored successfully from persisted configuration"; + } + } else { + // First time startup - no raft_meta found + // Do not auto-initialize, require manual initialization via RAFT.CLUSTER INIT command + LOG(INFO) << "No existing Raft metadata found."; + LOG(INFO) << "This is the first time starting Raft on this node."; + LOG(INFO) << "Please initialize the cluster manually using: RAFT.CLUSTER INIT [peers]"; + LOG(INFO) << " - For single-node cluster: RAFT.CLUSTER INIT"; + LOG(INFO) << " - For multi-node cluster: RAFT.CLUSTER INIT ,,..."; + } + + initialized_.store(true); + return pstd::Status::OK(); +} + +pstd::Status RaftManager::Start() { + if (!initialized_.load()) { + return 
pstd::Status::Corruption("RaftManager not initialized"); + } + + if (running_.load()) { + return pstd::Status::OK(); + } + + LOG(INFO) << "Starting Raft manager"; + + // Start all Raft nodes + std::shared_lock lock(nodes_mutex_); + for (auto& pair : raft_nodes_) { + auto status = pair.second->Start(); + if (!status.ok()) { + LOG(ERROR) << "Failed to start Raft node for DB: " << pair.first; + return status; + } + } + + running_.store(true); + LOG(INFO) << "Raft manager started successfully"; + return pstd::Status::OK(); +} + +void RaftManager::Shutdown() { + if (!running_.load()) { + return; + } + + LOG(INFO) << "Shutting down Raft manager"; + + // Shutdown all Raft nodes + std::unique_lock lock(nodes_mutex_); + for (auto& pair : raft_nodes_) { + pair.second->Shutdown(); + } + raft_nodes_.clear(); + + running_.store(false); + initialized_.store(false); + LOG(INFO) << "Raft manager shutdown complete"; +} + +pstd::Status RaftManager::InitCluster(const std::string& db_name, + const std::vector& peers) { + std::vector peer_ids; + for (const auto& peer_str : peers) { + braft::PeerId peer_id = ParsePeerId(peer_str); + if (peer_id.is_empty()) { + return pstd::Status::Corruption("Invalid peer address: " + peer_str); + } + peer_ids.push_back(peer_id); + } + + return CreateRaftNode(db_name, peer_ids); +} + + +pstd::Status RaftManager::AddNode(const std::string& db_name, + const std::string& peer_addr) { + auto node = GetRaftNode(db_name); + if (!node) { + return pstd::Status::Corruption("Raft node not found for DB: " + db_name); + } + + braft::PeerId peer_id = ParsePeerId(peer_addr); + if (peer_id.is_empty()) { + return pstd::Status::Corruption("Invalid peer address: " + peer_addr); + } + + return node->AddPeer(peer_id); +} + +pstd::Status RaftManager::RemoveNode(const std::string& db_name, + const std::string& peer_addr) { + auto node = GetRaftNode(db_name); + if (!node) { + return pstd::Status::Corruption("Raft node not found for DB: " + db_name); + } + + braft::PeerId 
peer_id = ParsePeerId(peer_addr); + + if (peer_id.is_empty()) { + return pstd::Status::Corruption("Invalid peer address: " + peer_addr); + } + + return node->RemovePeer(peer_id); +} + +pstd::Status RaftManager::GetClusterInfo(const std::string& db_name, + std::string* info) { + auto node = GetRaftNode(db_name); + + if (!node) { + return pstd::Status::Corruption("Raft node not found for DB: " + db_name); + } + + node->GetStatus(info); + return pstd::Status::OK(); +} + +std::shared_ptr RaftManager::GetRaftNode(const std::string& db_name) { + std::shared_lock lock(nodes_mutex_); + auto it = raft_nodes_.find(db_name); + if (it != raft_nodes_.end()) { + return it->second; + } + return nullptr; +} + +pstd::Status RaftManager::CreateRaftNode(const std::string& db_name, + const std::vector& peers) { + std::unique_lock lock(nodes_mutex_); + + // Check if node already exists + if (raft_nodes_.find(db_name) != raft_nodes_.end()) { + return pstd::Status::Corruption("Raft node already exists for DB: " + db_name); + } + + // Determine the Raft port for this node + // Raft uses Pika port + 3000 + int raft_port = g_pika_conf->port() + 3000; + + // Find the peer address from the peers list that matches our Raft port + // This allows the user to specify the exact address in RAFT.CLUSTER INIT command + braft::PeerId peer_id; + bool found = false; + + for (const auto& peer : peers) { + if (peer.addr.port == raft_port) { + peer_id = peer; + found = true; + LOG(INFO) << "Found matching peer address in cluster config: " << peer.to_string(); + break; + } + } + + // If no matching peer found, return error + if (!found) { + std::string error_msg = "No matching peer address found in cluster config for Raft port " + + std::to_string(raft_port) + + ". Please include this node's address (with port " + + std::to_string(raft_port) + + ") in the RAFT.CLUSTER INIT command. 
" + + "Example: RAFT.CLUSTER INIT :" + std::to_string(raft_port) + + ",:,..."; + LOG(ERROR) << error_msg; + return pstd::Status::Corruption(error_msg); + } + + LOG(INFO) << "Creating Raft node for DB: " << db_name << " with address: " << peer_id.to_string(); + + // Create Raft node + auto node = std::make_shared(group_id_ + "_" + db_name, peer_id); + + // Initialize node + auto status = node->Init(peers); + if (!status.ok()) { + return status; + } + + // Start node if manager is running + if (running_.load()) { + status = node->Start(); + if (!status.ok()) { + return status; + } + } + + // Store node + raft_nodes_[db_name] = node; + + LOG(INFO) << "Created Raft node for DB: " << db_name; + return pstd::Status::OK(); +} + +braft::PeerId RaftManager::ParsePeerId(const std::string& peer_str) { + braft::PeerId peer_id; + if (peer_id.parse(peer_str) != 0) { + return braft::PeerId(); + } + return peer_id; +} + +// WriteDoneClosure implementation + +void WriteDoneClosure::Run() { + std::unique_ptr self_guard(this); + + // If promise is set, notify the waiting thread (synchronous mode) + if (promise_) { + if (status().ok()) { + promise_->set_value(rocksdb::Status::OK()); + } else { + promise_->set_value(rocksdb::Status::IOError(status().error_str())); + } + return; + } + + // If callback is set, call it (asynchronous mode for Leader) + if (callback_) { + rocksdb::Status s; + if (status().ok()) { + s = rocksdb::Status::OK(); + } else { + s = rocksdb::Status::IOError(status().error_str()); + } + // Call callback with status only (result is captured in lambda) + callback_(s); + return; + } + + // Legacy path for closures without promise or callback + if (!status().ok()) { + LOG(WARNING) << "Raft operation failed: " << status().error_str(); + } +} + +void RaftManager::AppendLog(const std::string& db_name, + const ::pikiwidb::Binlog& log, + std::promise&& promise, + storage::CommitCallback callback) { + auto node = GetRaftNode(db_name); + if (!node) { + LOG(ERROR) << "Raft node 
not found for DB: " << db_name; + if (callback) { + callback(rocksdb::Status::NotFound("Raft node not found")); + } else { + promise.set_value(rocksdb::Status::NotFound("Raft node not found")); + } + return; + } + + if (!node->IsLeader()) { + braft::PeerId leader = node->GetLeaderId(); + LOG(WARNING) << "Current node is not leader for DB: " << db_name + << ", leader: " << leader.to_string(); + if (callback) { + callback(rocksdb::Status::Incomplete("Not leader, leader is: " + leader.to_string())); + } else { + promise.set_value(rocksdb::Status::Incomplete("Not leader")); + } + return; + } + + auto* done = new WriteDoneClosure(); + + if (callback) { + done->SetCallback(callback); + } else { + done->SetPromise(std::make_shared>(std::move(promise))); + } + + butil::IOBuf data; + butil::IOBufAsZeroCopyOutputStream wrapper(&data); + if (!log.SerializeToZeroCopyStream(&wrapper)) { + done->status().set_error(-1, "Failed to serialize binlog"); + done->Run(); + return; + } + + braft::Task task; + task.data = &data; + task.done = done; + + node->GetRaftNode()->apply(task); +} + +rocksdb::Status RaftManager::ApplyBinlogEntry(const ::pikiwidb::Binlog& binlog, uint64_t log_index) { + std::string db_name = "db0"; + + auto db = g_pika_server->GetDB(db_name); + if (!db) { + LOG(ERROR) << "Failed to get DB: " << db_name; + return rocksdb::Status::NotFound("DB not found: " + db_name); + } + + auto storage = db->storage(); + if (!storage) { + LOG(ERROR) << "Storage is null for DB: " << db_name; + return rocksdb::Status::InvalidArgument("Storage is null"); + } + + // Pass log_index to storage layer for tracking + auto status = storage->OnBinlogWrite(binlog, log_index); + + if (!status.ok()) { + LOG(ERROR) << "Failed to apply binlog to " << db_name << " at log_index " << log_index + << ": " << status.ToString(); + } + + return status; +} + +} // namespace pika_raft diff --git a/src/praft/src/psnapshot.cc b/src/praft/src/psnapshot.cc new file mode 100644 index 0000000000..953d456fae --- 
/dev/null +++ b/src/praft/src/psnapshot.cc @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2024-present, Qihoo, Inc. All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. An additional grant + * of patent rights can be found in the PATENTS file in the same directory. + */ + +#include "praft/psnapshot.h" + +#include +#include +#include +#include + +#include "braft/local_file_meta.pb.h" +#include "butil/files/file_path.h" +#include "include/pika_conf.h" +#include "include/pika_server.h" +#include "praft/praft.h" +#include "storage/storage.h" +#include "storage/backupable.h" + +extern std::unique_ptr g_pika_conf; +extern std::unique_ptr g_pika_server; + +static bool IsDirectory(const std::string& path) { + struct stat st; + if (stat(path.c_str(), &st) != 0) { + return false; + } + return S_ISDIR(st.st_mode); +} + +static bool IsRegularFile(const std::string& path) { + struct stat st; + if (stat(path.c_str(), &st) != 0) { + return false; + } + return S_ISREG(st.st_mode); +} + +static std::string GetRelativePath(const std::string& full_path, const std::string& base_path) { + if (full_path.find(base_path) == 0) { + std::string relative = full_path.substr(base_path.length()); + if (!relative.empty() && relative[0] == '/') { + relative = relative.substr(1); + } + return relative; + } + return full_path; +} + +braft::FileAdaptor* PPosixFileSystemAdaptor::open(const std::string& path, int oflag, + const ::google::protobuf::Message* file_meta, + butil::File::Error* e) { + if ((oflag & IS_RDONLY) == 0) { // This is a read operation + bool snapshots_exists = false; + std::string snapshot_path; + + // parse snapshot path + butil::FilePath parse_snapshot_path(path); + std::vector components; + parse_snapshot_path.GetComponents(&components); + for (const auto& component : components) { + snapshot_path += component + "/"; + if (component.find("snapshot_") != std::string::npos) { + break; + 
} + } + + // check whether snapshots have been created + std::lock_guard guard(mutex_); + if (!snapshot_path.empty()) { + DIR* dir = opendir(snapshot_path.c_str()); + if (dir) { + struct dirent* entry; + while ((entry = readdir(dir)) != nullptr) { + std::string filename = entry->d_name; + if (filename != "." && filename != ".." && filename.find(PRAFT_SNAPSHOT_META_FILE) == std::string::npos) { + std::string full_path = snapshot_path + "/" + filename; + if (IsRegularFile(full_path) || IsDirectory(full_path)) { + // If the path directory contains files other than raft_snapshot_meta, snapshots have been generated + snapshots_exists = true; + break; + } + } + } + closedir(dir); + } + } + + // Snapshot generation + if (!snapshots_exists && !snapshot_path.empty()) { + braft::LocalSnapshotMetaTable snapshot_meta_memtable; + std::string meta_path = snapshot_path + "/" PRAFT_SNAPSHOT_META_FILE; + LOG(INFO) << "start to generate snapshot in path " << snapshot_path; + braft::FileSystemAdaptor* fs = braft::default_file_system(); + assert(fs); + snapshot_meta_memtable.load_from_file(fs, meta_path); + + if (g_pika_server) { + auto db = g_pika_server->GetDB("db0"); + if (db) { + std::set dbs = {db->GetDBName()}; + TaskArg checkpoint_task(TaskType::kCreateCheckpoint, {snapshot_path}); + auto status = g_pika_server->DoSameThingSpecificDB(dbs, checkpoint_task); + if (!status.ok()) { + LOG(ERROR) << "Failed to create checkpoint for snapshot: " << status.ToString(); + } + } + } + + AddAllFiles(snapshot_path, &snapshot_meta_memtable, snapshot_path); + + // Update snapshot meta with last log index and term + if (g_pika_server) { + auto db = g_pika_server->GetDB("db0"); + if (db && db->storage()) { + auto& new_meta = const_cast(snapshot_meta_memtable.meta()); + + // Get the smallest flushed log index as the snapshot point + uint64_t last_log_index = db->storage()->GetSmallestFlushedLogIndex(); + new_meta.set_last_included_index(last_log_index); + + // Get the term for this log index + 
auto raft_mgr = g_pika_server->GetRaftManager(); + if (raft_mgr) { + auto raft_node = raft_mgr->GetRaftNode("db0"); + if (raft_node && raft_node->GetRaftNode()) { + braft::NodeStatus status; + raft_node->GetRaftNode()->get_status(&status); + new_meta.set_last_included_term(status.term); + LOG(INFO) << "Updated snapshot meta: last_included_index=" << last_log_index + << ", last_included_term=" << status.term; + } + } + } + } + + auto rc = snapshot_meta_memtable.save_to_file(fs, meta_path); + if (rc == 0) { + LOG(INFO) << "Succeed to save snapshot in path " << snapshot_path; + } else { + LOG(ERROR) << "Fail to save snapshot in path " << snapshot_path; + } + LOG(INFO) << "generate snapshot completed in path " << snapshot_path; + } + } + + return braft::PosixFileSystemAdaptor::open(path, oflag, file_meta, e); +} + +void PPosixFileSystemAdaptor::AddAllFiles(const std::string& dir, + braft::LocalSnapshotMetaTable* snapshot_meta_memtable, + const std::string& base_path) { + assert(snapshot_meta_memtable); + DIR* dirp = opendir(dir.c_str()); + if (!dirp) { + LOG(WARNING) << "Failed to open directory: " << dir; + return; + } + + struct dirent* entry; + while ((entry = readdir(dirp)) != nullptr) { + std::string filename = entry->d_name; + if (filename == "." 
|| filename == "..") { + continue; + } + + std::string full_path = dir + "/" + filename; + if (IsDirectory(full_path)) { + LOG(INFO) << "dir_path = " << full_path; + AddAllFiles(full_path, snapshot_meta_memtable, base_path); + } else if (IsRegularFile(full_path)) { + std::string relative_path = GetRelativePath(full_path, base_path); + LOG(INFO) << "file_path = " << relative_path; + braft::LocalFileMeta meta; + if (snapshot_meta_memtable->add_file(relative_path, meta) != 0) { + LOG(WARNING) << "Failed to add file: " << relative_path; + } + } + } + closedir(dirp); +} diff --git a/src/storage/CMakeLists.txt b/src/storage/CMakeLists.txt index 7143682ce6..a86eb8795c 100644 --- a/src/storage/CMakeLists.txt +++ b/src/storage/CMakeLists.txt @@ -20,16 +20,19 @@ aux_source_directory(./src DIR_SRCS) add_library(storage STATIC ${DIR_SRCS} ) -add_dependencies(storage rocksdb gtest glog gflags fmt ${LIBUNWIND_NAME} pstd) +add_dependencies(storage rocksdb gtest glog gflags fmt ${LIBUNWIND_NAME} pstd binlog_pb) # TODO fix rocksdb include path target_include_directories(storage PUBLIC ${PROJECT_SOURCE_DIR} PUBLIC ${PROJECT_SOURCE_DIR}/include + ${CMAKE_SOURCE_DIR}/src/praft # For praft headers + ${CMAKE_BINARY_DIR}/src/praft # For generated binlog.pb.h ${INSTALL_INCLUDEDIR} ${ROCKSDB_SOURCE_DIR} ) target_link_libraries(storage + PUBLIC binlog_pb # Link binlog protobuf library PUBLIC ${ROCKSDB_LIBRARY} ${SNAPPY_LIBRARY} ${ZSTD_LIBRARY} @@ -40,4 +43,5 @@ target_link_libraries(storage ${GFLAGS_LIBRARY} ${FMT_LIBRARY} ${LIBUNWIND_LIBRARY} + ${PROTOBUF_LIBRARY} PUBLIC pstd) diff --git a/src/storage/include/storage/batch.h b/src/storage/include/storage/batch.h new file mode 100644 index 0000000000..9f10989933 --- /dev/null +++ b/src/storage/include/storage/batch.h @@ -0,0 +1,94 @@ +// Copyright (c) 2024-present, Qihoo, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include "rocksdb/db.h" +#include "rocksdb/status.h" + +// Forward declarations +namespace pikiwidb { +class Binlog; +} + +class Cmd; + +namespace net { +class NetConn; +} + +namespace storage { + +class Storage; +class Redis; + +// Callback for async commit result +// Parameters: status (result should be captured in the lambda from storage layer) +// Note: Connection and result should be captured in the lambda, not passed as parameters +using CommitCallback = std::function; + +using AppendLogFunction = std::function&&, + CommitCallback)>; + +using ColumnFamilyIndex = uint32_t; + +class Batch { +public: + virtual ~Batch() = default; + + virtual void Put(ColumnFamilyIndex cf_idx, const rocksdb::Slice& key, const rocksdb::Slice& value) = 0; + + virtual void Delete(ColumnFamilyIndex cf_idx, const rocksdb::Slice& key) = 0; + + virtual rocksdb::Status Commit(CommitCallback callback = nullptr) = 0; + + int32_t Count() const { return count_; } + + static std::unique_ptr CreateBatch(Redis* redis); + +protected: + int32_t count_ = 0; +}; + +class RocksBatch : public Batch { +public: + RocksBatch(rocksdb::DB* db, + const rocksdb::WriteOptions& options, + const std::vector& handles); + + void Put(ColumnFamilyIndex cf_idx, const rocksdb::Slice& key, const rocksdb::Slice& value) override; + void Delete(ColumnFamilyIndex cf_idx, const rocksdb::Slice& key) override; + rocksdb::Status Commit(CommitCallback callback = nullptr) override; + +private: + rocksdb::WriteBatch batch_; + rocksdb::DB* db_; + const rocksdb::WriteOptions& options_; + std::vector handles_; +}; + +class BinlogBatch : public Batch { +public: + BinlogBatch(AppendLogFunction func, uint32_t data_type, uint32_t db_id, uint32_t slot_idx = 0, uint32_t timeout_s = 10); + ~BinlogBatch() override; + + void Put(ColumnFamilyIndex cf_idx, const rocksdb::Slice& key, const rocksdb::Slice& value) override; + void Delete(ColumnFamilyIndex cf_idx, const 
rocksdb::Slice& key) override; + + // Commit to Raft (sync mode if callback is null, async mode otherwise) + rocksdb::Status Commit(CommitCallback callback = nullptr) override; + +private: + AppendLogFunction append_log_func_; + std::unique_ptr<::pikiwidb::Binlog> binlog_; + uint32_t data_type_; // 数据类型,用于 binlog entry + uint32_t timeout_seconds_; +}; + +} // namespace storage diff --git a/src/storage/include/storage/storage.h b/src/storage/include/storage/storage.h index 8c2e53a7b3..e6274ddd38 100644 --- a/src/storage/include/storage/storage.h +++ b/src/storage/include/storage/storage.h @@ -7,6 +7,8 @@ #define INCLUDE_STORAGE_STORAGE_H_ #include +#include +#include #include #include #include @@ -23,6 +25,18 @@ #include "rocksdb/table.h" #include "pstd/include/pstd_mutex.h" +#include "storage/batch.h" + +// Forward declarations +namespace pikiwidb { +class Binlog; +} + +class Cmd; + +namespace net { +class NetConn; +} namespace storage { @@ -68,6 +82,11 @@ struct StreamInfoResult; template class LRUCache; +// Forward declaration for Binlog +namespace pikiwidb { +class Binlog; +} + struct StorageOptions { rocksdb::Options options; rocksdb::BlockBasedTableOptions table_options; @@ -76,6 +95,12 @@ struct StorageOptions { size_t statistics_max_size = 0; size_t small_compaction_threshold = 5000; size_t small_compaction_duration_threshold = 10000; + + std::function&&, + CommitCallback)> append_log_function; + + std::function do_snapshot_function; + Status ResetOptions(const OptionType& option_type, const std::unordered_map& options_map); }; @@ -178,10 +203,12 @@ class Storage { // Set key to hold the string value. if key // already holds a value, it is overwritten - Status Set(const Slice& key, const Slice& value); + Status Set(const Slice& key, const Slice& value, + CommitCallback callback = nullptr); // Set key to hold the string value. 
if key exist - Status Setxx(const Slice& key, const Slice& value, int32_t* ret, int32_t ttl = 0); + Status Setxx(const Slice& key, const Slice& value, int32_t* ret, int32_t ttl = 0, + CommitCallback callback = nullptr); // Get the value of key. If the key does not exist // the special value nil is returned @@ -193,17 +220,17 @@ class Storage { // Atomically sets key to value and returns the old value stored at key // Returns an error when key exists but does not hold a string value. - Status GetSet(const Slice& key, const Slice& value, std::string* old_value); + Status GetSet(const Slice& key, const Slice& value, std::string* old_value, CommitCallback callback = nullptr); // Sets or clears the bit at offset in the string value stored at key - Status SetBit(const Slice& key, int64_t offset, int32_t value, int32_t* ret); + Status SetBit(const Slice& key, int64_t offset, int32_t value, int32_t* ret, CommitCallback callback = nullptr); // Returns the bit value at offset in the string value stored at key Status GetBit(const Slice& key, int64_t offset, int32_t* ret); // Sets the given keys to their respective values // MSET replaces existing values with new values - Status MSet(const std::vector& kvs); + Status MSet(const std::vector& kvs, CommitCallback callback = nullptr); // Returns the values of all specified keys. For every key // that does not hold a string value or does not exist, the @@ -218,28 +245,29 @@ class Storage { // Set key to hold string value if key does not exist // return 1 if the key was set // return 0 if the key was not set - Status Setnx(const Slice& key, const Slice& value, int32_t* ret, int32_t ttl = 0); + Status Setnx(const Slice& key, const Slice& value, int32_t* ret, int32_t ttl = 0, + CommitCallback callback = nullptr); // Sets the given keys to their respective values. // MSETNX will not perform any operation at all even // if just a single key already exists. 
- Status MSetnx(const std::vector& kvs, int32_t* ret); + Status MSetnx(const std::vector& kvs, int32_t* ret, CommitCallback callback = nullptr); // Set key to hold string new_value if key currently hold the give value // return 1 if the key currently hold the give value And override success // return 0 if the key doesn't exist And override fail // return -1 if the key currently does not hold the given value And override fail - Status Setvx(const Slice& key, const Slice& value, const Slice& new_value, int32_t* ret, int32_t ttl = 0); + Status Setvx(const Slice& key, const Slice& value, const Slice& new_value, int32_t* ret, int32_t ttl = 0, CommitCallback callback = nullptr); // delete the key that holds a given value // return 1 if the key currently hold the give value And delete success // return 0 if the key doesn't exist And del fail // return -1 if the key currently does not hold the given value And del fail - Status Delvx(const Slice& key, const Slice& value, int32_t* ret); + Status Delvx(const Slice& key, const Slice& value, int32_t* ret, CommitCallback callback = nullptr); // Set key to hold string value if key does not exist // return the length of the string after it was modified by the command - Status Setrange(const Slice& key, int64_t start_offset, const Slice& value, int32_t* ret); + Status Setrange(const Slice& key, int64_t start_offset, const Slice& value, int32_t* ret, CommitCallback callback = nullptr); // Returns the substring of the string value stored at key, // determined by the offsets start and end (both are inclusive) @@ -251,7 +279,7 @@ class Storage { // If key already exists and is a string, this command appends the value at // the end of the string // return the length of the string after the append operation - Status Append(const Slice& key, const Slice& value, int32_t* ret, int32_t* expired_timestamp_sec, std::string& out_new_value); + Status Append(const Slice& key, const Slice& value, int32_t* ret, int32_t* expired_timestamp_sec, 
std::string& out_new_value, CommitCallback callback = nullptr); // Count the number of set bits (population counting) in a string. // return the number of bits set to 1 @@ -272,19 +300,20 @@ class Storage { // Decrements the number stored at key by decrement // return the value of key after the decrement - Status Decrby(const Slice& key, int64_t value, int64_t* ret); + Status Decrby(const Slice& key, int64_t value, int64_t* ret, CommitCallback callback = nullptr); // Increments the number stored at key by increment. // If the key does not exist, it is set to 0 before performing the operation - Status Incrby(const Slice& key, int64_t value, int64_t* ret, int32_t* expired_timestamp_sec); + Status Incrby(const Slice& key, int64_t value, int64_t* ret, int32_t* expired_timestamp_sec, CommitCallback callback = nullptr); // Increment the string representing a floating point number // stored at key by the specified increment. - Status Incrbyfloat(const Slice& key, const Slice& value, std::string* ret, int32_t* expired_timestamp_sec); + Status Incrbyfloat(const Slice& key, const Slice& value, std::string* ret, int32_t* expired_timestamp_sec, CommitCallback callback = nullptr); // Set key to hold the string value and set key to timeout after a given // number of seconds - Status Setex(const Slice& key, const Slice& value, int32_t ttl); + Status Setex(const Slice& key, const Slice& value, int32_t ttl, + CommitCallback callback = nullptr); // Returns the length of the string value stored at key. An error // is returned when key holds a non-string value. @@ -294,14 +323,15 @@ class Storage { // specifying the number of seconds representing the TTL (time to live), it // takes an absolute Unix timestamp (seconds since January 1, 1970). A // timestamp in the past will delete the key immediately. 
- Status PKSetexAt(const Slice& key, const Slice& value, int32_t timestamp); + Status PKSetexAt(const Slice& key, const Slice& value, int32_t timestamp, CommitCallback callback = nullptr); // Hashes Commands // Sets field in the hash stored at key to value. If key does not exist, a new // key holding a hash is created. If field already exists in the hash, it is // overwritten. - Status HSet(const Slice& key, const Slice& field, const Slice& value, int32_t* res); + Status HSet(const Slice& key, const Slice& field, const Slice& value, int32_t* res, + CommitCallback callback = nullptr); // Returns the value associated with field in the hash stored at key. // the value associated with field, or nil when field is not present in the @@ -311,7 +341,8 @@ class Storage { // Sets the specified fields to their respective values in the hash stored at // key. This command overwrites any specified fields already existing in the // hash. If key does not exist, a new key holding a hash is created. - Status HMSet(const Slice& key, const std::vector& fvs); + Status HMSet(const Slice& key, const std::vector& fvs, + CommitCallback callback = nullptr); // Returns the values associated with the specified fields in the hash stored // at key. @@ -336,7 +367,8 @@ class Storage { // Sets field in the hash stored at key to value, only if field does not yet // exist. If key does not exist, a new key holding a hash is created. If field // already exists, this operation has no effect. - Status HSetnx(const Slice& key, const Slice& field, const Slice& value, int32_t* ret); + Status HSetnx(const Slice& key, const Slice& field, const Slice& value, int32_t* ret, + CommitCallback callback = nullptr); // Returns the number of fields contained in the hash stored at key. // Return 0 when key does not exist. @@ -356,7 +388,7 @@ class Storage { // increment. If key does not exist, a new key holding a hash is created. If // field does not exist the value is set to 0 before the operation is // performed. 
- Status HIncrby(const Slice& key, const Slice& field, int64_t value, int64_t* ret); + Status HIncrby(const Slice& key, const Slice& field, int64_t value, int64_t* ret, CommitCallback callback = nullptr); // Increment the specified field of a hash stored at key, and representing a // floating point number, by the specified increment. If the increment value @@ -368,12 +400,13 @@ class Storage { // The field contains a value of the wrong type (not a string). // The current field content or the specified increment are not parsable as a // double precision floating point number. - Status HIncrbyfloat(const Slice& key, const Slice& field, const Slice& by, std::string* new_value); + Status HIncrbyfloat(const Slice& key, const Slice& field, const Slice& by, std::string* new_value, CommitCallback callback = nullptr); // Removes the specified fields from the hash stored at key. Specified fields // that do not exist within this hash are ignored. If key does not exist, it // is treated as an empty hash and this command returns 0. - Status HDel(const Slice& key, const std::vector& fields, int32_t* ret); + Status HDel(const Slice& key, const std::vector& fields, int32_t* ret, + CommitCallback callback = nullptr); // See SCAN for HSCAN documentation. Status HScan(const Slice& key, int64_t cursor, const std::string& pattern, int64_t count, @@ -400,7 +433,8 @@ class Storage { // Add the specified members to the set stored at key. Specified members that // are already a member of this set are ignored. If key does not exist, a new // set is created before adding the specified members. - Status SAdd(const Slice& key, const std::vector& members, int32_t* ret); + Status SAdd(const Slice& key, const std::vector& members, int32_t* ret, + CommitCallback callback = nullptr); // Returns the set cardinality (number of elements) of the set stored at key. 
Status SCard(const Slice& key, int32_t* ret); @@ -426,7 +460,8 @@ class Storage { // key3 = {a, c, e} // SDIFFSTORE destination key1 key2 key3 // destination = {b, d} - Status SDiffstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret); + Status SDiffstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret, + CommitCallback callback = nullptr); // Returns the members of the set resulting from the intersection of all the // given sets. @@ -449,7 +484,8 @@ class Storage { // key3 = {a, c, e} // SINTERSTORE destination key1 key2 key3 // destination = {a, c} - Status SInterstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret); + Status SInterstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret, + CommitCallback callback = nullptr); // Returns if member is a member of the set stored at key. Status SIsmember(const Slice& key, const Slice& member, int32_t* ret); @@ -463,10 +499,11 @@ class Storage { // Remove the specified members from the set stored at key. Specified members // that are not a member of this set are ignored. If key does not exist, it is // treated as an empty set and this command returns 0. - Status SRem(const Slice& key, const std::vector& members, int32_t* ret); + Status SRem(const Slice& key, const std::vector& members, int32_t* ret, + CommitCallback callback = nullptr); // Removes and returns several random elements specified by count from the set value store at key. - Status SPop(const Slice& key, std::vector* members, int64_t count); + Status SPop(const Slice& key, std::vector* members, int64_t count, CommitCallback callback = nullptr); // When called with just the key argument, return a random element from the // set value stored at key. @@ -486,7 +523,7 @@ class Storage { // removed from the source set and added to the destination set. 
When the // specified element already exists in the destination set, it is only removed // from the source set. - Status SMove(const Slice& source, const Slice& destination, const Slice& member, int32_t* ret); + Status SMove(const Slice& source, const Slice& destination, const Slice& member, int32_t* ret, CommitCallback callback = nullptr); // Returns the members of the set resulting from the union of all the given // sets. @@ -508,7 +545,8 @@ class Storage { // key3 = {c, d, e} // SUNIONSTORE destination key1 key2 key3 // destination = {a, b, c, d, e} - Status SUnionstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret); + Status SUnionstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret, + CommitCallback callback = nullptr); // See SCAN for SSCAN documentation. Status SScan(const Slice& key, int64_t cursor, const std::string& pattern, int64_t count, @@ -519,12 +557,14 @@ class Storage { // Insert all the specified values at the head of the list stored at key. If // key does not exist, it is created as empty list before performing the push // operations. - Status LPush(const Slice& key, const std::vector& values, uint64_t* ret); + Status LPush(const Slice& key, const std::vector& values, uint64_t* ret, + CommitCallback callback = nullptr); // Insert all the specified values at the tail of the list stored at key. If // key does not exist, it is created as empty list before performing the push // operation. - Status RPush(const Slice& key, const std::vector& values, uint64_t* ret); + Status RPush(const Slice& key, const std::vector& values, uint64_t* ret, + CommitCallback callback = nullptr); // Returns the specified elements of the list stored at key. The offsets start // and stop are zero-based indexes, with 0 being the first element of the list @@ -536,7 +576,8 @@ class Storage { // Removes the first count occurrences of elements equal to value from the // list stored at key. 
The count argument influences the operation in the // following ways - Status LTrim(const Slice& key, int64_t start, int64_t stop); + Status LTrim(const Slice& key, int64_t start, int64_t stop, + CommitCallback callback = nullptr); // Returns the length of the list stored at key. If key does not exist, it is // interpreted as an empty list and 0 is returned. An error is returned when @@ -544,10 +585,10 @@ class Storage { Status LLen(const Slice& key, uint64_t* len); // Removes and returns the first elements of the list stored at key. - Status LPop(const Slice& key, int64_t count, std::vector* elements); + Status LPop(const Slice& key, int64_t count, std::vector* elements, CommitCallback callback = nullptr); // Removes and returns the last elements of the list stored at key. - Status RPop(const Slice& key, int64_t count, std::vector* elements); + Status RPop(const Slice& key, int64_t count, std::vector* elements, CommitCallback callback = nullptr); // Returns the element at index index in the list stored at key. The index is // zero-based, so 0 means the first element, 1 the second element and so on. @@ -562,17 +603,20 @@ class Storage { // performed. // An error is returned when key exists but does not hold a list value. Status LInsert(const Slice& key, const BeforeOrAfter& before_or_after, const std::string& pivot, - const std::string& value, int64_t* ret); + const std::string& value, int64_t* ret, + CommitCallback callback = nullptr); // Inserts value at the head of the list stored at key, only if key already // exists and holds a list. In contrary to LPUSH, no operation will be // performed when key does not yet exist. - Status LPushx(const Slice& key, const std::vector& values, uint64_t* len); + Status LPushx(const Slice& key, const std::vector& values, uint64_t* len, + CommitCallback callback = nullptr); // Inserts value at the tail of the list stored at key, only if key already // exists and holds a list. 
In contrary to RPUSH, no operation will be // performed when key does not yet exist. - Status RPushx(const Slice& key, const std::vector& values, uint64_t* len); + Status RPushx(const Slice& key, const std::vector& values, uint64_t* len, + CommitCallback callback = nullptr); // Removes the first count occurrences of elements equal to value from the // list stored at key. The count argument influences the operation in the @@ -586,13 +630,15 @@ class Storage { // // Note that non-existing keys are treated like empty lists, so when key does // not exist, the command will always return 0. - Status LRem(const Slice& key, int64_t count, const Slice& value, uint64_t* ret); + Status LRem(const Slice& key, int64_t count, const Slice& value, uint64_t* ret, + CommitCallback callback = nullptr); // Sets the list element at index to value. For more information on the index // argument, see LINDEX. // // An error is returned for out of range indexes. - Status LSet(const Slice& key, int64_t index, const Slice& value); + Status LSet(const Slice& key, int64_t index, const Slice& value, + CommitCallback callback = nullptr); // Atomically returns and removes the last element (tail) of the list stored // at source, and pushes the element at the first element (head) of the list @@ -607,7 +653,7 @@ class Storage { // equivalent to removing the last element from the list and pushing it as // first element of the list, so it can be considered as a list rotation // command. - Status RPoplpush(const Slice& source, const Slice& destination, std::string* element); + Status RPoplpush(const Slice& source, const Slice& destination, std::string* element, CommitCallback callback = nullptr); // Zsets Commands @@ -616,14 +662,14 @@ class Storage { // set less than count, it will pop out the total number of sorted set. If two // ScoreMember's score were the same, the lexicographic predominant elements will // be pop out. 
- Status ZPopMax(const Slice& key, int64_t count, std::vector* score_members); + Status ZPopMax(const Slice& key, int64_t count, std::vector* score_members, CommitCallback callback = nullptr); // Pop the minimum count score_members which have less score in the sorted set. // And return the result in the score_members,If the total number of the sorted // set less than count, it will pop out the total number of sorted set. If two // ScoreMember's score were the same, the lexicographic predominant elements will // not be pop out. - Status ZPopMin(const Slice& key, int64_t count, std::vector* score_members); + Status ZPopMin(const Slice& key, int64_t count, std::vector* score_members, CommitCallback callback = nullptr); // Adds all the specified members with the specified scores to the sorted set // stored at key. It is possible to specify multiple score / member pairs. If @@ -636,7 +682,8 @@ class Storage { // does not hold a sorted set, an error is returned. // The score values should be the string representation of a double precision // floating point number. +inf and -inf values are valid values as well. - Status ZAdd(const Slice& key, const std::vector& score_members, int32_t* ret); + Status ZAdd(const Slice& key, const std::vector& score_members, int32_t* ret, + CommitCallback callback = nullptr); // Returns the sorted set cardinality (number of elements) of the sorted set // stored at key. @@ -664,7 +711,7 @@ class Storage { // The score value should be the string representation of a numeric value, and // accepts double precision floating point numbers. It is possible to provide // a negative value to decrement the score. - Status ZIncrby(const Slice& key, const Slice& member, double increment, double* ret); + Status ZIncrby(const Slice& key, const Slice& member, double increment, double* ret, CommitCallback callback = nullptr); // Returns the specified range of elements in the sorted set stored at key. 
// The elements are considered to be ordered from the lowest to the highest @@ -786,7 +833,8 @@ class Storage { // existing members are ignored. // // An error is returned when key exists and does not hold a sorted set. - Status ZRem(const Slice& key, const std::vector& members, int32_t* ret); + Status ZRem(const Slice& key, const std::vector& members, int32_t* ret, + CommitCallback callback = nullptr); // Removes all elements in the sorted set stored at key with rank between // start and stop. Both start and stop are 0 -based indexes with 0 being the @@ -794,11 +842,13 @@ class Storage { // they indicate offsets starting at the element with the highest score. For // example: -1 is the element with the highest score, -2 the element with the // second highest score and so forth. - Status ZRemrangebyrank(const Slice& key, int32_t start, int32_t stop, int32_t* ret); + Status ZRemrangebyrank(const Slice& key, int32_t start, int32_t stop, int32_t* ret, + CommitCallback callback = nullptr); // Removes all elements in the sorted set stored at key with a score between // min and max (inclusive). - Status ZRemrangebyscore(const Slice& key, double min, double max, bool left_close, bool right_close, int32_t* ret); + Status ZRemrangebyscore(const Slice& key, double min, double max, bool left_close, bool right_close, int32_t* ret, + CommitCallback callback = nullptr); // Returns the specified range of elements in the sorted set stored at key. // The elements are considered to be ordered from the highest to the lowest @@ -867,7 +917,8 @@ class Storage { // // If destination already exists, it is overwritten. Status ZUnionstore(const Slice& destination, const std::vector& keys, const std::vector& weights, - AGGREGATE agg, std::map& value_to_dest, int32_t* ret); + AGGREGATE agg, std::map& value_to_dest, int32_t* ret, + CommitCallback callback = nullptr); // Computes the intersection of numkeys sorted sets given by the specified // keys, and stores the result in destination. 
It is mandatory to provide the @@ -884,7 +935,8 @@ class Storage { // // If destination already exists, it is overwritten. Status ZInterstore(const Slice& destination, const std::vector& keys, const std::vector& weights, - AGGREGATE agg, std::vector& value_to_dest, int32_t* ret); + AGGREGATE agg, std::vector& value_to_dest, int32_t* ret, + CommitCallback callback = nullptr); // When all the elements in a sorted set are inserted with the same score, in // order to force lexicographical ordering, this command returns all the @@ -927,7 +979,7 @@ class Storage { // Similarly, this command actually returns the same elements that ZRANGEBYLEX // would return if called with the same min and max arguments. Status ZRemrangebylex(const Slice& key, const Slice& min, const Slice& max, bool left_close, bool right_close, - int32_t* ret); + int32_t* ret, CommitCallback callback = nullptr); // See SCAN for ZSCAN documentation. Status ZScan(const Slice& key, int64_t cursor, const std::string& pattern, int64_t count, @@ -1056,7 +1108,7 @@ class Storage { }; // Adds all the element arguments to the HyperLogLog data structure stored // at the variable name specified as first argument. - Status PfAdd(const Slice& key, const std::vector& values, bool* update); + Status PfAdd(const Slice& key, const std::vector& values, bool* update, CommitCallback callback = nullptr); // When called with a single key, returns the approximated cardinality // computed by the HyperLogLog data structure stored at the specified @@ -1066,12 +1118,15 @@ class Storage { // Merge multiple HyperLogLog values into an unique value that will // approximate the cardinality of the union of the observed Sets of the source // HyperLogLog structures. 
- Status PfMerge(const std::vector& keys, std::string& value_to_dest); + Status PfMerge(const std::vector& keys, std::string& value_to_dest, CommitCallback callback = nullptr); // Admin Commands Status StartBGThread(); Status RunBGTask(); Status AddBGTask(const BGTask& bg_task); + Status CreateCheckpointInternal(const std::string& checkpoint_path, const std::string& db_name); + Status LoadCheckpointInternal(const std::string& checkpoint_sub_path, const std::string& db_sub_path, + const std::string& db_type); Status Compact(const DataType& type, bool sync = false); Status CompactRange(const DataType& type, const std::string& start, const std::string& end, bool sync = false); @@ -1101,7 +1156,31 @@ class Storage { const std::string& db_type, const std::unordered_map& options); void GetRocksDBInfo(std::string& info); + bool IsRaftEnabled() const { return append_log_function_ != nullptr; } + + const std::function&&, + CommitCallback)>& + GetAppendLogFunction() const { return append_log_function_; } + + rocksdb::Status OnBinlogWrite(const ::pikiwidb::Binlog& binlog, uint64_t log_index); + + uint64_t GetSmallestFlushedLogIndex(); + + // Load database from checkpoint directory + // This will replace current database data with checkpoint data + Status LoadFromCheckpoint(const std::string& checkpoint_path); + std::vector> LoadCheckpoint(const std::string& checkpoint_sub_path, + const std::string& db_sub_path); + Status Close(); + + // Create checkpoint (snapshot) of current database state + // This will create a consistent snapshot to the specified directory + std::vector> CreateCheckpoint(const std::string& checkpoint_path); + private: + std::string db_path_; // Store db path for checkpoint restore + StorageOptions open_options_; + bool open_options_initialized_ = false; std::unique_ptr strings_db_; std::unique_ptr hashes_db_; std::unique_ptr sets_db_; @@ -1123,6 +1202,10 @@ class Storage { // For scan keys in data base std::atomic scan_keynum_exit_ = false; + + // Raft 
binlog callback + std::function&&, + CommitCallback)> append_log_function_; }; } // namespace storage diff --git a/src/storage/include/storage/storage_define.h b/src/storage/include/storage/storage_define.h new file mode 100644 index 0000000000..b5822c2d01 --- /dev/null +++ b/src/storage/include/storage/storage_define.h @@ -0,0 +1,17 @@ +// Copyright (c) 2024-present, Qihoo, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef STORAGE_STORAGE_DEFINE_H_ +#define STORAGE_STORAGE_DEFINE_H_ + +#include + +namespace storage { + +using LogIndex = int64_t; + +} // namespace storage + +#endif // STORAGE_STORAGE_DEFINE_H_ diff --git a/src/storage/src/batch.cc b/src/storage/src/batch.cc new file mode 100644 index 0000000000..3c18f7138b --- /dev/null +++ b/src/storage/src/batch.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2024-present, Qihoo, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include "storage/storage.h" +#include "storage/batch.h" +#include "src/redis.h" +#include "glog/logging.h" +#include "binlog.pb.h" +#include + +namespace storage { + +std::unique_ptr Batch::CreateBatch(Redis* redis) { + if (redis->GetStorage() && redis->GetStorage()->IsRaftEnabled()) { + return std::make_unique( + redis->GetStorage()->GetAppendLogFunction(), + redis->GetDataType(), // data_type + 0, // db_id + 0, // slot_idx + 10 // timeout_s + ); + } else { + return std::make_unique( + redis->GetDB(), + redis->GetWriteOptions(), + redis->GetHandles() + ); + } +} + +RocksBatch::RocksBatch(rocksdb::DB* db, + const rocksdb::WriteOptions& options, + const std::vector& handles) + : db_(db), options_(options), handles_(handles) { +} + +void RocksBatch::Put(ColumnFamilyIndex cf_idx, const rocksdb::Slice& key, const rocksdb::Slice& value) { + if (handles_.empty() || cf_idx >= handles_.size()) { + batch_.Put(key, value); + } else { + batch_.Put(handles_[cf_idx], key, value); + } + count_++; +} + +void RocksBatch::Delete(ColumnFamilyIndex cf_idx, const rocksdb::Slice& key) { + if (handles_.empty() || cf_idx >= handles_.size()) { + batch_.Delete(key); + } else { + batch_.Delete(handles_[cf_idx], key); + } + count_++; +} + +rocksdb::Status RocksBatch::Commit(CommitCallback callback) { + // RocksBatch is for non-Raft mode, always synchronous + // Ignore callback parameter + return db_->Write(options_, &batch_); +} + + +BinlogBatch::BinlogBatch(AppendLogFunction func, uint32_t data_type, uint32_t db_id, uint32_t slot_idx, uint32_t timeout_s) + : append_log_func_(std::move(func)), + binlog_(std::make_unique<::pikiwidb::Binlog>()), + data_type_(data_type), + timeout_seconds_(timeout_s) { + binlog_->set_db_id(db_id); + binlog_->set_slot_idx(slot_idx); +} + +BinlogBatch::~BinlogBatch() = default; + +void BinlogBatch::Put(ColumnFamilyIndex cf_idx, const rocksdb::Slice& key, const rocksdb::Slice& value) { + auto* entry = binlog_->add_entries(); + 
entry->set_data_type(static_cast<::pikiwidb::DataType>(data_type_));
+  entry->set_cf_idx(cf_idx);
+  entry->set_op_type(::pikiwidb::OperateType::kPut);
+  entry->set_key(key.data(), key.size());
+  entry->set_value(value.data(), value.size());
+  count_++;
+}
+
+void BinlogBatch::Delete(ColumnFamilyIndex cf_idx, const rocksdb::Slice& key) {
+  auto* entry = binlog_->add_entries();
+  entry->set_data_type(static_cast<::pikiwidb::DataType>(data_type_));
+  entry->set_cf_idx(cf_idx);
+  entry->set_op_type(::pikiwidb::OperateType::kDelete);
+  entry->set_key(key.data(), key.size());
+  count_++;
+}
+
+rocksdb::Status BinlogBatch::Commit(CommitCallback callback) {
+  if (count_ == 0) {
+    return rocksdb::Status::OK();
+  }
+
+  if (callback) {
+    // Async mode: pass callback to append_log_func
+    std::promise<rocksdb::Status> dummy_promise;
+    append_log_func_(*binlog_, std::move(dummy_promise), callback);
+    // Return OK immediately, actual response will be sent in Raft callback
+    return rocksdb::Status::OK();
+  } else {
+    // Sync mode: wait for Raft to apply
+    std::promise<rocksdb::Status> promise;
+    auto future = promise.get_future();
+
+    append_log_func_(*binlog_, std::move(promise), nullptr);
+
+    auto status = future.wait_for(std::chrono::seconds(timeout_seconds_));
+    if (status == std::future_status::timeout) {
+      LOG(ERROR) << "Raft apply timeout after " << timeout_seconds_ << " seconds";
+      return rocksdb::Status::TimedOut("Wait for Raft apply timeout");
+    }
+
+    return future.get();
+  }
+}
+
+} // namespace storage
diff --git a/src/storage/src/log_index.cc b/src/storage/src/log_index.cc
new file mode 100644
index 0000000000..3995e52224
--- /dev/null
+++ b/src/storage/src/log_index.cc
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2024-present, Qihoo, Inc. All rights reserved.
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */ + +#include "log_index.h" + +#include +#include +#include +#include + +#include "redis.h" + +namespace storage { + +rocksdb::Status LogIndexOfColumnFamilies::Init(Redis *db) { + // Resize cf_ vector based on actual number of column families + size_t cf_count = db->GetHandles().size(); + cf_.resize(cf_count); + + for (size_t i = 0; i < cf_.size(); i++) { + rocksdb::TablePropertiesCollection collection; + auto s = db->GetDB()->GetPropertiesOfAllTables(db->GetHandles()[i], &collection); + if (!s.ok()) { + return s; + } + auto res = LogIndexTablePropertiesCollector::GetLargestLogIndexFromTableCollection(collection); + if (res.has_value()) { + auto log_index = res->GetAppliedLogIndex(); + auto sequence_number = res->GetSequenceNumber(); + cf_[i].applied_index.SetLogIndexSeqnoPair(log_index, sequence_number); + cf_[i].flushed_index.SetLogIndexSeqnoPair(log_index, sequence_number); + } + } + return rocksdb::Status::OK(); +} + +LogIndexOfColumnFamilies::SmallestIndexRes LogIndexOfColumnFamilies::GetSmallestLogIndex(int flush_cf) const { + SmallestIndexRes res; + for (int i = 0; i < static_cast(cf_.size()); i++) { + if (i != flush_cf && cf_[i].flushed_index >= cf_[i].applied_index) { + continue; + } + auto applied_log_index = cf_[i].applied_index.GetLogIndex(); + auto flushed_log_index = cf_[i].flushed_index.GetLogIndex(); + auto flushed_seqno = cf_[i].flushed_index.GetSequenceNumber(); + if (applied_log_index < res.smallest_applied_log_index) { + res.smallest_applied_log_index = applied_log_index; + res.smallest_applied_log_index_cf = i; + } + if (flushed_log_index < res.smallest_flushed_log_index) { + res.smallest_flushed_log_index = flushed_log_index; + res.smallest_flushed_seqno = flushed_seqno; + res.smallest_flushed_log_index_cf = i; + } + } + return res; +} + +size_t LogIndexOfColumnFamilies::GetPendingFlushGap() const { + std::set s; + for (size_t i = 0; i < cf_.size(); i++) { + s.insert(cf_[i].applied_index.GetLogIndex()); + 
s.insert(cf_[i].flushed_index.GetLogIndex());
+  }
+  assert(!s.empty());
+  if (s.size() == 1) {
+    return 0;
+  }
+  auto iter_first = s.begin();
+  auto iter_last = s.end();
+  return *std::prev(iter_last) - *iter_first;
+}
+
+std::atomic_int64_t LogIndexAndSequenceCollector::max_gap_ = 1000;
+
+std::optional<LogIndexAndSequencePair> LogIndexTablePropertiesCollector::ReadStatsFromTableProps(
+    const std::shared_ptr<const rocksdb::TableProperties> &table_props) {
+  const auto &user_properties = table_props->user_collected_properties;
+  const auto it = user_properties.find(kPropertyName.data());
+  if (it == user_properties.end()) {
+    return std::nullopt;
+  }
+  std::string s = it->second;
+  LogIndex applied_log_index;
+  SequenceNumber largest_seqno;
+  auto res = sscanf(s.c_str(), "%" PRIi64 "/%" PRIu64 "", &applied_log_index, &largest_seqno);
+  assert(res == 2);
+
+  return LogIndexAndSequencePair(applied_log_index, largest_seqno);
+}
+
+LogIndex LogIndexAndSequenceCollector::FindAppliedLogIndex(SequenceNumber seqno) const {
+  if (seqno == 0) { // the seqno will be 0 when executing compaction
+    return 0;
+  }
+  std::shared_lock gd(mutex_);
+  if (list_.empty() || seqno < list_.front().GetSequenceNumber()) {
+    return 0;
+  }
+  if (seqno >= list_.back().GetSequenceNumber()) {
+    return list_.back().GetAppliedLogIndex();
+  }
+
+  auto it = std::lower_bound(
+      list_.begin(), list_.end(), seqno,
+      [](const LogIndexAndSequencePair &p, SequenceNumber tar) { return p.GetSequenceNumber() <= tar; });
+  if (it->GetSequenceNumber() > seqno) {
+    --it;
+  }
+  assert(it->GetSequenceNumber() <= seqno);
+  return it->GetAppliedLogIndex();
+}
+
+void LogIndexAndSequenceCollector::Update(LogIndex smallest_applied_log_index, SequenceNumber smallest_flush_seqno) {
+  // If step length > 1, log index is sampled and sacrifice precision to save memory usage.
+  // It means that extra applied log may be applied again on start stage.
+ if ((smallest_applied_log_index & step_length_mask_) == 0) { + std::lock_guard gd(mutex_); + list_.emplace_back(smallest_applied_log_index, smallest_flush_seqno); + } +} + +void LogIndexAndSequenceCollector::Purge(LogIndex smallest_applied_log_index) { + // The reason that we use smallest applied log index of all column families instead of smallest flushed log index is + // that the log index corresponding to the largest sequence number in the next flush must be greater than or equal to + // the smallest applied log index at this moment. + // So we just need to make sure that there is an element in the queue which is less than or equal to the smallest + // applied log index to ensure that we can find a correct log index while doing next flush. + std::lock_guard gd(mutex_); + if (list_.size() < 2) { + return; + } + auto second = std::next(list_.begin()); + while (list_.size() >= 2 && second->GetAppliedLogIndex() <= smallest_applied_log_index) { + list_.pop_front(); + second = std::next(list_.begin()); + } +} + +auto LogIndexTablePropertiesCollector::GetLargestLogIndexFromTableCollection( + const rocksdb::TablePropertiesCollection &collection) -> std::optional { + LogIndex max_flushed_log_index{-1}; + rocksdb::SequenceNumber seqno{}; + for (const auto &[_, props] : collection) { + auto res = LogIndexTablePropertiesCollector::ReadStatsFromTableProps(props); + if (res.has_value() && res->GetAppliedLogIndex() > max_flushed_log_index) { + max_flushed_log_index = res->GetAppliedLogIndex(); + seqno = res->GetSequenceNumber(); + } + } + return max_flushed_log_index == -1 ? 
std::nullopt + : std::make_optional(max_flushed_log_index, seqno); +} + +void LogIndexAndSequenceCollectorPurger::OnFlushCompleted(rocksdb::DB *db, + const rocksdb::FlushJobInfo &flush_job_info) { + cf_->SetFlushedLogIndex(flush_job_info.cf_id, collector_->FindAppliedLogIndex(flush_job_info.largest_seqno), + flush_job_info.largest_seqno); + + auto [smallest_applied_log_index_cf, smallest_applied_log_index, smallest_flushed_log_index_cf, + smallest_flushed_log_index, smallest_flushed_seqno] = cf_->GetSmallestLogIndex(flush_job_info.cf_id); + collector_->Purge(smallest_applied_log_index); + + if (smallest_flushed_log_index_cf != -1) { + cf_->SetFlushedLogIndexGlobal(smallest_flushed_log_index, smallest_flushed_seqno); + } + auto count = count_.fetch_add(1); + + if (count % 10 == 0 && callback_) { + callback_(smallest_flushed_log_index, false); + } + + if (flush_job_info.cf_id == manul_flushing_cf_.load()) { + manul_flushing_cf_.store(-1); + } + + auto flushing_cf = manul_flushing_cf_.load(); + if (flushing_cf != static_cast(-1) || !collector_->IsFlushPending()) { + return; + } + + assert(flushing_cf == static_cast(-1)); + + if (!manul_flushing_cf_.compare_exchange_strong(flushing_cf, smallest_flushed_log_index_cf)) { + return; + } + + assert(manul_flushing_cf_.load() == static_cast(smallest_flushed_log_index_cf)); + rocksdb::FlushOptions flush_option; + flush_option.wait = false; + db->Flush(flush_option, column_families_->at(smallest_flushed_log_index_cf)); +} + +} // namespace storage diff --git a/src/storage/src/log_index.h b/src/storage/src/log_index.h new file mode 100644 index 0000000000..37e2970027 --- /dev/null +++ b/src/storage/src/log_index.h @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2024-present, Qihoo, Inc. All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. An additional grant + * of patent rights can be found in the PATENTS file in the same directory. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/listener.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/types.h" + +#include "storage/storage_define.h" + +namespace storage { + +using rocksdb::SequenceNumber; +class Redis; + +class LogIndexAndSequencePair { + public: + LogIndexAndSequencePair(LogIndex applied_log_index, SequenceNumber seqno) + : applied_log_index_(applied_log_index), seqno_(seqno) {} + + void SetAppliedLogIndex(LogIndex applied_log_index) { applied_log_index_ = applied_log_index; } + void SetSequenceNumber(SequenceNumber seqno) { seqno_ = seqno; } + + LogIndex GetAppliedLogIndex() const { return applied_log_index_; } + SequenceNumber GetSequenceNumber() const { return seqno_; } + + private: + LogIndex applied_log_index_ = 0; + SequenceNumber seqno_ = 0; +}; + +struct LogIndexSeqnoPair { + std::atomic log_index = 0; + std::atomic seqno = 0; + + LogIndex GetLogIndex() const { return log_index.load(); } + + SequenceNumber GetSequenceNumber() const { return seqno.load(); } + + void SetLogIndexSeqnoPair(LogIndex l, SequenceNumber s) { + log_index.store(l); + seqno.store(s); + } + + LogIndexSeqnoPair() = default; + + // Copy constructor + LogIndexSeqnoPair(const LogIndexSeqnoPair& other) + : log_index(other.log_index.load()), seqno(other.seqno.load()) {} + + // Copy assignment operator + LogIndexSeqnoPair& operator=(const LogIndexSeqnoPair& other) { + if (this != &other) { + log_index.store(other.log_index.load()); + seqno.store(other.seqno.load()); + } + return *this; + } + + bool operator==(const LogIndexSeqnoPair &other) const { return seqno.load() == other.seqno.load(); } + + bool operator<=(const LogIndexSeqnoPair &other) const { return seqno.load() <= other.seqno.load(); } + + bool operator>=(const LogIndexSeqnoPair &other) const { return seqno.load() >= other.seqno.load(); } + + bool operator<(const LogIndexSeqnoPair &other) const { 
return seqno.load() < other.seqno.load(); } +}; + +class LogIndexOfColumnFamilies { + struct LogIndexPair { + LogIndexSeqnoPair applied_index; // newest record in memtable. + LogIndexSeqnoPair flushed_index; // newest record in sst file. + }; + + struct SmallestIndexRes { + int smallest_applied_log_index_cf = -1; + LogIndex smallest_applied_log_index = std::numeric_limits::max(); + + int smallest_flushed_log_index_cf = -1; + LogIndex smallest_flushed_log_index = std::numeric_limits::max(); + SequenceNumber smallest_flushed_seqno = std::numeric_limits::max(); + }; + + public: + // Read the largest log index of each column family from all sst files + rocksdb::Status Init(Redis *db); + + SmallestIndexRes GetSmallestLogIndex(int flush_cf) const; + + void SetFlushedLogIndex(size_t cf_id, LogIndex log_index, SequenceNumber seqno) { + if (cf_id >= cf_.size()) return; + cf_[cf_id].flushed_index.log_index.store(std::max(cf_[cf_id].flushed_index.log_index.load(), log_index)); + cf_[cf_id].flushed_index.seqno.store(std::max(cf_[cf_id].flushed_index.seqno.load(), seqno)); + } + + void SetFlushedLogIndexGlobal(LogIndex log_index, SequenceNumber seqno) { + SetLastFlushIndex(log_index, seqno); + for (size_t i = 0; i < cf_.size(); i++) { + if (cf_[i].flushed_index <= last_flush_index_) { + auto flush_log_index = std::max(cf_[i].flushed_index.GetLogIndex(), last_flush_index_.GetLogIndex()); + auto flush_sequence_number = + std::max(cf_[i].flushed_index.GetSequenceNumber(), last_flush_index_.GetSequenceNumber()); + cf_[i].flushed_index.SetLogIndexSeqnoPair(flush_log_index, flush_sequence_number); + } + } + } + + bool IsApplied(size_t cf_id, LogIndex cur_log_index) const { + if (cf_id >= cf_.size()) return false; + return cur_log_index < cf_[cf_id].applied_index.GetLogIndex(); + } + + void Update(size_t cf_id, LogIndex cur_log_index, SequenceNumber cur_seqno) { + if (cf_id >= cf_.size()) return; + if (cf_[cf_id].flushed_index <= last_flush_index_ && cf_[cf_id].flushed_index == 
cf_[cf_id].applied_index) { + auto flush_log_index = std::max(cf_[cf_id].flushed_index.GetLogIndex(), last_flush_index_.GetLogIndex()); + auto flush_sequence_number = + std::max(cf_[cf_id].flushed_index.GetSequenceNumber(), last_flush_index_.GetSequenceNumber()); + cf_[cf_id].flushed_index.SetLogIndexSeqnoPair(flush_log_index, flush_sequence_number); + } + + cf_[cf_id].applied_index.SetLogIndexSeqnoPair(cur_log_index, cur_seqno); + } + + bool IsPendingFlush() const; + + size_t GetPendingFlushGap() const; + + void SetLastFlushIndex(LogIndex flushed_logindex, SequenceNumber flushed_seqno) { + auto lastest_flush_log_index = std::max(last_flush_index_.GetLogIndex(), flushed_logindex); + auto lastest_flush_sequence_number = std::max(last_flush_index_.GetSequenceNumber(), flushed_seqno); + last_flush_index_.SetLogIndexSeqnoPair(lastest_flush_log_index, lastest_flush_sequence_number); + } + + // for gtest + LogIndexSeqnoPair &GetLastFlushIndex() { return last_flush_index_; } + + LogIndexPair &GetCFStatus(size_t cf) { return cf_[cf]; } + + private: + std::vector cf_; // Dynamic size based on actual CF count + LogIndexSeqnoPair last_flush_index_; +}; + +class LogIndexAndSequenceCollector { + public: + explicit LogIndexAndSequenceCollector(uint8_t step_length_bit = 0) { step_length_mask_ = (1 << step_length_bit) - 1; } + + // find the index of log which contain seqno or before it + LogIndex FindAppliedLogIndex(SequenceNumber seqno) const; + + // if there's a new pair, add it to list; otherwise, do nothing + void Update(LogIndex smallest_applied_log_index, SequenceNumber smallest_flush_seqno); + + // purge out dated log index after memtable flushed. + void Purge(LogIndex smallest_applied_log_index); + + // Is manual flushing required? 
+ bool IsFlushPending() const { return GetSize() >= max_gap_; } + + // for gtest + uint64_t GetSize() const { + std::shared_lock share_lock(mutex_); + return list_.size(); + } + + std::deque &GetList() { + std::shared_lock share_lock(mutex_); + return list_; + } + + public: + static std::atomic_int64_t max_gap_; + + private: + uint64_t step_length_mask_ = 0; + mutable std::shared_mutex mutex_; + std::deque list_; +}; + +class LogIndexTablePropertiesCollector : public rocksdb::TablePropertiesCollector { + public: + static constexpr std::string_view kPropertyName = "LargestLogIndex/LargestSequenceNumber"; + + explicit LogIndexTablePropertiesCollector(const LogIndexAndSequenceCollector &collector) : collector_(collector) {} + + rocksdb::Status AddUserKey(const rocksdb::Slice &key, const rocksdb::Slice &value, rocksdb::EntryType type, + SequenceNumber seq, uint64_t file_size) override { + largest_seqno_ = std::max(largest_seqno_, seq); + return rocksdb::Status::OK(); + } + rocksdb::Status Finish(rocksdb::UserCollectedProperties *properties) override { + properties->insert(Materialize()); + return rocksdb::Status::OK(); + } + const char *Name() const override { return "LogIndexTablePropertiesCollector"; } + rocksdb::UserCollectedProperties GetReadableProperties() const override { + return rocksdb::UserCollectedProperties{Materialize()}; + } + + static std::optional ReadStatsFromTableProps( + const std::shared_ptr &table_props); + + static auto GetLargestLogIndexFromTableCollection(const rocksdb::TablePropertiesCollection &collection) + -> std::optional; + + private: + std::pair Materialize() const { + if (-1 == cache_) { + cache_ = collector_.FindAppliedLogIndex(largest_seqno_); + } + return std::make_pair(static_cast(kPropertyName), + std::to_string(cache_) + "/" + std::to_string(largest_seqno_)); + } + + private: + const LogIndexAndSequenceCollector &collector_; + SequenceNumber largest_seqno_ = 0; + mutable LogIndex cache_{-1}; +}; + +class 
LogIndexTablePropertiesCollectorFactory : public rocksdb::TablePropertiesCollectorFactory { + public: + explicit LogIndexTablePropertiesCollectorFactory(const LogIndexAndSequenceCollector &collector) + : collector_(collector) {} + ~LogIndexTablePropertiesCollectorFactory() override = default; + + rocksdb::TablePropertiesCollector *CreateTablePropertiesCollector( + [[maybe_unused]] rocksdb::TablePropertiesCollectorFactory::Context context) override { + return new LogIndexTablePropertiesCollector(collector_); + } + const char *Name() const override { return "LogIndexTablePropertiesCollectorFactory"; } + + private: + const LogIndexAndSequenceCollector &collector_; +}; + +class LogIndexAndSequenceCollectorPurger : public rocksdb::EventListener { + public: + explicit LogIndexAndSequenceCollectorPurger(std::vector *column_families, + LogIndexAndSequenceCollector *collector, LogIndexOfColumnFamilies *cf, + std::function callback) + : column_families_(column_families), collector_(collector), cf_(cf), callback_(callback) {} + + void OnFlushCompleted(rocksdb::DB *db, const rocksdb::FlushJobInfo &flush_job_info) override; + + private: + std::vector *column_families_ = nullptr; + LogIndexAndSequenceCollector *collector_ = nullptr; + LogIndexOfColumnFamilies *cf_ = nullptr; + std::atomic_uint64_t count_ = 0; + std::atomic manul_flushing_cf_ = -1; + std::function callback_; +}; + +} // namespace storage diff --git a/src/storage/src/redis.h b/src/storage/src/redis.h index 21eaa2aa94..da3214cf05 100644 --- a/src/storage/src/redis.h +++ b/src/storage/src/redis.h @@ -17,6 +17,7 @@ #include "pstd/include/env.h" #include "src/lock_mgr.h" +#include "src/log_index.h" #include "src/lru_cache.h" #include "src/mutex_impl.h" #include "storage/storage.h" @@ -31,6 +32,9 @@ class Redis { virtual ~Redis(); rocksdb::DB* GetDB() { return db_; } + Storage* GetStorage() { return storage_; } + DataType GetDataType() const { return type_; } + const rocksdb::WriteOptions& GetWriteOptions() const { 
return default_write_options_; } struct KeyStatistics { size_t window_size; @@ -141,6 +145,27 @@ class Redis { Status UpdateSpecificKeyStatistics(const std::string& key, uint64_t count); Status UpdateSpecificKeyDuration(const std::string& key, uint64_t duration); Status AddCompactKeyTaskIfNeeded(const std::string& key, uint64_t count, uint64_t duration); + + // Log index management for Raft + LogIndexAndSequenceCollector log_index_collector_; + LogIndexOfColumnFamilies log_index_of_all_cfs_; + + public: + void UpdateLogIndex(LogIndex applied_log_index, SequenceNumber seqno) { + log_index_collector_.Update(applied_log_index, seqno); + } + + void UpdateAppliedLogIndexOfColumnFamily(size_t cf_idx, LogIndex logidx, SequenceNumber seqno) { + log_index_of_all_cfs_.Update(cf_idx, logidx, seqno); + } + + bool IsApplied(size_t cf_idx, LogIndex logidx) const { + return log_index_of_all_cfs_.IsApplied(cf_idx, logidx); + } + + LogIndexOfColumnFamilies& GetLogIndexOfColumnFamilies() { return log_index_of_all_cfs_; } + + LogIndexAndSequenceCollector& GetCollector() { return log_index_collector_; } }; } // namespace storage diff --git a/src/storage/src/redis_hashes.cc b/src/storage/src/redis_hashes.cc index b885a487dd..865d9d07e8 100644 --- a/src/storage/src/redis_hashes.cc +++ b/src/storage/src/redis_hashes.cc @@ -56,15 +56,42 @@ Status RedisHashes::Open(const StorageOptions& storage_options, const std::strin meta_cf_table_ops.block_cache = rocksdb::NewLRUCache(storage_options.block_cache_size); data_cf_table_ops.block_cache = rocksdb::NewLRUCache(storage_options.block_cache_size); } + meta_cf_ops.table_factory.reset(rocksdb::NewBlockBasedTableFactory(meta_cf_table_ops)); data_cf_ops.table_factory.reset(rocksdb::NewBlockBasedTableFactory(data_cf_table_ops)); + + // Add LogIndex table properties collector for Raft + meta_cf_ops.table_properties_collector_factories.push_back( + std::make_shared(log_index_collector_)); + data_cf_ops.table_properties_collector_factories.push_back( + 
std::make_shared(log_index_collector_)); + + // Add LogIndex event listener for Raft + if (storage_options.do_snapshot_function) { + auto purger = std::make_shared( + &handles_, &log_index_collector_, &log_index_of_all_cfs_, + storage_options.do_snapshot_function); + db_ops.listeners.push_back(purger); + } std::vector column_families; // Meta CF column_families.emplace_back(rocksdb::kDefaultColumnFamilyName, meta_cf_ops); // Data CF column_families.emplace_back("data_cf", data_cf_ops); - return rocksdb::DB::Open(db_ops, db_path, column_families, &handles_, &db_); + + s = rocksdb::DB::Open(db_ops, db_path, column_families, &handles_, &db_); + if (!s.ok()) { + return s; + } + + // Initialize log index of column families + s = log_index_of_all_cfs_.Init(this); + if (!s.ok()) { + LOG(ERROR) << "Failed to init log index of column families for hashes: " << s.ToString(); + } + + return s; } Status RedisHashes::CompactRange(const rocksdb::Slice* begin, const rocksdb::Slice* end, const ColumnFamilyType& type) { @@ -189,7 +216,8 @@ Status RedisHashes::PKPatternMatchDelWithRemoveKeys(const DataType& data_type, c return s; } -Status RedisHashes::HDel(const Slice& key, const std::vector& fields, int32_t* ret) { +Status RedisHashes::HDel(const Slice& key, const std::vector& fields, int32_t* ret, + CommitCallback callback) { uint32_t statistic = 0; std::vector filtered_fields; std::unordered_set field_set; @@ -201,7 +229,7 @@ Status RedisHashes::HDel(const Slice& key, const std::vector& field } } - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); rocksdb::ReadOptions read_options; const rocksdb::Snapshot* snapshot; @@ -226,7 +254,7 @@ Status RedisHashes::HDel(const Slice& key, const std::vector& field if (s.ok()) { del_cnt++; statistic++; - batch.Delete(handles_[1], hashes_data_key.Encode()); + batch->Delete(1, hashes_data_key.Encode()); } else if (s.IsNotFound()) { continue; } else { @@ -238,7 +266,7 @@ Status RedisHashes::HDel(const Slice& key, const 
std::vector& field return Status::InvalidArgument("hash size overflow"); } parsed_hashes_meta_value.ModifyCount(-del_cnt); - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); } } else if (s.IsNotFound()) { *ret = 0; @@ -246,7 +274,7 @@ Status RedisHashes::HDel(const Slice& key, const std::vector& field } else { return s; } - s = db_->Write(default_write_options_, &batch); + s = batch->Commit(callback); UpdateSpecificKeyStatistics(key.ToString(), statistic); return s; } @@ -353,13 +381,12 @@ Status RedisHashes::HGetallWithTTL(const Slice& key, std::vector* fv return s; } -Status RedisHashes::HIncrby(const Slice& key, const Slice& field, int64_t value, int64_t* ret) { +Status RedisHashes::HIncrby(const Slice& key, const Slice& field, int64_t value, int64_t* ret, CommitCallback callback) { *ret = 0; - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); int32_t version = 0; - uint32_t statistic = 0; std::string old_value; std::string meta_value; @@ -372,10 +399,10 @@ Status RedisHashes::HIncrby(const Slice& key, const Slice& field, int64_t value, version = parsed_hashes_meta_value.UpdateVersion(); parsed_hashes_meta_value.set_count(1); parsed_hashes_meta_value.set_timestamp(0); - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); HashesDataKey hashes_data_key(key, version, field); Int64ToStr(value_buf, 32, value); - batch.Put(handles_[1], hashes_data_key.Encode(), value_buf); + batch->Put(1, hashes_data_key.Encode(), value_buf); *ret = value; } else { version = parsed_hashes_meta_value.version(); @@ -391,16 +418,15 @@ Status RedisHashes::HIncrby(const Slice& key, const Slice& field, int64_t value, } *ret = ival + value; Int64ToStr(value_buf, 32, *ret); - batch.Put(handles_[1], hashes_data_key.Encode(), value_buf); - statistic++; + batch->Put(1, hashes_data_key.Encode(), value_buf); } else if (s.IsNotFound()) { Int64ToStr(value_buf, 32, value); if 
(!parsed_hashes_meta_value.CheckModifyCount(1)){ return Status::InvalidArgument("hash size overflow"); } parsed_hashes_meta_value.ModifyCount(1); - batch.Put(handles_[0], key, meta_value); - batch.Put(handles_[1], hashes_data_key.Encode(), value_buf); + batch->Put(0, key, meta_value); + batch->Put(1, hashes_data_key.Encode(), value_buf); *ret = value; } else { return s; @@ -410,27 +436,25 @@ Status RedisHashes::HIncrby(const Slice& key, const Slice& field, int64_t value, EncodeFixed32(meta_value_buf, 1); HashesMetaValue hashes_meta_value(Slice(meta_value_buf, sizeof(int32_t))); version = hashes_meta_value.UpdateVersion(); - batch.Put(handles_[0], key, hashes_meta_value.Encode()); + batch->Put(0, key, hashes_meta_value.Encode()); HashesDataKey hashes_data_key(key, version, field); Int64ToStr(value_buf, 32, value); - batch.Put(handles_[1], hashes_data_key.Encode(), value_buf); + batch->Put(1, hashes_data_key.Encode(), value_buf); *ret = value; } else { return s; } - s = db_->Write(default_write_options_, &batch); - UpdateSpecificKeyStatistics(key.ToString(), statistic); - return s; + + return batch->Commit(callback); } -Status RedisHashes::HIncrbyfloat(const Slice& key, const Slice& field, const Slice& by, std::string* new_value) { +Status RedisHashes::HIncrbyfloat(const Slice& key, const Slice& field, const Slice& by, std::string* new_value, CommitCallback callback) { new_value->clear(); - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); int32_t version = 0; - uint32_t statistic = 0; std::string meta_value; std::string old_value_str; long double long_double_by; @@ -447,11 +471,11 @@ Status RedisHashes::HIncrbyfloat(const Slice& key, const Slice& field, const Sli version = parsed_hashes_meta_value.UpdateVersion(); parsed_hashes_meta_value.set_count(1); parsed_hashes_meta_value.set_timestamp(0); - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); HashesDataKey hashes_data_key(key, version, 
field); LongDoubleToStr(long_double_by, new_value); - batch.Put(handles_[1], hashes_data_key.Encode(), *new_value); + batch->Put(1, hashes_data_key.Encode(), *new_value); } else { version = parsed_hashes_meta_value.version(); HashesDataKey hashes_data_key(key, version, field); @@ -467,16 +491,15 @@ Status RedisHashes::HIncrbyfloat(const Slice& key, const Slice& field, const Sli if (LongDoubleToStr(total, new_value) == -1) { return Status::InvalidArgument("Overflow"); } - batch.Put(handles_[1], hashes_data_key.Encode(), *new_value); - statistic++; + batch->Put(1, hashes_data_key.Encode(), *new_value); } else if (s.IsNotFound()) { LongDoubleToStr(long_double_by, new_value); if (!parsed_hashes_meta_value.CheckModifyCount(1)){ return Status::InvalidArgument("hash size overflow"); } parsed_hashes_meta_value.ModifyCount(1); - batch.Put(handles_[0], key, meta_value); - batch.Put(handles_[1], hashes_data_key.Encode(), *new_value); + batch->Put(0, key, meta_value); + batch->Put(1, hashes_data_key.Encode(), *new_value); } else { return s; } @@ -485,17 +508,16 @@ Status RedisHashes::HIncrbyfloat(const Slice& key, const Slice& field, const Sli EncodeFixed32(meta_value_buf, 1); HashesMetaValue hashes_meta_value(Slice(meta_value_buf, sizeof(int32_t))); version = hashes_meta_value.UpdateVersion(); - batch.Put(handles_[0], key, hashes_meta_value.Encode()); + batch->Put(0, key, hashes_meta_value.Encode()); HashesDataKey hashes_data_key(key, version, field); LongDoubleToStr(long_double_by, new_value); - batch.Put(handles_[1], hashes_data_key.Encode(), *new_value); + batch->Put(1, hashes_data_key.Encode(), *new_value); } else { return s; } - s = db_->Write(default_write_options_, &batch); - UpdateSpecificKeyStatistics(key.ToString(), statistic); - return s; + + return batch->Commit(callback); } Status RedisHashes::HKeys(const Slice& key, std::vector* fields) { @@ -592,7 +614,8 @@ Status RedisHashes::HMGet(const Slice& key, const std::vector& fiel return s; } -Status 
RedisHashes::HMSet(const Slice& key, const std::vector& fvs) { +Status RedisHashes::HMSet(const Slice& key, const std::vector& fvs, + CommitCallback callback) { uint32_t statistic = 0; std::unordered_set fields; std::vector filtered_fvs; @@ -604,7 +627,7 @@ Status RedisHashes::HMSet(const Slice& key, const std::vector& fvs) } } - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); int32_t version = 0; @@ -619,10 +642,10 @@ Status RedisHashes::HMSet(const Slice& key, const std::vector& fvs) return Status::InvalidArgument("hash size overflow"); } parsed_hashes_meta_value.set_count(static_cast(filtered_fvs.size())); - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); for (const auto& fv : filtered_fvs) { HashesDataKey hashes_data_key(key, version, fv.field); - batch.Put(handles_[1], hashes_data_key.Encode(), fv.value); + batch->Put(1, hashes_data_key.Encode(), fv.value); } } else { int32_t count = 0; @@ -633,10 +656,10 @@ Status RedisHashes::HMSet(const Slice& key, const std::vector& fvs) s = db_->Get(default_read_options_, handles_[1], hashes_data_key.Encode(), &data_value); if (s.ok()) { statistic++; - batch.Put(handles_[1], hashes_data_key.Encode(), fv.value); + batch->Put(1, hashes_data_key.Encode(), fv.value); } else if (s.IsNotFound()) { count++; - batch.Put(handles_[1], hashes_data_key.Encode(), fv.value); + batch->Put(1, hashes_data_key.Encode(), fv.value); } else { return s; } @@ -645,25 +668,26 @@ Status RedisHashes::HMSet(const Slice& key, const std::vector& fvs) return Status::InvalidArgument("hash size overflow"); } parsed_hashes_meta_value.ModifyCount(count); - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); } } else if (s.IsNotFound()) { EncodeFixed32(meta_value_buf, filtered_fvs.size()); HashesMetaValue hashes_meta_value(Slice(meta_value_buf, sizeof(int32_t))); version = hashes_meta_value.UpdateVersion(); - batch.Put(handles_[0], key, 
hashes_meta_value.Encode()); + batch->Put(0, key, hashes_meta_value.Encode()); for (const auto& fv : filtered_fvs) { HashesDataKey hashes_data_key(key, version, fv.field); - batch.Put(handles_[1], hashes_data_key.Encode(), fv.value); + batch->Put(1, hashes_data_key.Encode(), fv.value); } } - s = db_->Write(default_write_options_, &batch); + s = batch->Commit(callback); UpdateSpecificKeyStatistics(key.ToString(), statistic); return s; } -Status RedisHashes::HSet(const Slice& key, const Slice& field, const Slice& value, int32_t* res) { - rocksdb::WriteBatch batch; +Status RedisHashes::HSet(const Slice& key, const Slice& field, const Slice& value, int32_t* res, + CommitCallback callback) { + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); int32_t version = 0; @@ -676,9 +700,9 @@ Status RedisHashes::HSet(const Slice& key, const Slice& field, const Slice& valu if (parsed_hashes_meta_value.IsStale() || parsed_hashes_meta_value.count() == 0) { version = parsed_hashes_meta_value.InitialMetaValue(); parsed_hashes_meta_value.set_count(1); - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); HashesDataKey data_key(key, version, field); - batch.Put(handles_[1], data_key.Encode(), value); + batch->Put(1, data_key.Encode(), value); *res = 1; } else { version = parsed_hashes_meta_value.version(); @@ -690,7 +714,7 @@ Status RedisHashes::HSet(const Slice& key, const Slice& field, const Slice& valu if (data_value == value.ToString()) { return Status::OK(); } else { - batch.Put(handles_[1], hashes_data_key.Encode(), value); + batch->Put(1, hashes_data_key.Encode(), value); statistic++; } } else if (s.IsNotFound()) { @@ -698,8 +722,8 @@ Status RedisHashes::HSet(const Slice& key, const Slice& field, const Slice& valu return Status::InvalidArgument("hash size overflow"); } parsed_hashes_meta_value.ModifyCount(1); - batch.Put(handles_[0], key, meta_value); - batch.Put(handles_[1], hashes_data_key.Encode(), value); + batch->Put(0, key, 
meta_value); + batch->Put(1, hashes_data_key.Encode(), value); *res = 1; } else { return s; @@ -709,20 +733,21 @@ Status RedisHashes::HSet(const Slice& key, const Slice& field, const Slice& valu EncodeFixed32(meta_value_buf, 1); HashesMetaValue meta_value(Slice(meta_value_buf, sizeof(int32_t))); version = meta_value.UpdateVersion(); - batch.Put(handles_[0], key, meta_value.Encode()); + batch->Put(0, key, meta_value.Encode()); HashesDataKey data_key(key, version, field); - batch.Put(handles_[1], data_key.Encode(), value); + batch->Put(1, data_key.Encode(), value); *res = 1; } else { return s; } - s = db_->Write(default_write_options_, &batch); + s = batch->Commit(callback); UpdateSpecificKeyStatistics(key.ToString(), statistic); return s; } -Status RedisHashes::HSetnx(const Slice& key, const Slice& field, const Slice& value, int32_t* ret) { - rocksdb::WriteBatch batch; +Status RedisHashes::HSetnx(const Slice& key, const Slice& field, const Slice& value, int32_t* ret, + CommitCallback callback) { + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); int32_t version = 0; @@ -734,9 +759,9 @@ Status RedisHashes::HSetnx(const Slice& key, const Slice& field, const Slice& va if (parsed_hashes_meta_value.IsStale() || parsed_hashes_meta_value.count() == 0) { version = parsed_hashes_meta_value.InitialMetaValue(); parsed_hashes_meta_value.set_count(1); - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); HashesDataKey hashes_data_key(key, version, field); - batch.Put(handles_[1], hashes_data_key.Encode(), value); + batch->Put(1, hashes_data_key.Encode(), value); *ret = 1; } else { version = parsed_hashes_meta_value.version(); @@ -750,8 +775,8 @@ Status RedisHashes::HSetnx(const Slice& key, const Slice& field, const Slice& va return Status::InvalidArgument("hash size overflow"); } parsed_hashes_meta_value.ModifyCount(1); - batch.Put(handles_[0], key, meta_value); - batch.Put(handles_[1], hashes_data_key.Encode(), value); + 
batch->Put(0, key, meta_value); + batch->Put(1, hashes_data_key.Encode(), value); *ret = 1; } else { return s; @@ -761,14 +786,14 @@ Status RedisHashes::HSetnx(const Slice& key, const Slice& field, const Slice& va EncodeFixed32(meta_value_buf, 1); HashesMetaValue hashes_meta_value(Slice(meta_value_buf, sizeof(int32_t))); version = hashes_meta_value.UpdateVersion(); - batch.Put(handles_[0], key, hashes_meta_value.Encode()); + batch->Put(0, key, hashes_meta_value.Encode()); HashesDataKey hashes_data_key(key, version, field); - batch.Put(handles_[1], hashes_data_key.Encode(), value); + batch->Put(1, hashes_data_key.Encode(), value); *ret = 1; } else { return s; } - return db_->Write(default_write_options_, &batch); + return batch->Commit(callback); } Status RedisHashes::HVals(const Slice& key, std::vector* values) { diff --git a/src/storage/src/redis_hashes.h b/src/storage/src/redis_hashes.h index cc6c7c6529..1ad02db081 100644 --- a/src/storage/src/redis_hashes.h +++ b/src/storage/src/redis_hashes.h @@ -29,19 +29,23 @@ class RedisHashes : public Redis { Status PKPatternMatchDelWithRemoveKeys(const DataType& data_type, const std::string& pattern, int64_t* ret, std::vector* remove_keys, const int64_t& max_count) override; // Hashes Commands - Status HDel(const Slice& key, const std::vector& fields, int32_t* ret); + Status HDel(const Slice& key, const std::vector& fields, int32_t* ret, + CommitCallback callback = nullptr); Status HExists(const Slice& key, const Slice& field); Status HGet(const Slice& key, const Slice& field, std::string* value); Status HGetall(const Slice& key, std::vector* fvs); Status HGetallWithTTL(const Slice& key, std::vector* fvs, int64_t* ttl); - Status HIncrby(const Slice& key, const Slice& field, int64_t value, int64_t* ret); - Status HIncrbyfloat(const Slice& key, const Slice& field, const Slice& by, std::string* new_value); + Status HIncrby(const Slice& key, const Slice& field, int64_t value, int64_t* ret, CommitCallback callback = nullptr); + 
Status HIncrbyfloat(const Slice& key, const Slice& field, const Slice& by, std::string* new_value, CommitCallback callback = nullptr); Status HKeys(const Slice& key, std::vector* fields); Status HLen(const Slice& key, int32_t* ret); Status HMGet(const Slice& key, const std::vector& fields, std::vector* vss); - Status HMSet(const Slice& key, const std::vector& fvs); - Status HSet(const Slice& key, const Slice& field, const Slice& value, int32_t* res); - Status HSetnx(const Slice& key, const Slice& field, const Slice& value, int32_t* ret); + Status HMSet(const Slice& key, const std::vector& fvs, + CommitCallback callback = nullptr); + Status HSet(const Slice& key, const Slice& field, const Slice& value, int32_t* res, + CommitCallback callback = nullptr); + Status HSetnx(const Slice& key, const Slice& field, const Slice& value, int32_t* ret, + CommitCallback callback = nullptr); Status HVals(const Slice& key, std::vector* values); Status HStrlen(const Slice& key, const Slice& field, int32_t* len); Status HScan(const Slice& key, int64_t cursor, const std::string& pattern, int64_t count, diff --git a/src/storage/src/redis_lists.cc b/src/storage/src/redis_lists.cc index 09a045a4d4..277797eaf6 100644 --- a/src/storage/src/redis_lists.cc +++ b/src/storage/src/redis_lists.cc @@ -63,15 +63,42 @@ Status RedisLists::Open(const StorageOptions& storage_options, const std::string meta_cf_table_ops.block_cache = rocksdb::NewLRUCache(storage_options.block_cache_size); data_cf_table_ops.block_cache = rocksdb::NewLRUCache(storage_options.block_cache_size); } + meta_cf_ops.table_factory.reset(rocksdb::NewBlockBasedTableFactory(meta_cf_table_ops)); data_cf_ops.table_factory.reset(rocksdb::NewBlockBasedTableFactory(data_cf_table_ops)); + + // Add LogIndex table properties collector for Raft + meta_cf_ops.table_properties_collector_factories.push_back( + std::make_shared(log_index_collector_)); + data_cf_ops.table_properties_collector_factories.push_back( + 
std::make_shared(log_index_collector_)); + + // Add LogIndex event listener for Raft + if (storage_options.do_snapshot_function) { + auto purger = std::make_shared( + &handles_, &log_index_collector_, &log_index_of_all_cfs_, + storage_options.do_snapshot_function); + db_ops.listeners.push_back(purger); + } std::vector column_families; // Meta CF column_families.emplace_back(rocksdb::kDefaultColumnFamilyName, meta_cf_ops); // Data CF column_families.emplace_back("data_cf", data_cf_ops); - return rocksdb::DB::Open(db_ops, db_path, column_families, &handles_, &db_); + + s = rocksdb::DB::Open(db_ops, db_path, column_families, &handles_, &db_); + if (!s.ok()) { + return s; + } + + // Initialize log index of column families + s = log_index_of_all_cfs_.Init(this); + if (!s.ok()) { + LOG(ERROR) << "Failed to init log index of column families for lists: " << s.ToString(); + } + + return s; } Status RedisLists::CompactRange(const rocksdb::Slice* begin, const rocksdb::Slice* end, const ColumnFamilyType& type) { @@ -230,9 +257,10 @@ Status RedisLists::LIndex(const Slice& key, int64_t index, std::string* element) } Status RedisLists::LInsert(const Slice& key, const BeforeOrAfter& before_or_after, const std::string& pivot, - const std::string& value, int64_t* ret) { + const std::string& value, int64_t* ret, + CommitCallback callback) { *ret = 0; - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); std::string meta_value; Status s = db_->Get(default_read_options_, handles_[0], key, &meta_value); @@ -286,7 +314,7 @@ Status RedisLists::LInsert(const Slice& key, const BeforeOrAfter& before_or_afte current_index = parsed_lists_meta_value.left_index(); for (const auto& node : list_nodes) { ListsDataKey lists_data_key(key, version, current_index++); - batch.Put(handles_[1], lists_data_key.Encode(), node); + batch->Put(1, lists_data_key.Encode(), node); } parsed_lists_meta_value.ModifyLeftIndex(1); } else { @@ -307,16 +335,16 @@ Status 
RedisLists::LInsert(const Slice& key, const BeforeOrAfter& before_or_afte current_index = target_index + 1; for (const auto& node : list_nodes) { ListsDataKey lists_data_key(key, version, current_index++); - batch.Put(handles_[1], lists_data_key.Encode(), node); + batch->Put(1, lists_data_key.Encode(), node); } parsed_lists_meta_value.ModifyRightIndex(1); } parsed_lists_meta_value.ModifyCount(1); - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); ListsDataKey lists_target_key(key, version, target_index); - batch.Put(handles_[1], lists_target_key.Encode(), value); + batch->Put(1, lists_target_key.Encode(), value); *ret = static_cast(parsed_lists_meta_value.count()); - return db_->Write(default_write_options_, &batch); + return batch->Commit(callback); } } } else if (s.IsNotFound()) { @@ -343,11 +371,9 @@ Status RedisLists::LLen(const Slice& key, uint64_t* len) { return s; } -Status RedisLists::LPop(const Slice& key, int64_t count, std::vector* elements) { - uint32_t statistic = 0; +Status RedisLists::LPop(const Slice& key, int64_t count, std::vector* elements, CommitCallback callback) { elements->clear(); - - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); std::string meta_value; @@ -361,36 +387,31 @@ Status RedisLists::LPop(const Slice& key, int64_t count, std::vector(parsed_lists_meta_value.count()); int32_t version = parsed_lists_meta_value.version(); - int32_t start_index = 0; auto stop_index = static_cast(count<=size?count-1:size-1); int32_t cur_index = 0; ListsDataKey lists_data_key(key, version, parsed_lists_meta_value.left_index()+1); rocksdb::Iterator* iter = db_->NewIterator(default_read_options_, handles_[1]); for (iter->Seek(lists_data_key.Encode()); iter->Valid() && cur_index <= stop_index; iter->Next(), ++cur_index) { - statistic++; elements->push_back(iter->value().ToString()); - batch.Delete(handles_[1],iter->key()); + batch->Delete(1, iter->key()); 
parsed_lists_meta_value.ModifyCount(-1); parsed_lists_meta_value.ModifyLeftIndex(-1); } - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); delete iter; } - } - if (batch.Count() != 0U) { - s = db_->Write(default_write_options_, &batch); - if (s.ok()) { - batch.Clear(); - } - UpdateSpecificKeyStatistics(key.ToString(), statistic); - } + } else { return s; } -Status RedisLists::LPush(const Slice& key, const std::vector& values, uint64_t* ret) { + return batch->Commit(callback); +} + +Status RedisLists::LPush(const Slice& key, const std::vector& values, uint64_t* ret, + CommitCallback callback) { *ret = 0; - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); uint64_t index = 0; @@ -409,9 +430,9 @@ Status RedisLists::LPush(const Slice& key, const std::vector& value parsed_lists_meta_value.ModifyLeftIndex(1); parsed_lists_meta_value.ModifyCount(1); ListsDataKey lists_data_key(key, version, index); - batch.Put(handles_[1], lists_data_key.Encode(), value); + batch->Put(1, lists_data_key.Encode(), value); } - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); *ret = parsed_lists_meta_value.count(); } else if (s.IsNotFound()) { char str[8]; @@ -422,19 +443,20 @@ Status RedisLists::LPush(const Slice& key, const std::vector& value index = lists_meta_value.left_index(); lists_meta_value.ModifyLeftIndex(1); ListsDataKey lists_data_key(key, version, index); - batch.Put(handles_[1], lists_data_key.Encode(), value); + batch->Put(1, lists_data_key.Encode(), value); } - batch.Put(handles_[0], key, lists_meta_value.Encode()); + batch->Put(0, key, lists_meta_value.Encode()); *ret = lists_meta_value.right_index() - lists_meta_value.left_index() - 1; } else { return s; } - return db_->Write(default_write_options_, &batch); + return batch->Commit(callback); } -Status RedisLists::LPushx(const Slice& key, const std::vector& values, uint64_t* len) { +Status RedisLists::LPushx(const Slice& 
key, const std::vector& values, uint64_t* len, + CommitCallback callback) { *len = 0; - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); std::string meta_value; @@ -452,11 +474,11 @@ Status RedisLists::LPushx(const Slice& key, const std::vector& valu parsed_lists_meta_value.ModifyCount(1); parsed_lists_meta_value.ModifyLeftIndex(1); ListsDataKey lists_data_key(key, version, index); - batch.Put(handles_[1], lists_data_key.Encode(), value); + batch->Put(1, lists_data_key.Encode(), value); } - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); *len = parsed_lists_meta_value.count(); - return db_->Write(default_write_options_, &batch); + return batch->Commit(callback); } } return s; @@ -575,9 +597,10 @@ Status RedisLists::LRangeWithTTL(const Slice& key, int64_t start, int64_t stop, } } -Status RedisLists::LRem(const Slice& key, int64_t count, const Slice& value, uint64_t* ret) { +Status RedisLists::LRem(const Slice& key, int64_t count, const Slice& value, uint64_t* ret, + CommitCallback callback) { *ret = 0; - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); std::string meta_value; Status s = db_->Get(default_read_options_, handles_[0], key, &meta_value); @@ -646,7 +669,7 @@ Status RedisLists::LRem(const Slice& key, int64_t count, const Slice& value, uin rest--; } else { ListsDataKey lists_data_key(key, version, left--); - batch.Put(handles_[1], lists_data_key.Encode(), iter->value()); + batch->Put(1, lists_data_key.Encode(), iter->value()); } } delete iter; @@ -666,7 +689,7 @@ Status RedisLists::LRem(const Slice& key, int64_t count, const Slice& value, uin rest--; } else { ListsDataKey lists_data_key(key, version, right++); - batch.Put(handles_[1], lists_data_key.Encode(), iter->value()); + batch->Put(1, lists_data_key.Encode(), iter->value()); } } delete iter; @@ -677,13 +700,13 @@ Status RedisLists::LRem(const Slice& key, int64_t count, 
const Slice& value, uin parsed_lists_meta_value.ModifyRightIndex(-target_index.size()); } parsed_lists_meta_value.ModifyCount(-target_index.size()); - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); for (const auto& idx : delete_index) { ListsDataKey lists_data_key(key, version, idx); - batch.Delete(handles_[1], lists_data_key.Encode()); + batch->Delete(1, lists_data_key.Encode()); } *ret = target_index.size(); - return db_->Write(default_write_options_, &batch); + return batch->Commit(callback); } } } else if (s.IsNotFound()) { @@ -692,8 +715,10 @@ Status RedisLists::LRem(const Slice& key, int64_t count, const Slice& value, uin return s; } -Status RedisLists::LSet(const Slice& key, int64_t index, const Slice& value) { +Status RedisLists::LSet(const Slice& key, int64_t index, const Slice& value, + CommitCallback callback) { uint32_t statistic = 0; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); std::string meta_value; Status s = db_->Get(default_read_options_, handles_[0], key, &meta_value); @@ -712,17 +737,18 @@ Status RedisLists::LSet(const Slice& key, int64_t index, const Slice& value) { return Status::Corruption("index out of range"); } ListsDataKey lists_data_key(key, version, target_index); - s = db_->Put(default_write_options_, handles_[1], lists_data_key.Encode(), value); + batch->Put(1, lists_data_key.Encode(), value); statistic++; UpdateSpecificKeyStatistics(key.ToString(), statistic); - return s; + return batch->Commit(callback); } } return s; } -Status RedisLists::LTrim(const Slice& key, int64_t start, int64_t stop) { - rocksdb::WriteBatch batch; +Status RedisLists::LTrim(const Slice& key, int64_t start, int64_t stop, + CommitCallback callback) { + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); uint32_t statistic = 0; @@ -744,7 +770,7 @@ Status RedisLists::LTrim(const Slice& key, int64_t start, int64_t stop) { if (sublist_left_index > sublist_right_index || 
sublist_left_index > origin_right_index || sublist_right_index < origin_left_index) { parsed_lists_meta_value.InitialMetaValue(); - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); } else { if (sublist_left_index < origin_left_index) { sublist_left_index = origin_left_index; @@ -759,32 +785,30 @@ Status RedisLists::LTrim(const Slice& key, int64_t start, int64_t stop) { parsed_lists_meta_value.ModifyLeftIndex(-(sublist_left_index - origin_left_index)); parsed_lists_meta_value.ModifyRightIndex(-(origin_right_index - sublist_right_index)); parsed_lists_meta_value.ModifyCount(-delete_node_num); - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); for (uint64_t idx = origin_left_index; idx < sublist_left_index; ++idx) { statistic++; ListsDataKey lists_data_key(key, version, idx); - batch.Delete(handles_[1], lists_data_key.Encode()); + batch->Delete(1, lists_data_key.Encode()); } for (uint64_t idx = origin_right_index; idx > sublist_right_index; --idx) { statistic++; ListsDataKey lists_data_key(key, version, idx); - batch.Delete(handles_[1], lists_data_key.Encode()); + batch->Delete(1, lists_data_key.Encode()); } } } } else { return s; } - s = db_->Write(default_write_options_, &batch); + s = batch->Commit(callback); UpdateSpecificKeyStatistics(key.ToString(), statistic); return s; } -Status RedisLists::RPop(const Slice& key, int64_t count, std::vector* elements) { - uint32_t statistic = 0; +Status RedisLists::RPop(const Slice& key, int64_t count, std::vector* elements, CommitCallback callback) { elements->clear(); - - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); std::string meta_value; @@ -798,39 +822,33 @@ Status RedisLists::RPop(const Slice& key, int64_t count, std::vector(parsed_lists_meta_value.count()); int32_t version = parsed_lists_meta_value.version(); - int32_t start_index = 0; auto stop_index = static_cast(count<=size?count-1:size-1); int32_t cur_index = 
0; ListsDataKey lists_data_key(key, version, parsed_lists_meta_value.right_index()-1); rocksdb::Iterator* iter = db_->NewIterator(default_read_options_, handles_[1]); for (iter->SeekForPrev(lists_data_key.Encode()); iter->Valid() && cur_index <= stop_index; iter->Prev(), ++cur_index) { - statistic++; elements->push_back(iter->value().ToString()); - batch.Delete(handles_[1],iter->key()); + batch->Delete(1, iter->key()); parsed_lists_meta_value.ModifyCount(-1); parsed_lists_meta_value.ModifyRightIndex(-1); } - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); delete iter; } - } - if (batch.Count() != 0U) { - s = db_->Write(default_write_options_, &batch); - if (s.ok()) { - batch.Clear(); - } - UpdateSpecificKeyStatistics(key.ToString(), statistic); - } + } else { return s; } -Status RedisLists::RPoplpush(const Slice& source, const Slice& destination, std::string* element) { + return batch->Commit(callback); +} + +Status RedisLists::RPoplpush(const Slice& source, const Slice& destination, std::string* element, CommitCallback callback) { element->clear(); - uint32_t statistic = 0; - Status s; - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); MultiScopeRecordLock l(lock_mgr_, {source.ToString(), destination.ToString()}); + + Status s; if (source.compare(destination) == 0) { std::string meta_value; s = db_->Get(default_read_options_, handles_[0], source, &meta_value); @@ -853,15 +871,12 @@ Status RedisLists::RPoplpush(const Slice& source, const Slice& destination, std: } else { uint64_t target_index = parsed_lists_meta_value.left_index(); ListsDataKey lists_target_key(source, version, target_index); - batch.Delete(handles_[1], lists_data_key.Encode()); - batch.Put(handles_[1], lists_target_key.Encode(), target); - statistic++; + batch->Delete(1, lists_data_key.Encode()); + batch->Put(1, lists_target_key.Encode(), target); parsed_lists_meta_value.ModifyRightIndex(-1); parsed_lists_meta_value.ModifyLeftIndex(1); - 
batch.Put(handles_[0], source, meta_value); - s = db_->Write(default_write_options_, &batch); - UpdateSpecificKeyStatistics(source.ToString(), statistic); - return s; + batch->Put(0, source, meta_value); + return batch->Commit(callback); } } else { return s; @@ -888,11 +903,10 @@ Status RedisLists::RPoplpush(const Slice& source, const Slice& destination, std: ListsDataKey lists_data_key(source, version, last_node_index); s = db_->Get(default_read_options_, handles_[1], lists_data_key.Encode(), &target); if (s.ok()) { - batch.Delete(handles_[1], lists_data_key.Encode()); - statistic++; + batch->Delete(1, lists_data_key.Encode()); parsed_lists_meta_value.ModifyCount(-1); parsed_lists_meta_value.ModifyRightIndex(-1); - batch.Put(handles_[0], source, source_meta_value); + batch->Put(0, source, source_meta_value); } else { return s; } @@ -912,10 +926,10 @@ Status RedisLists::RPoplpush(const Slice& source, const Slice& destination, std: } uint64_t target_index = parsed_lists_meta_value.left_index(); ListsDataKey lists_data_key(destination, version, target_index); - batch.Put(handles_[1], lists_data_key.Encode(), target); + batch->Put(1, lists_data_key.Encode(), target); parsed_lists_meta_value.ModifyCount(1); parsed_lists_meta_value.ModifyLeftIndex(1); - batch.Put(handles_[0], destination, destination_meta_value); + batch->Put(0, destination, destination_meta_value); } else if (s.IsNotFound()) { char str[8]; EncodeFixed64(str, 1); @@ -923,24 +937,21 @@ Status RedisLists::RPoplpush(const Slice& source, const Slice& destination, std: version = lists_meta_value.UpdateVersion(); uint64_t target_index = lists_meta_value.left_index(); ListsDataKey lists_data_key(destination, version, target_index); - batch.Put(handles_[1], lists_data_key.Encode(), target); + batch->Put(1, lists_data_key.Encode(), target); lists_meta_value.ModifyLeftIndex(1); - batch.Put(handles_[0], destination, lists_meta_value.Encode()); + batch->Put(0, destination, lists_meta_value.Encode()); } else { 
return s; } - s = db_->Write(default_write_options_, &batch); - UpdateSpecificKeyStatistics(source.ToString(), statistic); - if (s.ok()) { *element = target; - } - return s; + return batch->Commit(callback); } -Status RedisLists::RPush(const Slice& key, const std::vector& values, uint64_t* ret) { +Status RedisLists::RPush(const Slice& key, const std::vector& values, uint64_t* ret, + CommitCallback callback) { *ret = 0; - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); uint64_t index = 0; int32_t version = 0; @@ -958,9 +969,9 @@ Status RedisLists::RPush(const Slice& key, const std::vector& value parsed_lists_meta_value.ModifyRightIndex(1); parsed_lists_meta_value.ModifyCount(1); ListsDataKey lists_data_key(key, version, index); - batch.Put(handles_[1], lists_data_key.Encode(), value); + batch->Put(1, lists_data_key.Encode(), value); } - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); *ret = parsed_lists_meta_value.count(); } else if (s.IsNotFound()) { char str[8]; @@ -971,19 +982,20 @@ Status RedisLists::RPush(const Slice& key, const std::vector& value index = lists_meta_value.right_index(); lists_meta_value.ModifyRightIndex(1); ListsDataKey lists_data_key(key, version, index); - batch.Put(handles_[1], lists_data_key.Encode(), value); + batch->Put(1, lists_data_key.Encode(), value); } - batch.Put(handles_[0], key, lists_meta_value.Encode()); + batch->Put(0, key, lists_meta_value.Encode()); *ret = lists_meta_value.right_index() - lists_meta_value.left_index() - 1; } else { return s; } - return db_->Write(default_write_options_, &batch); + return batch->Commit(callback); } -Status RedisLists::RPushx(const Slice& key, const std::vector& values, uint64_t* len) { +Status RedisLists::RPushx(const Slice& key, const std::vector& values, uint64_t* len, + CommitCallback callback) { *len = 0; - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); std::string meta_value; @@ -1001,11 
+1013,11 @@ Status RedisLists::RPushx(const Slice& key, const std::vector& valu parsed_lists_meta_value.ModifyCount(1); parsed_lists_meta_value.ModifyRightIndex(1); ListsDataKey lists_data_key(key, version, index); - batch.Put(handles_[1], lists_data_key.Encode(), value); + batch->Put(1, lists_data_key.Encode(), value); } - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); *len = parsed_lists_meta_value.count(); - return db_->Write(default_write_options_, &batch); + return batch->Commit(callback); } } return s; diff --git a/src/storage/src/redis_lists.h b/src/storage/src/redis_lists.h index d56c5e47e7..e3041363a4 100644 --- a/src/storage/src/redis_lists.h +++ b/src/storage/src/redis_lists.h @@ -32,20 +32,28 @@ class RedisLists : public Redis { // Lists commands; Status LIndex(const Slice& key, int64_t index, std::string* element); Status LInsert(const Slice& key, const BeforeOrAfter& before_or_after, const std::string& pivot, - const std::string& value, int64_t* ret); + const std::string& value, int64_t* ret, + CommitCallback callback = nullptr); Status LLen(const Slice& key, uint64_t* len); - Status LPop(const Slice& key, int64_t count, std::vector* elements); - Status LPush(const Slice& key, const std::vector& values, uint64_t* ret); - Status LPushx(const Slice& key, const std::vector& values, uint64_t* len); + Status LPop(const Slice& key, int64_t count, std::vector* elements, CommitCallback callback = nullptr); + Status LPush(const Slice& key, const std::vector& values, uint64_t* ret, + CommitCallback callback = nullptr); + Status LPushx(const Slice& key, const std::vector& values, uint64_t* len, + CommitCallback callback = nullptr); Status LRange(const Slice& key, int64_t start, int64_t stop, std::vector* ret); - Status LRem(const Slice& key, int64_t count, const Slice& value, uint64_t* ret); + Status LRem(const Slice& key, int64_t count, const Slice& value, uint64_t* ret, + CommitCallback callback = nullptr); Status 
LRangeWithTTL(const Slice& key, int64_t start, int64_t stop, std::vector* ret, int64_t* ttl); - Status LSet(const Slice& key, int64_t index, const Slice& value); - Status LTrim(const Slice& key, int64_t start, int64_t stop); - Status RPop(const Slice& key, int64_t count, std::vector* elements); - Status RPoplpush(const Slice& source, const Slice& destination, std::string* element); - Status RPush(const Slice& key, const std::vector& values, uint64_t* ret); - Status RPushx(const Slice& key, const std::vector& values, uint64_t* len); + Status LSet(const Slice& key, int64_t index, const Slice& value, + CommitCallback callback = nullptr); + Status LTrim(const Slice& key, int64_t start, int64_t stop, + CommitCallback callback = nullptr); + Status RPop(const Slice& key, int64_t count, std::vector* elements, CommitCallback callback = nullptr); + Status RPoplpush(const Slice& source, const Slice& destination, std::string* element, CommitCallback callback = nullptr); + Status RPush(const Slice& key, const std::vector& values, uint64_t* ret, + CommitCallback callback = nullptr); + Status RPushx(const Slice& key, const std::vector& values, uint64_t* len, + CommitCallback callback = nullptr); Status PKScanRange(const Slice& key_start, const Slice& key_end, const Slice& pattern, int32_t limit, std::vector* keys, std::string* next_key); Status PKRScanRange(const Slice& key_start, const Slice& key_end, const Slice& pattern, int32_t limit, diff --git a/src/storage/src/redis_sets.cc b/src/storage/src/redis_sets.cc index 5707e032d4..6eb757f26c 100644 --- a/src/storage/src/redis_sets.cc +++ b/src/storage/src/redis_sets.cc @@ -63,15 +63,42 @@ rocksdb::Status RedisSets::Open(const StorageOptions& storage_options, const std meta_cf_table_ops.block_cache = rocksdb::NewLRUCache(storage_options.block_cache_size); member_cf_table_ops.block_cache = rocksdb::NewLRUCache(storage_options.block_cache_size); } + 
meta_cf_ops.table_factory.reset(rocksdb::NewBlockBasedTableFactory(meta_cf_table_ops)); member_cf_ops.table_factory.reset(rocksdb::NewBlockBasedTableFactory(member_cf_table_ops)); + + // Add LogIndex table properties collector for Raft + meta_cf_ops.table_properties_collector_factories.push_back( + std::make_shared(log_index_collector_)); + member_cf_ops.table_properties_collector_factories.push_back( + std::make_shared(log_index_collector_)); + + // Add LogIndex event listener for Raft + if (storage_options.do_snapshot_function) { + auto purger = std::make_shared( + &handles_, &log_index_collector_, &log_index_of_all_cfs_, + storage_options.do_snapshot_function); + db_ops.listeners.push_back(purger); + } std::vector column_families; // Meta CF column_families.emplace_back(rocksdb::kDefaultColumnFamilyName, meta_cf_ops); // Member CF column_families.emplace_back("member_cf", member_cf_ops); - return rocksdb::DB::Open(db_ops, db_path, column_families, &handles_, &db_); + + s = rocksdb::DB::Open(db_ops, db_path, column_families, &handles_, &db_); + if (!s.ok()) { + return s; + } + + // Initialize log index of column families + s = log_index_of_all_cfs_.Init(this); + if (!s.ok()) { + LOG(ERROR) << "Failed to init log index of column families for sets: " << s.ToString(); + } + + return s; } rocksdb::Status RedisSets::CompactRange(const rocksdb::Slice* begin, const rocksdb::Slice* end, const ColumnFamilyType& type) { @@ -196,7 +223,8 @@ rocksdb::Status RedisSets::PKPatternMatchDelWithRemoveKeys(const DataType& data_ return s; } -rocksdb::Status RedisSets::SAdd(const Slice& key, const std::vector& members, int32_t* ret) { +rocksdb::Status RedisSets::SAdd(const Slice& key, const std::vector& members, int32_t* ret, + CommitCallback callback) { std::unordered_set unique; std::vector filtered_members; for (const auto& member : members) { @@ -206,7 +234,7 @@ rocksdb::Status RedisSets::SAdd(const Slice& key, const std::vector } } - rocksdb::WriteBatch batch; + auto batch = 
Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); int32_t version = 0; std::string meta_value; @@ -219,10 +247,10 @@ rocksdb::Status RedisSets::SAdd(const Slice& key, const std::vector return Status::InvalidArgument("set size overflow"); } parsed_sets_meta_value.set_count(static_cast(filtered_members.size())); - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); for (const auto& member : filtered_members) { SetsMemberKey sets_member_key(key, version, member); - batch.Put(handles_[1], sets_member_key.Encode(), Slice()); + batch->Put(1, sets_member_key.Encode(), Slice()); } *ret = static_cast(filtered_members.size()); } else { @@ -235,7 +263,7 @@ rocksdb::Status RedisSets::SAdd(const Slice& key, const std::vector if (s.ok()) { } else if (s.IsNotFound()) { cnt++; - batch.Put(handles_[1], sets_member_key.Encode(), Slice()); + batch->Put(1, sets_member_key.Encode(), Slice()); } else { return s; } @@ -248,7 +276,7 @@ rocksdb::Status RedisSets::SAdd(const Slice& key, const std::vector return Status::InvalidArgument("set size overflow"); } parsed_sets_meta_value.ModifyCount(cnt); - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); } } } else if (s.IsNotFound()) { @@ -256,16 +284,16 @@ rocksdb::Status RedisSets::SAdd(const Slice& key, const std::vector EncodeFixed32(str, filtered_members.size()); SetsMetaValue sets_meta_value(Slice(str, sizeof(int32_t))); version = sets_meta_value.UpdateVersion(); - batch.Put(handles_[0], key, sets_meta_value.Encode()); + batch->Put(0, key, sets_meta_value.Encode()); for (const auto& member : filtered_members) { SetsMemberKey sets_member_key(key, version, member); - batch.Put(handles_[1], sets_member_key.Encode(), Slice()); + batch->Put(1, sets_member_key.Encode(), Slice()); } *ret = static_cast(filtered_members.size()); } else { return s; } - return db_->Write(default_write_options_, &batch); + return batch->Commit(callback); } rocksdb::Status RedisSets::SCard(const Slice& key, 
int32_t* ret) { @@ -353,12 +381,13 @@ rocksdb::Status RedisSets::SDiff(const std::vector& keys, std::vect return rocksdb::Status::OK(); } -rocksdb::Status RedisSets::SDiffstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret) { +rocksdb::Status RedisSets::SDiffstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret, + CommitCallback callback) { if (keys.empty()) { return rocksdb::Status::Corruption("SDiffsotre invalid parameter, no keys"); } - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); rocksdb::ReadOptions read_options; const rocksdb::Snapshot* snapshot; @@ -430,22 +459,22 @@ rocksdb::Status RedisSets::SDiffstore(const Slice& destination, const std::vecto return Status::InvalidArgument("set size overflow"); } parsed_sets_meta_value.set_count(static_cast(members.size())); - batch.Put(handles_[0], destination, meta_value); + batch->Put(0, destination, meta_value); } else if (s.IsNotFound()) { char str[4]; EncodeFixed32(str, members.size()); SetsMetaValue sets_meta_value(Slice(str, sizeof(int32_t))); version = sets_meta_value.UpdateVersion(); - batch.Put(handles_[0], destination, sets_meta_value.Encode()); + batch->Put(0, destination, sets_meta_value.Encode()); } else { return s; } for (const auto& member : members) { SetsMemberKey sets_member_key(destination, version, member); - batch.Put(handles_[1], sets_member_key.Encode(), Slice()); + batch->Put(1, sets_member_key.Encode(), Slice()); } *ret = static_cast(members.size()); - s = db_->Write(default_write_options_, &batch); + s = batch->Commit(callback); UpdateSpecificKeyStatistics(destination.ToString(), statistic); value_to_dest = std::move(members); return s; @@ -527,12 +556,13 @@ rocksdb::Status RedisSets::SInter(const std::vector& keys, std::vec return rocksdb::Status::OK(); } -rocksdb::Status RedisSets::SInterstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* 
ret) { +rocksdb::Status RedisSets::SInterstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret, + CommitCallback callback) { if (keys.empty()) { return rocksdb::Status::Corruption("SInterstore invalid parameter, no keys"); } - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); rocksdb::ReadOptions read_options; const rocksdb::Snapshot* snapshot; @@ -618,22 +648,22 @@ rocksdb::Status RedisSets::SInterstore(const Slice& destination, const std::vect return Status::InvalidArgument("set size overflow"); } parsed_sets_meta_value.set_count(static_cast(members.size())); - batch.Put(handles_[0], destination, meta_value); + batch->Put(0, destination, meta_value); } else if (s.IsNotFound()) { char str[4]; EncodeFixed32(str, members.size()); SetsMetaValue sets_meta_value(Slice(str, sizeof(int32_t))); version = sets_meta_value.UpdateVersion(); - batch.Put(handles_[0], destination, sets_meta_value.Encode()); + batch->Put(0, destination, sets_meta_value.Encode()); } else { return s; } for (const auto& member : members) { SetsMemberKey sets_member_key(destination, version, member); - batch.Put(handles_[1], sets_member_key.Encode(), Slice()); + batch->Put(1, sets_member_key.Encode(), Slice()); } *ret = static_cast(members.size()); - s = db_->Write(default_write_options_, &batch); + s = batch->Commit(callback); UpdateSpecificKeyStatistics(destination.ToString(), statistic); value_to_dest = std::move(members); return s; @@ -743,13 +773,11 @@ Status RedisSets::SMembersWithTTL(const Slice& key, return s; } -rocksdb::Status RedisSets::SMove(const Slice& source, const Slice& destination, const Slice& member, int32_t* ret) { +rocksdb::Status RedisSets::SMove(const Slice& source, const Slice& destination, const Slice& member, int32_t* ret, CommitCallback callback) { *ret = 0; - rocksdb::WriteBatch batch; - rocksdb::ReadOptions read_options; + auto batch = Batch::CreateBatch(this); int32_t version = 0; - uint32_t statistic = 0; 
std::string meta_value; std::vector keys{source.ToString(), destination.ToString()}; MultiScopeRecordLock ml(lock_mgr_, keys); @@ -777,9 +805,8 @@ rocksdb::Status RedisSets::SMove(const Slice& source, const Slice& destination, return Status::InvalidArgument("set size overflow"); } parsed_sets_meta_value.ModifyCount(-1); - batch.Put(handles_[0], source, meta_value); - batch.Delete(handles_[1], sets_member_key.Encode()); - statistic++; + batch->Put(0, source, meta_value); + batch->Delete(1, sets_member_key.Encode()); } else if (s.IsNotFound()) { *ret = 0; return rocksdb::Status::NotFound(); @@ -800,9 +827,9 @@ rocksdb::Status RedisSets::SMove(const Slice& source, const Slice& destination, if (parsed_sets_meta_value.IsStale() || parsed_sets_meta_value.count() == 0) { version = parsed_sets_meta_value.InitialMetaValue(); parsed_sets_meta_value.set_count(1); - batch.Put(handles_[0], destination, meta_value); + batch->Put(0, destination, meta_value); SetsMemberKey sets_member_key(destination, version, member); - batch.Put(handles_[1], sets_member_key.Encode(), Slice()); + batch->Put(1, sets_member_key.Encode(), Slice()); } else { std::string member_value; version = parsed_sets_meta_value.version(); @@ -813,8 +840,8 @@ rocksdb::Status RedisSets::SMove(const Slice& source, const Slice& destination, return Status::InvalidArgument("set size overflow"); } parsed_sets_meta_value.ModifyCount(1); - batch.Put(handles_[0], destination, meta_value); - batch.Put(handles_[1], sets_member_key.Encode(), Slice()); + batch->Put(0, destination, meta_value); + batch->Put(1, sets_member_key.Encode(), Slice()); } else if (!s.ok()) { return s; } @@ -824,25 +851,23 @@ rocksdb::Status RedisSets::SMove(const Slice& source, const Slice& destination, EncodeFixed32(str, 1); SetsMetaValue sets_meta_value(Slice(str, sizeof(int32_t))); version = sets_meta_value.UpdateVersion(); - batch.Put(handles_[0], destination, sets_meta_value.Encode()); + batch->Put(0, destination, sets_meta_value.Encode()); 
SetsMemberKey sets_member_key(destination, version, member); - batch.Put(handles_[1], sets_member_key.Encode(), Slice()); + batch->Put(1, sets_member_key.Encode(), Slice()); } else { return s; } - s = db_->Write(default_write_options_, &batch); - UpdateSpecificKeyStatistics(source.ToString(), 1); - return s; + + return batch->Commit(callback); } -rocksdb::Status RedisSets::SPop(const Slice& key, std::vector* members, int64_t cnt) { +rocksdb::Status RedisSets::SPop(const Slice& key, std::vector* members, int64_t cnt, CommitCallback callback) { std::default_random_engine engine; std::string meta_value; - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); - uint64_t start_us = pstd::NowMicros(); Status s = db_->Get(default_read_options_, handles_[0], key, &meta_value); if (s.ok()) { ParsedSetsMetaValue parsed_sets_meta_value(&meta_value); @@ -862,15 +887,13 @@ rocksdb::Status RedisSets::SPop(const Slice& key, std::vector* memb iter->Valid() && cur_index < size; iter->Next(), cur_index++) { - batch.Delete(handles_[1], iter->key()); + batch->Delete(1, iter->key()); ParsedSetsMemberKey parsed_sets_member_key(iter->key()); members->push_back(parsed_sets_member_key.member().ToString()); } - //parsed_sets_meta_value.ModifyCount(-cnt); - //batch.Put(handles_[0], key, meta_value); - batch.Delete(handles_[0], key); + batch->Delete(0, key); delete iter; } else { @@ -893,7 +916,6 @@ rocksdb::Status RedisSets::SPop(const Slice& key, std::vector* memb SetsMemberKey sets_member_key(key, version, Slice()); int64_t del_count = 0; - KeyStatisticsDurationGuard guard(this, key.ToString()); auto iter = db_->NewIterator(default_read_options_, handles_[1]); for (iter->Seek(sets_member_key.Encode()); iter->Valid() && cur_index < size; @@ -903,7 +925,7 @@ rocksdb::Status RedisSets::SPop(const Slice& key, std::vector* memb } if (sets_index.find(cur_index) != sets_index.end()) { del_count++; - batch.Delete(handles_[1], iter->key()); + 
batch->Delete(1, iter->key()); ParsedSetsMemberKey parsed_sets_member_key(iter->key()); members->push_back(parsed_sets_member_key.member().ToString()); } @@ -913,14 +935,15 @@ rocksdb::Status RedisSets::SPop(const Slice& key, std::vector* memb return Status::InvalidArgument("set size overflow"); } parsed_sets_meta_value.ModifyCount(static_cast(-cnt)); - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); delete iter; } } } else { return s; } - return db_->Write(default_write_options_, &batch); + + return batch->Commit(callback); } rocksdb::Status RedisSets::SRandmember(const Slice& key, int32_t count, std::vector* members) { @@ -992,9 +1015,10 @@ rocksdb::Status RedisSets::SRandmember(const Slice& key, int32_t count, std::vec return s; } -rocksdb::Status RedisSets::SRem(const Slice& key, const std::vector& members, int32_t* ret) { +rocksdb::Status RedisSets::SRem(const Slice& key, const std::vector& members, int32_t* ret, + CommitCallback callback) { *ret = 0; - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); int32_t version = 0; @@ -1017,7 +1041,7 @@ rocksdb::Status RedisSets::SRem(const Slice& key, const std::vector if (s.ok()) { cnt++; statistic++; - batch.Delete(handles_[1], sets_member_key.Encode()); + batch->Delete(1, sets_member_key.Encode()); } else if (s.IsNotFound()) { } else { return s; @@ -1028,7 +1052,7 @@ rocksdb::Status RedisSets::SRem(const Slice& key, const std::vector return Status::InvalidArgument("set size overflow"); } parsed_sets_meta_value.ModifyCount(-cnt); - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); } } else if (s.IsNotFound()) { *ret = 0; @@ -1036,7 +1060,7 @@ rocksdb::Status RedisSets::SRem(const Slice& key, const std::vector } else { return s; } - s = db_->Write(default_write_options_, &batch); + s = batch->Commit(callback); UpdateSpecificKeyStatistics(key.ToString(), statistic); return s; } @@ -1087,12 +1111,13 @@ 
rocksdb::Status RedisSets::SUnion(const std::vector& keys, std::vec return rocksdb::Status::OK(); } -rocksdb::Status RedisSets::SUnionstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret) { +rocksdb::Status RedisSets::SUnionstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret, + CommitCallback callback) { if (keys.empty()) { return rocksdb::Status::Corruption("SUnionstore invalid parameter, no keys"); } - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); rocksdb::ReadOptions read_options; const rocksdb::Snapshot* snapshot; @@ -1145,22 +1170,22 @@ rocksdb::Status RedisSets::SUnionstore(const Slice& destination, const std::vect return Status::InvalidArgument("set size overflow"); } parsed_sets_meta_value.set_count(static_cast(members.size())); - batch.Put(handles_[0], destination, meta_value); + batch->Put(0, destination, meta_value); } else if (s.IsNotFound()) { char str[4]; EncodeFixed32(str, members.size()); SetsMetaValue sets_meta_value(Slice(str, sizeof(int32_t))); version = sets_meta_value.UpdateVersion(); - batch.Put(handles_[0], destination, sets_meta_value.Encode()); + batch->Put(0, destination, sets_meta_value.Encode()); } else { return s; } for (const auto& member : members) { SetsMemberKey sets_member_key(destination, version, member); - batch.Put(handles_[1], sets_member_key.Encode(), Slice()); + batch->Put(1, sets_member_key.Encode(), Slice()); } *ret = static_cast(members.size()); - s = db_->Write(default_write_options_, &batch); + s = batch->Commit(callback); UpdateSpecificKeyStatistics(destination.ToString(), statistic); value_to_dest = std::move(members); return s; diff --git a/src/storage/src/redis_sets.h b/src/storage/src/redis_sets.h index 139412da59..2781039a01 100644 --- a/src/storage/src/redis_sets.h +++ b/src/storage/src/redis_sets.h @@ -31,21 +31,26 @@ class RedisSets : public Redis { Status PKPatternMatchDelWithRemoveKeys(const 
DataType& data_type, const std::string& pattern, int64_t* ret, std::vector* remove_keys, const int64_t& max_count) override; // Setes Commands - Status SAdd(const Slice& key, const std::vector& members, int32_t* ret); + Status SAdd(const Slice& key, const std::vector& members, int32_t* ret, + CommitCallback callback = nullptr); Status SCard(const Slice& key, int32_t* ret); Status SDiff(const std::vector& keys, std::vector* members); - Status SDiffstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret); + Status SDiffstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret, + CommitCallback callback = nullptr); Status SInter(const std::vector& keys, std::vector* members); - Status SInterstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret); + Status SInterstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret, + CommitCallback callback = nullptr); Status SIsmember(const Slice& key, const Slice& member, int32_t* ret); Status SMembers(const Slice& key, std::vector* members); Status SMembersWithTTL(const Slice& key, std::vector* members, int64_t* ttl); - Status SMove(const Slice& source, const Slice& destination, const Slice& member, int32_t* ret); - Status SPop(const Slice& key, std::vector* members, int64_t cnt); + Status SMove(const Slice& source, const Slice& destination, const Slice& member, int32_t* ret, CommitCallback callback = nullptr); + Status SPop(const Slice& key, std::vector* members, int64_t cnt, CommitCallback callback = nullptr); Status SRandmember(const Slice& key, int32_t count, std::vector* members); - Status SRem(const Slice& key, const std::vector& members, int32_t* ret); + Status SRem(const Slice& key, const std::vector& members, int32_t* ret, + CommitCallback callback = nullptr); Status SUnion(const std::vector& keys, std::vector* members); - Status SUnionstore(const Slice& 
destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret); + Status SUnionstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret, + CommitCallback callback = nullptr); Status SScan(const Slice& key, int64_t cursor, const std::string& pattern, int64_t count, std::vector* members, int64_t* next_cursor); Status PKScanRange(const Slice& key_start, const Slice& key_end, const Slice& pattern, int32_t limit, diff --git a/src/storage/src/redis_streams.cc b/src/storage/src/redis_streams.cc index 48578ae5b5..2a8f9367ac 100644 --- a/src/storage/src/redis_streams.cc +++ b/src/storage/src/redis_streams.cc @@ -20,12 +20,14 @@ #include "src/scope_snapshot.h" #include "storage/storage.h" #include "storage/util.h" +#include "storage/batch.h" +#include "glog/logging.h" #include "pstd/include/pstd_defer.h" namespace storage { -Status RedisStreams::XAdd(const Slice& key, const std::string& serialized_message, StreamAddTrimArgs& args) { +Status RedisStreams::XAdd(const Slice& key, const std::string& serialized_message, StreamAddTrimArgs& args, CommitCallback callback) { // With the lock, we do not need snapshot for read. // And it's bugy to use snapshot for read when we try to add message with trim. 
// such as: XADD key 1-0 field value MINID 1-0 @@ -66,11 +68,11 @@ Status RedisStreams::XAdd(const Slice& key, const std::string& serialized_messag assert(current_id > serialized_last_id); #endif + // Use batch for Raft consistency + auto batch = Batch::CreateBatch(this); + StreamDataKey stream_data_key(key, stream_meta.version(), args.id.Serialize()); - s = db_->Put(default_write_options_, handles_[1], stream_data_key.Encode(), serialized_message); - if (!s.ok()) { - return Status::Corruption("error from XADD, insert stream message failed 1: " + s.ToString()); - } + batch->Put(1, stream_data_key.Encode(), serialized_message); // 3 update stream meta if (stream_meta.length() == 0) { @@ -91,12 +93,9 @@ Status RedisStreams::XAdd(const Slice& key, const std::string& serialized_messag } // 5 update stream meta - s = db_->Put(default_write_options_, handles_[0], key, stream_meta.value()); - if (!s.ok()) { - return s; - } - - return Status::OK(); + batch->Put(0, key, stream_meta.value()); + + return batch->Commit(callback); } Status RedisStreams::XTrim(const Slice& key, StreamAddTrimArgs& args, int32_t& count) { @@ -359,15 +358,42 @@ Status RedisStreams::Open(const StorageOptions& storage_options, const std::stri meta_cf_table_ops.block_cache = rocksdb::NewLRUCache(storage_options.block_cache_size); data_cf_table_ops.block_cache = rocksdb::NewLRUCache(storage_options.block_cache_size); } + meta_cf_ops.table_factory.reset(rocksdb::NewBlockBasedTableFactory(meta_cf_table_ops)); data_cf_ops.table_factory.reset(rocksdb::NewBlockBasedTableFactory(data_cf_table_ops)); + + // Add LogIndex table properties collector for Raft + meta_cf_ops.table_properties_collector_factories.push_back( + std::make_shared(log_index_collector_)); + data_cf_ops.table_properties_collector_factories.push_back( + std::make_shared(log_index_collector_)); + + // Add LogIndex event listener for Raft + if (storage_options.do_snapshot_function) { + auto purger = std::make_shared( + &handles_, 
&log_index_collector_, &log_index_of_all_cfs_, + storage_options.do_snapshot_function); + db_ops.listeners.push_back(purger); + } std::vector column_families; // Meta CF column_families.emplace_back(rocksdb::kDefaultColumnFamilyName, meta_cf_ops); // Data CF column_families.emplace_back("data_cf", data_cf_ops); - return rocksdb::DB::Open(db_ops, db_path, column_families, &handles_, &db_); + + s = rocksdb::DB::Open(db_ops, db_path, column_families, &handles_, &db_); + if (!s.ok()) { + return s; + } + + // Initialize log index of column families + s = log_index_of_all_cfs_.Init(this); + if (!s.ok()) { + LOG(ERROR) << "Failed to init log index of column families for streams: " << s.ToString(); + } + + return s; } Status RedisStreams::CompactRange(const rocksdb::Slice* begin, const rocksdb::Slice* end, diff --git a/src/storage/src/redis_streams.h b/src/storage/src/redis_streams.h index e622a3db11..88b1078d6b 100644 --- a/src/storage/src/redis_streams.h +++ b/src/storage/src/redis_streams.h @@ -16,6 +16,7 @@ #include "rocksdb/status.h" #include "src/redis.h" #include "storage/storage.h" +#include "storage/batch.h" namespace storage { @@ -129,7 +130,7 @@ class RedisStreams : public Redis { //===--------------------------------------------------------------------===// // Commands //===--------------------------------------------------------------------===// - Status XAdd(const Slice& key, const std::string& serialized_message, StreamAddTrimArgs& args); + Status XAdd(const Slice& key, const std::string& serialized_message, StreamAddTrimArgs& args, CommitCallback callback = nullptr); Status XDel(const Slice& key, const std::vector& ids, int32_t& count); Status XTrim(const Slice& key, StreamAddTrimArgs& args, int32_t& count); Status XRange(const Slice& key, const StreamScanArgs& args, std::vector& id_messages); diff --git a/src/storage/src/redis_strings.cc b/src/storage/src/redis_strings.cc index 2a39beff6b..1043526282 100644 --- a/src/storage/src/redis_strings.cc +++ 
b/src/storage/src/redis_strings.cc @@ -14,6 +14,8 @@ #include #include +#include "storage/batch.h" // For Batch support + #include "src/scope_record_lock.h" #include "src/scope_snapshot.h" #include "src/strings_filter.h" @@ -35,9 +37,36 @@ Status RedisStrings::Open(const StorageOptions& storage_options, const std::stri table_ops.block_cache = rocksdb::NewLRUCache(storage_options.block_cache_size); } table_ops.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, true)); + ops.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_ops)); + + // Add LogIndex table properties collector for Raft + ops.table_properties_collector_factories.push_back( + std::make_shared(log_index_collector_)); + + // Add LogIndex event listener for Raft + if (storage_options.do_snapshot_function) { + auto purger = std::make_shared( + &handles_, &log_index_collector_, &log_index_of_all_cfs_, + storage_options.do_snapshot_function); + ops.listeners.push_back(purger); + } - return rocksdb::DB::Open(ops, db_path, &db_); + Status s = rocksdb::DB::Open(ops, db_path, &db_); + if (!s.ok()) { + return s; + } + + // Initialize handles for strings (default column family only) + handles_.push_back(db_->DefaultColumnFamily()); + + // Initialize log index of column families + s = log_index_of_all_cfs_.Init(this); + if (!s.ok()) { + LOG(ERROR) << "Failed to init log index of column families for strings: " << s.ToString(); + } + + return s; } Status RedisStrings::CompactRange(const rocksdb::Slice* begin, const rocksdb::Slice* end, @@ -156,18 +185,20 @@ Status RedisStrings::PKPatternMatchDelWithRemoveKeys(const DataType& data_type, return s; } -Status RedisStrings::Append(const Slice& key, const Slice& value, int32_t* ret, int32_t* expired_timestamp_sec, std::string& out_new_value) { +Status RedisStrings::Append(const Slice& key, const Slice& value, int32_t* ret, int32_t* expired_timestamp_sec, std::string& out_new_value, CommitCallback callback) { std::string old_value; *ret = 0; 
*expired_timestamp_sec = 0; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); Status s = db_->Get(default_read_options_, key, &old_value); if (s.ok()) { ParsedStringsValue parsed_strings_value(&old_value); if (parsed_strings_value.IsStale()) { *ret = static_cast(value.size()); + out_new_value = value.ToString(); StringsValue strings_value(value); - return db_->Put(default_write_options_, key, strings_value.Encode()); + batch->Put(0, key, strings_value.Encode()); } else { int32_t timestamp = parsed_strings_value.timestamp(); std::string old_user_value = parsed_strings_value.value().ToString(); @@ -176,16 +207,18 @@ Status RedisStrings::Append(const Slice& key, const Slice& value, int32_t* ret, StringsValue strings_value(new_value); strings_value.set_timestamp(timestamp); *ret = static_cast(new_value.size()); - return db_->Put(default_write_options_, key, strings_value.Encode()); *expired_timestamp_sec = timestamp; + batch->Put(0, key, strings_value.Encode()); } } else if (s.IsNotFound()) { *ret = static_cast(value.size()); out_new_value = value.ToString(); StringsValue strings_value(value); - return db_->Put(default_write_options_, key, strings_value.Encode()); + batch->Put(0, key, strings_value.Encode()); + } else { + return s; } - return s; + return batch->Commit(callback); } int GetBitCount(const unsigned char* value, int64_t bytes) { @@ -333,9 +366,10 @@ Status RedisStrings::BitOp(BitOpType op, const std::string& dest_key, const std: return db_->Put(default_write_options_, dest_key, strings_value.Encode()); } -Status RedisStrings::Decrby(const Slice& key, int64_t value, int64_t* ret) { +Status RedisStrings::Decrby(const Slice& key, int64_t value, int64_t* ret, CommitCallback callback) { std::string old_value; std::string new_value; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); Status s = db_->Get(default_read_options_, key, &old_value); if (s.ok()) { @@ -344,7 +378,7 @@ Status RedisStrings::Decrby(const 
Slice& key, int64_t value, int64_t* ret) { *ret = -value; new_value = std::to_string(*ret); StringsValue strings_value(new_value); - return db_->Put(default_write_options_, key, strings_value.Encode()); + batch->Put(0, key, strings_value.Encode()); } else { int32_t timestamp = parsed_strings_value.timestamp(); std::string old_user_value = parsed_strings_value.value().ToString(); @@ -361,16 +395,17 @@ Status RedisStrings::Decrby(const Slice& key, int64_t value, int64_t* ret) { new_value = std::to_string(*ret); StringsValue strings_value(new_value); strings_value.set_timestamp(timestamp); - return db_->Put(default_write_options_, key, strings_value.Encode()); + batch->Put(0, key, strings_value.Encode()); } } else if (s.IsNotFound()) { *ret = -value; new_value = std::to_string(*ret); StringsValue strings_value(new_value); - return db_->Put(default_write_options_, key, strings_value.Encode()); + batch->Put(0, key, strings_value.Encode()); } else { return s; } + return batch->Commit(callback); } Status RedisStrings::Get(const Slice& key, std::string* value) { @@ -526,7 +561,7 @@ Status RedisStrings::GetrangeWithValue(const Slice& key, int64_t start_offset, i return s; } -Status RedisStrings::GetSet(const Slice& key, const Slice& value, std::string* old_value) { +Status RedisStrings::GetSet(const Slice& key, const Slice& value, std::string* old_value, CommitCallback callback) { ScopeRecordLock l(lock_mgr_, key); Status s = db_->Get(default_read_options_, key, old_value); if (s.ok()) { @@ -538,15 +573,22 @@ Status RedisStrings::GetSet(const Slice& key, const Slice& value, std::string* o } } else if (!s.IsNotFound()) { return s; + } else if (s.IsNotFound()) { + *old_value = ""; } + StringsValue strings_value(value); - return db_->Put(default_write_options_, key, strings_value.Encode()); + auto batch = Batch::CreateBatch(this); + batch->Put(0, key, strings_value.Encode()); + return batch->Commit(callback); } -Status RedisStrings::Incrby(const Slice& key, int64_t value, 
int64_t* ret, int32_t* expired_timestamp_sec) { +Status RedisStrings::Incrby(const Slice& key, int64_t value, int64_t* ret, int32_t* expired_timestamp_sec, + CommitCallback callback) { std::string old_value; std::string new_value; *expired_timestamp_sec = 0; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); Status s = db_->Get(default_read_options_, key, &old_value); char buf[32] = {0}; @@ -556,7 +598,7 @@ Status RedisStrings::Incrby(const Slice& key, int64_t value, int64_t* ret, int32 *ret = value; Int64ToStr(buf, 32, value); StringsValue strings_value(buf); - return db_->Put(default_write_options_, key, strings_value.Encode()); + batch->Put(0, key, strings_value.Encode()); } else { int32_t timestamp = parsed_strings_value.timestamp(); std::string old_user_value = parsed_strings_value.value().ToString(); @@ -572,7 +614,7 @@ Status RedisStrings::Incrby(const Slice& key, int64_t value, int64_t* ret, int32 new_value = std::to_string(*ret); StringsValue strings_value(new_value); strings_value.set_timestamp(timestamp); - return db_->Put(default_write_options_, key, strings_value.Encode()); + batch->Put(0, key, strings_value.Encode()); *expired_timestamp_sec = timestamp; } } else if (s.IsNotFound()) { @@ -580,13 +622,15 @@ Status RedisStrings::Incrby(const Slice& key, int64_t value, int64_t* ret, int32 Int64ToStr(buf, 32, value); StringsValue strings_value(buf); *expired_timestamp_sec = 0; - return db_->Put(default_write_options_, key, strings_value.Encode()); + batch->Put(0, key, strings_value.Encode()); } else { return s; } + return batch->Commit(callback); } -Status RedisStrings::Incrbyfloat(const Slice& key, const Slice& value, std::string* ret, int32_t* expired_timestamp_sec) { +Status RedisStrings::Incrbyfloat(const Slice& key, const Slice& value, std::string* ret, int32_t* expired_timestamp_sec, + CommitCallback callback) { std::string old_value; std::string new_value; *expired_timestamp_sec = 0; @@ -594,6 +638,7 @@ Status 
RedisStrings::Incrbyfloat(const Slice& key, const Slice& value, std::stri if (StrToLongDouble(value.data(), value.size(), &long_double_by) == -1) { return Status::Corruption("Value is not a vaild float"); } + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); Status s = db_->Get(default_read_options_, key, &old_value); if (s.ok()) { @@ -602,7 +647,7 @@ Status RedisStrings::Incrbyfloat(const Slice& key, const Slice& value, std::stri LongDoubleToStr(long_double_by, &new_value); *ret = new_value; StringsValue strings_value(new_value); - return db_->Put(default_write_options_, key, strings_value.Encode()); + batch->Put(0, key, strings_value.Encode()); } else { int32_t timestamp = parsed_strings_value.timestamp(); std::string old_user_value = parsed_strings_value.value().ToString(); @@ -618,7 +663,7 @@ Status RedisStrings::Incrbyfloat(const Slice& key, const Slice& value, std::stri *ret = new_value; StringsValue strings_value(new_value); strings_value.set_timestamp(timestamp); - return db_->Put(default_write_options_, key, strings_value.Encode()); + batch->Put(0, key, strings_value.Encode()); *expired_timestamp_sec = timestamp; } } else if (s.IsNotFound()) { @@ -626,10 +671,11 @@ Status RedisStrings::Incrbyfloat(const Slice& key, const Slice& value, std::stri *ret = new_value; StringsValue strings_value(new_value); *expired_timestamp_sec = 0; - return db_->Put(default_write_options_, key, strings_value.Encode()); + batch->Put(0, key, strings_value.Encode()); } else { return s; } + return batch->Commit(callback); } Status RedisStrings::MGet(const std::vector& keys, std::vector* vss) { @@ -696,23 +742,23 @@ Status RedisStrings::MGetWithTTL(const std::vector& keys, std::vect return Status::OK(); } -Status RedisStrings::MSet(const std::vector& kvs) { +Status RedisStrings::MSet(const std::vector& kvs, CommitCallback callback) { std::vector keys; keys.reserve(kvs.size()); for (const auto& kv : kvs) { keys.push_back(kv.key); } + auto batch = 
Batch::CreateBatch(this); MultiScopeRecordLock ml(lock_mgr_, keys); - rocksdb::WriteBatch batch; for (const auto& kv : kvs) { StringsValue strings_value(kv.value); - batch.Put(kv.key, strings_value.Encode()); + batch->Put(0, kv.key, strings_value.Encode()); } - return db_->Write(default_write_options_, &batch); + return batch->Commit(callback); } -Status RedisStrings::MSetnx(const std::vector& kvs, int32_t* ret) { +Status RedisStrings::MSetnx(const std::vector& kvs, int32_t* ret, CommitCallback callback) { Status s; bool exists = false; *ret = 0; @@ -728,7 +774,7 @@ Status RedisStrings::MSetnx(const std::vector& kvs, int32_t* ret) { } } if (!exists) { - s = MSet(kvs); + s = MSet(kvs, callback); if (s.ok()) { *ret = 1; } @@ -736,13 +782,18 @@ Status RedisStrings::MSetnx(const std::vector& kvs, int32_t* ret) { return s; } -Status RedisStrings::Set(const Slice& key, const Slice& value) { +Status RedisStrings::Set(const Slice& key, const Slice& value, + CommitCallback callback) { StringsValue strings_value(value); + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); - return db_->Put(default_write_options_, key, strings_value.Encode()); + + batch->Put(0, key, strings_value.Encode()); + return batch->Commit(callback); } -Status RedisStrings::Setxx(const Slice& key, const Slice& value, int32_t* ret, const int32_t ttl) { +Status RedisStrings::Setxx(const Slice& key, const Slice& value, int32_t* ret, const int32_t ttl, + CommitCallback callback) { bool not_found = true; std::string old_value; StringsValue strings_value(value); @@ -765,11 +816,14 @@ Status RedisStrings::Setxx(const Slice& key, const Slice& value, int32_t* ret, c if (ttl > 0) { strings_value.SetRelativeTimestamp(ttl); } - return db_->Put(default_write_options_, key, strings_value.Encode()); + // Use batch for consistency + auto batch = Batch::CreateBatch(this); + batch->Put(0, key, strings_value.Encode()); + return batch->Commit(callback); } } -Status RedisStrings::SetBit(const Slice& 
key, int64_t offset, int32_t on, int32_t* ret) { +Status RedisStrings::SetBit(const Slice& key, int64_t offset, int32_t on, int32_t* ret, CommitCallback callback) { std::string meta_value; if (offset < 0) { return Status::InvalidArgument("offset < 0"); @@ -811,13 +865,16 @@ Status RedisStrings::SetBit(const Slice& key, int64_t offset, int32_t on, int32_ } StringsValue strings_value(data_value); strings_value.set_timestamp(timestamp); - return db_->Put(rocksdb::WriteOptions(), key, strings_value.Encode()); + auto batch = Batch::CreateBatch(this); + batch->Put(0, key, strings_value.Encode()); + return batch->Commit(callback); } else { return s; } } -Status RedisStrings::Setex(const Slice& key, const Slice& value, int32_t ttl) { +Status RedisStrings::Setex(const Slice& key, const Slice& value, int32_t ttl, + CommitCallback callback) { if (ttl <= 0) { return Status::InvalidArgument("invalid expire time"); } @@ -827,10 +884,14 @@ Status RedisStrings::Setex(const Slice& key, const Slice& value, int32_t ttl) { return s; } ScopeRecordLock l(lock_mgr_, key); - return db_->Put(default_write_options_, key, strings_value.Encode()); + // Use batch for consistency + auto batch = Batch::CreateBatch(this); + batch->Put(0, key, strings_value.Encode()); + return batch->Commit(callback); } -Status RedisStrings::Setnx(const Slice& key, const Slice& value, int32_t* ret, const int32_t ttl) { +Status RedisStrings::Setnx(const Slice& key, const Slice& value, int32_t* ret, const int32_t ttl, + CommitCallback callback) { *ret = 0; std::string old_value; ScopeRecordLock l(lock_mgr_, key); @@ -842,7 +903,10 @@ Status RedisStrings::Setnx(const Slice& key, const Slice& value, int32_t* ret, c if (ttl > 0) { strings_value.SetRelativeTimestamp(ttl); } - s = db_->Put(default_write_options_, key, strings_value.Encode()); + // Use batch for consistency + auto batch = Batch::CreateBatch(this); + batch->Put(0, key, strings_value.Encode()); + s = batch->Commit(callback); if (s.ok()) { *ret = 1; } @@ 
-852,16 +916,20 @@ Status RedisStrings::Setnx(const Slice& key, const Slice& value, int32_t* ret, c if (ttl > 0) { strings_value.SetRelativeTimestamp(ttl); } - s = db_->Put(default_write_options_, key, strings_value.Encode()); + // Use batch for consistency + auto batch = Batch::CreateBatch(this); + batch->Put(0, key, strings_value.Encode()); + s = batch->Commit(callback); if (s.ok()) { *ret = 1; } + } return s; } Status RedisStrings::Setvx(const Slice& key, const Slice& value, const Slice& new_value, int32_t* ret, - const int32_t ttl) { + const int32_t ttl, CommitCallback callback) { *ret = 0; std::string old_value; ScopeRecordLock l(lock_mgr_, key); @@ -876,7 +944,9 @@ Status RedisStrings::Setvx(const Slice& key, const Slice& value, const Slice& ne if (ttl > 0) { strings_value.SetRelativeTimestamp(ttl); } - s = db_->Put(default_write_options_, key, strings_value.Encode()); + auto batch = Batch::CreateBatch(this); + batch->Put(0, key, strings_value.Encode()); + s = batch->Commit(callback); if (!s.ok()) { return s; } @@ -893,7 +963,7 @@ Status RedisStrings::Setvx(const Slice& key, const Slice& value, const Slice& ne return Status::OK(); } -Status RedisStrings::Delvx(const Slice& key, const Slice& value, int32_t* ret) { +Status RedisStrings::Delvx(const Slice& key, const Slice& value, int32_t* ret, CommitCallback callback) { *ret = 0; std::string old_value; ScopeRecordLock l(lock_mgr_, key); @@ -906,7 +976,9 @@ Status RedisStrings::Delvx(const Slice& key, const Slice& value, int32_t* ret) { } else { if (value.compare(parsed_strings_value.value()) == 0) { *ret = 1; - return db_->Delete(default_write_options_, key); + auto batch = Batch::CreateBatch(this); + batch->Delete(0, key); + return batch->Commit(callback); } else { *ret = -1; } @@ -917,13 +989,14 @@ Status RedisStrings::Delvx(const Slice& key, const Slice& value, int32_t* ret) { return s; } -Status RedisStrings::Setrange(const Slice& key, int64_t start_offset, const Slice& value, int32_t* ret) { +Status 
RedisStrings::Setrange(const Slice& key, int64_t start_offset, const Slice& value, int32_t* ret, CommitCallback callback) { std::string old_value; std::string new_value; if (start_offset < 0) { return Status::InvalidArgument("offset < 0"); } + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); Status s = db_->Get(default_read_options_, key, &old_value); if (s.ok()) { @@ -951,15 +1024,17 @@ Status RedisStrings::Setrange(const Slice& key, int64_t start_offset, const Slic *ret = static_cast(new_value.length()); StringsValue strings_value(new_value); strings_value.set_timestamp(timestamp); - return db_->Put(default_write_options_, key, strings_value.Encode()); + batch->Put(0, key, strings_value.Encode()); } else if (s.IsNotFound()) { std::string tmp(start_offset, '\0'); new_value = tmp.append(value.data()); *ret = static_cast(new_value.length()); StringsValue strings_value(new_value); - return db_->Put(default_write_options_, key, strings_value.Encode()); + batch->Put(0, key, strings_value.Encode()); + } else { + return s; } - return s; + return batch->Commit(callback); } Status RedisStrings::Strlen(const Slice& key, int32_t* len) { @@ -1159,11 +1234,13 @@ Status RedisStrings::BitPos(const Slice& key, int32_t bit, int64_t start_offset, return Status::OK(); } -Status RedisStrings::PKSetexAt(const Slice& key, const Slice& value, int32_t timestamp) { +Status RedisStrings::PKSetexAt(const Slice& key, const Slice& value, int32_t timestamp, CommitCallback callback) { StringsValue strings_value(value); ScopeRecordLock l(lock_mgr_, key); strings_value.set_timestamp(timestamp); - return db_->Put(default_write_options_, key, strings_value.Encode()); + auto batch = Batch::CreateBatch(this); + batch->Put(0, key, strings_value.Encode()); + return batch->Commit(callback); } Status RedisStrings::PKScanRange(const Slice& key_start, const Slice& key_end, const Slice& pattern, int32_t limit, diff --git a/src/storage/src/redis_strings.h 
b/src/storage/src/redis_strings.h index d0365cc6ae..fe2d08d63c 100644 --- a/src/storage/src/redis_strings.h +++ b/src/storage/src/redis_strings.h @@ -27,36 +27,40 @@ class RedisStrings : public Redis { Status ScanKeys(const std::string& pattern, std::vector* keys) override; Status PKPatternMatchDelWithRemoveKeys(const DataType& data_type, const std::string& pattern, int64_t* ret, std::vector* remove_keys, const int64_t& max_count) override; // Strings Command - Status Append(const Slice& key, const Slice& value, int32_t* ret, int32_t* expired_timestamp_sec, std::string& out_new_value); + Status Append(const Slice& key, const Slice& value, int32_t* ret, int32_t* expired_timestamp_sec, std::string& out_new_value, CommitCallback callback = nullptr); Status BitCount(const Slice& key, int64_t start_offset, int64_t end_offset, int32_t* ret, bool have_range); Status BitOp(BitOpType op, const std::string& dest_key, const std::vector& src_keys, std::string &value_to_dest, int64_t* ret); - Status Decrby(const Slice& key, int64_t value, int64_t* ret); + Status Decrby(const Slice& key, int64_t value, int64_t* ret, CommitCallback callback = nullptr); Status Get(const Slice& key, std::string* value); Status GetWithTTL(const Slice& key, std::string* value, int64_t* ttl); Status GetBit(const Slice& key, int64_t offset, int32_t* ret); Status Getrange(const Slice& key, int64_t start_offset, int64_t end_offset, std::string* ret); Status GetrangeWithValue(const Slice& key, int64_t start_offset, int64_t end_offset, std::string* ret, std::string* value, int64_t* ttl); - Status GetSet(const Slice& key, const Slice& value, std::string* old_value); - Status Incrby(const Slice& key, int64_t value, int64_t* ret, int32_t* expired_timestamp_sec); - Status Incrbyfloat(const Slice& key, const Slice& value, std::string* ret, int32_t* expired_timestamp_sec); + Status GetSet(const Slice& key, const Slice& value, std::string* old_value, CommitCallback callback = nullptr); + Status Incrby(const 
Slice& key, int64_t value, int64_t* ret, int32_t* expired_timestamp_sec, CommitCallback callback = nullptr); + Status Incrbyfloat(const Slice& key, const Slice& value, std::string* ret, int32_t* expired_timestamp_sec, CommitCallback callback = nullptr); Status MGet(const std::vector& keys, std::vector* vss); Status MGetWithTTL(const std::vector& keys, std::vector* vss); - Status MSet(const std::vector& kvs); - Status MSetnx(const std::vector& kvs, int32_t* ret); - Status Set(const Slice& key, const Slice& value); - Status Setxx(const Slice& key, const Slice& value, int32_t* ret, int32_t ttl = 0); - Status SetBit(const Slice& key, int64_t offset, int32_t value, int32_t* ret); - Status Setex(const Slice& key, const Slice& value, int32_t ttl); - Status Setnx(const Slice& key, const Slice& value, int32_t* ret, int32_t ttl = 0); - Status Setvx(const Slice& key, const Slice& value, const Slice& new_value, int32_t* ret, int32_t ttl = 0); - Status Delvx(const Slice& key, const Slice& value, int32_t* ret); - Status Setrange(const Slice& key, int64_t start_offset, const Slice& value, int32_t* ret); + Status MSet(const std::vector& kvs, CommitCallback callback = nullptr); + Status MSetnx(const std::vector& kvs, int32_t* ret, CommitCallback callback = nullptr); + Status Set(const Slice& key, const Slice& value, + CommitCallback callback = nullptr); + Status Setxx(const Slice& key, const Slice& value, int32_t* ret, int32_t ttl = 0, + CommitCallback callback = nullptr); + Status SetBit(const Slice& key, int64_t offset, int32_t value, int32_t* ret, CommitCallback callback = nullptr); + Status Setex(const Slice& key, const Slice& value, int32_t ttl, + CommitCallback callback = nullptr); + Status Setnx(const Slice& key, const Slice& value, int32_t* ret, int32_t ttl = 0, + CommitCallback callback = nullptr); + Status Setvx(const Slice& key, const Slice& value, const Slice& new_value, int32_t* ret, int32_t ttl = 0, CommitCallback callback = nullptr); + Status Delvx(const Slice& key, 
const Slice& value, int32_t* ret, CommitCallback callback = nullptr); + Status Setrange(const Slice& key, int64_t start_offset, const Slice& value, int32_t* ret, CommitCallback callback = nullptr); Status Strlen(const Slice& key, int32_t* len); Status BitPos(const Slice& key, int32_t bit, int64_t* ret); Status BitPos(const Slice& key, int32_t bit, int64_t start_offset, int64_t* ret); Status BitPos(const Slice& key, int32_t bit, int64_t start_offset, int64_t end_offset, int64_t* ret); - Status PKSetexAt(const Slice& key, const Slice& value, int32_t timestamp); + Status PKSetexAt(const Slice& key, const Slice& value, int32_t timestamp, CommitCallback callback = nullptr); Status PKScanRange(const Slice& key_start, const Slice& key_end, const Slice& pattern, int32_t limit, std::vector* kvs, std::string* next_key); Status PKRScanRange(const Slice& key_start, const Slice& key_end, const Slice& pattern, int32_t limit, diff --git a/src/storage/src/redis_zsets.cc b/src/storage/src/redis_zsets.cc index a4153c9d9a..a58777e977 100644 --- a/src/storage/src/redis_zsets.cc +++ b/src/storage/src/redis_zsets.cc @@ -18,6 +18,7 @@ #include "src/scope_snapshot.h" #include "src/zsets_filter.h" #include "storage/util.h" +#include "storage/batch.h" #include "pstd/include/pstd_defer.h" @@ -75,15 +76,44 @@ Status RedisZSets::Open(const StorageOptions& storage_options, const std::string data_cf_table_ops.block_cache = rocksdb::NewLRUCache(storage_options.block_cache_size); score_cf_table_ops.block_cache = rocksdb::NewLRUCache(storage_options.block_cache_size); } + meta_cf_ops.table_factory.reset(rocksdb::NewBlockBasedTableFactory(meta_cf_table_ops)); data_cf_ops.table_factory.reset(rocksdb::NewBlockBasedTableFactory(data_cf_table_ops)); score_cf_ops.table_factory.reset(rocksdb::NewBlockBasedTableFactory(score_cf_table_ops)); + + // Add LogIndex table properties collector for Raft + meta_cf_ops.table_properties_collector_factories.push_back( + std::make_shared(log_index_collector_)); + 
data_cf_ops.table_properties_collector_factories.push_back( + std::make_shared(log_index_collector_)); + score_cf_ops.table_properties_collector_factories.push_back( + std::make_shared(log_index_collector_)); + + // Add LogIndex event listener for Raft + if (storage_options.do_snapshot_function) { + auto purger = std::make_shared( + &handles_, &log_index_collector_, &log_index_of_all_cfs_, + storage_options.do_snapshot_function); + db_ops.listeners.push_back(purger); + } std::vector column_families; column_families.emplace_back(rocksdb::kDefaultColumnFamilyName, meta_cf_ops); column_families.emplace_back("data_cf", data_cf_ops); column_families.emplace_back("score_cf", score_cf_ops); - return rocksdb::DB::Open(db_ops, db_path, column_families, &handles_, &db_); + + s = rocksdb::DB::Open(db_ops, db_path, column_families, &handles_, &db_); + if (!s.ok()) { + return s; + } + + // Initialize log index of column families + s = log_index_of_all_cfs_.Init(this); + if (!s.ok()) { + LOG(ERROR) << "Failed to init log index of column families for zsets: " << s.ToString(); + } + + return s; } Status RedisZSets::CompactRange(const rocksdb::Slice* begin, const rocksdb::Slice* end, const ColumnFamilyType& type) { @@ -211,11 +241,11 @@ Status RedisZSets::PKPatternMatchDelWithRemoveKeys(const DataType& data_type, co return s; } -Status RedisZSets::ZPopMax(const Slice& key, const int64_t count, std::vector* score_members) { - uint32_t statistic = 0; +Status RedisZSets::ZPopMax(const Slice& key, const int64_t count, std::vector* score_members, CommitCallback callback) { score_members->clear(); - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); + std::string meta_value; Status s = db_->Get(default_read_options_, handles_[0], key, &meta_value); if (s.ok()) { @@ -229,7 +259,6 @@ Status RedisZSets::ZPopMax(const Slice& key, const int64_t count, std::vector::max(), Slice()); - KeyStatisticsDurationGuard guard(this, key.ToString()); 
rocksdb::Iterator* iter = db_->NewIterator(default_read_options_, handles_[2]); int32_t del_cnt = 0; for (iter->SeekForPrev(zsets_score_key.Encode()); iter->Valid() && del_cnt < num; iter->Prev()) { @@ -237,31 +266,28 @@ Status RedisZSets::ZPopMax(const Slice& key, const int64_t count, std::vectoremplace_back( ScoreMember{parsed_zsets_score_key.score(), parsed_zsets_score_key.member().ToString()}); ZSetsMemberKey zsets_member_key(key, version, parsed_zsets_score_key.member()); - ++statistic; ++del_cnt; - batch.Delete(handles_[1], zsets_member_key.Encode()); - batch.Delete(handles_[2], iter->key()); + batch->Delete(1, zsets_member_key.Encode()); + batch->Delete(2, iter->key()); } delete iter; if (!parsed_zsets_meta_value.CheckModifyCount(-del_cnt)){ return Status::InvalidArgument("zset size overflow"); } parsed_zsets_meta_value.ModifyCount(-del_cnt); - batch.Put(handles_[0], key, meta_value); - s = db_->Write(default_write_options_, &batch); - UpdateSpecificKeyStatistics(key.ToString(), statistic); - return s; + batch->Put(0, key, meta_value); + return batch->Commit(callback); } } else { return s; } } -Status RedisZSets::ZPopMin(const Slice& key, const int64_t count, std::vector* score_members) { - uint32_t statistic = 0; +Status RedisZSets::ZPopMin(const Slice& key, const int64_t count, std::vector* score_members, CommitCallback callback) { score_members->clear(); - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); + std::string meta_value; Status s = db_->Get(default_read_options_, handles_[0], key, &meta_value); if (s.ok()) { @@ -275,7 +301,6 @@ Status RedisZSets::ZPopMin(const Slice& key, const int64_t count, std::vector::lowest(), Slice()); - KeyStatisticsDurationGuard guard(this, key.ToString()); rocksdb::Iterator* iter = db_->NewIterator(default_read_options_, handles_[2]); int32_t del_cnt = 0; for (iter->Seek(zsets_score_key.Encode()); iter->Valid() && del_cnt < num; iter->Next()) { @@ -283,27 +308,25 @@ 
Status RedisZSets::ZPopMin(const Slice& key, const int64_t count, std::vectoremplace_back( ScoreMember{parsed_zsets_score_key.score(), parsed_zsets_score_key.member().ToString()}); ZSetsMemberKey zsets_member_key(key, version, parsed_zsets_score_key.member()); - ++statistic; ++del_cnt; - batch.Delete(handles_[1], zsets_member_key.Encode()); - batch.Delete(handles_[2], iter->key()); + batch->Delete(1, zsets_member_key.Encode()); + batch->Delete(2, iter->key()); } delete iter; if (!parsed_zsets_meta_value.CheckModifyCount(-del_cnt)){ return Status::InvalidArgument("zset size overflow"); } parsed_zsets_meta_value.ModifyCount(-del_cnt); - batch.Put(handles_[0], key, meta_value); - s = db_->Write(default_write_options_, &batch); - UpdateSpecificKeyStatistics(key.ToString(), statistic); - return s; + batch->Put(0, key, meta_value); + return batch->Commit(callback); } } else { return s; } } -Status RedisZSets::ZAdd(const Slice& key, const std::vector& score_members, int32_t* ret) { +Status RedisZSets::ZAdd(const Slice& key, const std::vector& score_members, int32_t* ret, + CommitCallback callback) { *ret = 0; uint32_t statistic = 0; std::unordered_set unique; @@ -322,7 +345,7 @@ Status RedisZSets::ZAdd(const Slice& key, const std::vector& score_ char score_buf[8]; int32_t version = 0; std::string meta_value; - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); Status s = db_->Get(default_read_options_, handles_[0], key, &meta_value); if (s.ok()) { @@ -352,7 +375,7 @@ Status RedisZSets::ZAdd(const Slice& key, const std::vector& score_ continue; } else { ZSetsScoreKey zsets_score_key(key, version, old_score, sm.member); - batch.Delete(handles_[2], zsets_score_key.Encode()); + batch->Delete(2, zsets_score_key.Encode()); // delete old zsets_score_key and overwirte zsets_member_key // but in different column_families so we accumulative 1 statistic++; @@ -364,10 +387,10 @@ Status RedisZSets::ZAdd(const Slice& key, const 
std::vector& score_ const void* ptr_score = reinterpret_cast(&sm.score); EncodeFixed64(score_buf, *reinterpret_cast(ptr_score)); - batch.Put(handles_[1], zsets_member_key.Encode(), Slice(score_buf, sizeof(uint64_t))); + batch->Put(1, zsets_member_key.Encode(), Slice(score_buf, sizeof(uint64_t))); ZSetsScoreKey zsets_score_key(key, version, sm.score, sm.member); - batch.Put(handles_[2], zsets_score_key.Encode(), Slice()); + batch->Put(2, zsets_score_key.Encode(), Slice()); if (not_found) { cnt++; } @@ -376,28 +399,28 @@ Status RedisZSets::ZAdd(const Slice& key, const std::vector& score_ return Status::InvalidArgument("zset size overflow"); } parsed_zsets_meta_value.ModifyCount(cnt); - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); *ret = cnt; } else if (s.IsNotFound()) { char buf[4]; EncodeFixed32(buf, filtered_score_members.size()); ZSetsMetaValue zsets_meta_value(Slice(buf, sizeof(int32_t))); version = zsets_meta_value.UpdateVersion(); - batch.Put(handles_[0], key, zsets_meta_value.Encode()); + batch->Put(0, key, zsets_meta_value.Encode()); for (const auto& sm : filtered_score_members) { ZSetsMemberKey zsets_member_key(key, version, sm.member); const void* ptr_score = reinterpret_cast(&sm.score); EncodeFixed64(score_buf, *reinterpret_cast(ptr_score)); - batch.Put(handles_[1], zsets_member_key.Encode(), Slice(score_buf, sizeof(uint64_t))); + batch->Put(1, zsets_member_key.Encode(), Slice(score_buf, sizeof(uint64_t))); ZSetsScoreKey zsets_score_key(key, version, sm.score, sm.member); - batch.Put(handles_[2], zsets_score_key.Encode(), Slice()); + batch->Put(2, zsets_score_key.Encode(), Slice()); } *ret = static_cast(filtered_score_members.size()); } else { return s; } - s = db_->Write(default_write_options_, &batch); + s = batch->Commit(callback); UpdateSpecificKeyStatistics(key.ToString(), statistic); return s; } @@ -478,15 +501,15 @@ Status RedisZSets::ZCount(const Slice& key, double min, double max, bool left_cl return s; } -Status 
RedisZSets::ZIncrby(const Slice& key, const Slice& member, double increment, double* ret) { +Status RedisZSets::ZIncrby(const Slice& key, const Slice& member, double increment, double* ret, CommitCallback callback) { *ret = 0; - uint32_t statistic = 0; double score = 0; char score_buf[8]; int32_t version = 0; std::string meta_value; - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); + Status s = db_->Get(default_read_options_, handles_[0], key, &meta_value); if (s.ok()) { ParsedZSetsMetaValue parsed_zsets_meta_value(&meta_value); @@ -504,17 +527,14 @@ Status RedisZSets::ZIncrby(const Slice& key, const Slice& member, double increme double old_score = *reinterpret_cast(ptr_tmp); score = old_score + increment; ZSetsScoreKey zsets_score_key(key, version, old_score, member); - batch.Delete(handles_[2], zsets_score_key.Encode()); - // delete old zsets_score_key and overwirte zsets_member_key - // but in different column_families so we accumulative 1 - statistic++; + batch->Delete(2, zsets_score_key.Encode()); } else if (s.IsNotFound()) { score = increment; if (!parsed_zsets_meta_value.CheckModifyCount(1)){ return Status::InvalidArgument("zset size overflow"); } parsed_zsets_meta_value.ModifyCount(1); - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); } else { return s; } @@ -523,7 +543,7 @@ Status RedisZSets::ZIncrby(const Slice& key, const Slice& member, double increme EncodeFixed32(buf, 1); ZSetsMetaValue zsets_meta_value(Slice(buf, sizeof(int32_t))); version = zsets_meta_value.UpdateVersion(); - batch.Put(handles_[0], key, zsets_meta_value.Encode()); + batch->Put(0, key, zsets_meta_value.Encode()); score = increment; } else { return s; @@ -531,14 +551,13 @@ Status RedisZSets::ZIncrby(const Slice& key, const Slice& member, double increme ZSetsMemberKey zsets_member_key(key, version, member); const void* ptr_score = reinterpret_cast(&score); EncodeFixed64(score_buf, 
*reinterpret_cast(ptr_score)); - batch.Put(handles_[1], zsets_member_key.Encode(), Slice(score_buf, sizeof(uint64_t))); + batch->Put(1, zsets_member_key.Encode(), Slice(score_buf, sizeof(uint64_t))); ZSetsScoreKey zsets_score_key(key, version, score, member); - batch.Put(handles_[2], zsets_score_key.Encode(), Slice()); + batch->Put(2, zsets_score_key.Encode(), Slice()); *ret = score; - s = db_->Write(default_write_options_, &batch); - UpdateSpecificKeyStatistics(key.ToString(), statistic); - return s; + + return batch->Commit(callback); } Status RedisZSets::ZRange(const Slice& key, int32_t start, int32_t stop, std::vector* score_members) { @@ -755,7 +774,8 @@ Status RedisZSets::ZRank(const Slice& key, const Slice& member, int32_t* rank) { return s; } -Status RedisZSets::ZRem(const Slice& key, const std::vector& members, int32_t* ret) { +Status RedisZSets::ZRem(const Slice& key, const std::vector& members, int32_t* ret, + CommitCallback callback) { *ret = 0; uint32_t statistic = 0; std::unordered_set unique; @@ -768,7 +788,7 @@ Status RedisZSets::ZRem(const Slice& key, const std::vector& member } std::string meta_value; - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); Status s = db_->Get(default_read_options_, handles_[0], key, &meta_value); if (s.ok()) { @@ -790,10 +810,10 @@ Status RedisZSets::ZRem(const Slice& key, const std::vector& member uint64_t tmp = DecodeFixed64(data_value.data()); const void* ptr_tmp = reinterpret_cast(&tmp); double score = *reinterpret_cast(ptr_tmp); - batch.Delete(handles_[1], zsets_member_key.Encode()); + batch->Delete(1, zsets_member_key.Encode()); ZSetsScoreKey zsets_score_key(key, version, score, member); - batch.Delete(handles_[2], zsets_score_key.Encode()); + batch->Delete(2, zsets_score_key.Encode()); } else if (!s.IsNotFound()) { return s; } @@ -803,21 +823,22 @@ Status RedisZSets::ZRem(const Slice& key, const std::vector& member return Status::InvalidArgument("zset size 
overflow"); } parsed_zsets_meta_value.ModifyCount(-del_cnt); - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); } } else { return s; } - s = db_->Write(default_write_options_, &batch); + s = batch->Commit(callback); UpdateSpecificKeyStatistics(key.ToString(), statistic); return s; } -Status RedisZSets::ZRemrangebyrank(const Slice& key, int32_t start, int32_t stop, int32_t* ret) { +Status RedisZSets::ZRemrangebyrank(const Slice& key, int32_t start, int32_t stop, int32_t* ret, + CommitCallback callback) { *ret = 0; uint32_t statistic = 0; std::string meta_value; - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); Status s = db_->Get(default_read_options_, handles_[0], key, &meta_value); if (s.ok()) { @@ -846,8 +867,8 @@ Status RedisZSets::ZRemrangebyrank(const Slice& key, int32_t start, int32_t stop if (cur_index >= start_index) { ParsedZSetsScoreKey parsed_zsets_score_key(iter->key()); ZSetsMemberKey zsets_member_key(key, version, parsed_zsets_score_key.member()); - batch.Delete(handles_[1], zsets_member_key.Encode()); - batch.Delete(handles_[2], iter->key()); + batch->Delete(1, zsets_member_key.Encode()); + batch->Delete(2, iter->key()); del_cnt++; statistic++; } @@ -858,22 +879,22 @@ Status RedisZSets::ZRemrangebyrank(const Slice& key, int32_t start, int32_t stop return Status::InvalidArgument("zset size overflow"); } parsed_zsets_meta_value.ModifyCount(-del_cnt); - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); } } else { return s; } - s = db_->Write(default_write_options_, &batch); + s = batch->Commit(callback); UpdateSpecificKeyStatistics(key.ToString(), statistic); return s; } Status RedisZSets::ZRemrangebyscore(const Slice& key, double min, double max, bool left_close, bool right_close, - int32_t* ret) { + int32_t* ret, CommitCallback callback) { *ret = 0; uint32_t statistic = 0; std::string meta_value; - rocksdb::WriteBatch batch; + auto batch = 
Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); Status s = db_->Get(default_read_options_, handles_[0], key, &meta_value); if (s.ok()) { @@ -911,8 +932,8 @@ Status RedisZSets::ZRemrangebyscore(const Slice& key, double min, double max, bo } if (left_pass && right_pass) { ZSetsMemberKey zsets_member_key(key, version, parsed_zsets_score_key.member()); - batch.Delete(handles_[1], zsets_member_key.Encode()); - batch.Delete(handles_[2], iter->key()); + batch->Delete(1, zsets_member_key.Encode()); + batch->Delete(2, iter->key()); del_cnt++; statistic++; } @@ -926,12 +947,12 @@ Status RedisZSets::ZRemrangebyscore(const Slice& key, double min, double max, bo return Status::InvalidArgument("zset size overflow"); } parsed_zsets_meta_value.ModifyCount(-del_cnt); - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); } } else { return s; } - s = db_->Write(default_write_options_, &batch); + s = batch->Commit(callback); UpdateSpecificKeyStatistics(key.ToString(), statistic); return s; } @@ -1127,10 +1148,11 @@ Status RedisZSets::ZScore(const Slice& key, const Slice& member, double* score) } Status RedisZSets::ZUnionstore(const Slice& destination, const std::vector& keys, - const std::vector& weights, const AGGREGATE agg, std::map& value_to_dest, int32_t* ret) { + const std::vector& weights, const AGGREGATE agg, std::map& value_to_dest, int32_t* ret, + CommitCallback callback) { *ret = 0; uint32_t statistic = 0; - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); rocksdb::ReadOptions read_options; const rocksdb::Snapshot* snapshot = nullptr; @@ -1196,13 +1218,13 @@ Status RedisZSets::ZUnionstore(const Slice& destination, const std::vector(member_score_map.size())); - batch.Put(handles_[0], destination, meta_value); + batch->Put(0, destination, meta_value); } else { char buf[4]; EncodeFixed32(buf, member_score_map.size()); ZSetsMetaValue zsets_meta_value(Slice(buf, sizeof(int32_t))); version = zsets_meta_value.UpdateVersion(); - 
batch.Put(handles_[0], destination, zsets_meta_value.Encode()); + batch->Put(0, destination, zsets_meta_value.Encode()); } char score_buf[8]; @@ -1211,27 +1233,28 @@ Status RedisZSets::ZUnionstore(const Slice& destination, const std::vector(&sm.second); EncodeFixed64(score_buf, *reinterpret_cast(ptr_score)); - batch.Put(handles_[1], zsets_member_key.Encode(), Slice(score_buf, sizeof(uint64_t))); + batch->Put(1, zsets_member_key.Encode(), Slice(score_buf, sizeof(uint64_t))); ZSetsScoreKey zsets_score_key(destination, version, sm.second, sm.first); - batch.Put(handles_[2], zsets_score_key.Encode(), Slice()); + batch->Put(2, zsets_score_key.Encode(), Slice()); } *ret = static_cast(member_score_map.size()); - s = db_->Write(default_write_options_, &batch); + s = batch->Commit(callback); UpdateSpecificKeyStatistics(destination.ToString(), statistic); value_to_dest = std::move(member_score_map); return s; } Status RedisZSets::ZInterstore(const Slice& destination, const std::vector& keys, - const std::vector& weights, const AGGREGATE agg, std::vector& value_to_dest, int32_t* ret) { + const std::vector& weights, const AGGREGATE agg, std::vector& value_to_dest, int32_t* ret, + CommitCallback callback) { if (keys.empty()) { return Status::Corruption("ZInterstore invalid parameter, no keys"); } *ret = 0; uint32_t statistic = 0; - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); rocksdb::ReadOptions read_options; const rocksdb::Snapshot* snapshot = nullptr; ScopeSnapshot ss(db_, &snapshot); @@ -1327,13 +1350,13 @@ Status RedisZSets::ZInterstore(const Slice& destination, const std::vector(final_score_members.size())); - batch.Put(handles_[0], destination, meta_value); + batch->Put(0, destination, meta_value); } else { char buf[4]; EncodeFixed32(buf, final_score_members.size()); ZSetsMetaValue zsets_meta_value(Slice(buf, sizeof(int32_t))); version = zsets_meta_value.UpdateVersion(); - batch.Put(handles_[0], destination, zsets_meta_value.Encode()); + 
batch->Put(0, destination, zsets_meta_value.Encode()); } char score_buf[8]; for (const auto& sm : final_score_members) { @@ -1341,13 +1364,13 @@ Status RedisZSets::ZInterstore(const Slice& destination, const std::vector(&sm.score); EncodeFixed64(score_buf, *reinterpret_cast(ptr_score)); - batch.Put(handles_[1], zsets_member_key.Encode(), Slice(score_buf, sizeof(uint64_t))); + batch->Put(1, zsets_member_key.Encode(), Slice(score_buf, sizeof(uint64_t))); ZSetsScoreKey zsets_score_key(destination, version, sm.score, sm.member); - batch.Put(handles_[2], zsets_score_key.Encode(), Slice()); + batch->Put(2, zsets_score_key.Encode(), Slice()); } *ret = static_cast(final_score_members.size()); - s = db_->Write(default_write_options_, &batch); + s = batch->Commit(callback); UpdateSpecificKeyStatistics(destination.ToString(), statistic); value_to_dest = std::move(final_score_members); return s; @@ -1411,10 +1434,10 @@ Status RedisZSets::ZLexcount(const Slice& key, const Slice& min, const Slice& ma } Status RedisZSets::ZRemrangebylex(const Slice& key, const Slice& min, const Slice& max, bool left_close, - bool right_close, int32_t* ret) { + bool right_close, int32_t* ret, CommitCallback callback) { *ret = 0; uint32_t statistic = 0; - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); rocksdb::ReadOptions read_options; const rocksdb::Snapshot* snapshot = nullptr; @@ -1451,13 +1474,13 @@ Status RedisZSets::ZRemrangebylex(const Slice& key, const Slice& min, const Slic right_pass = true; } if (left_pass && right_pass) { - batch.Delete(handles_[1], iter->key()); + batch->Delete(1, iter->key()); uint64_t tmp = DecodeFixed64(iter->value().data()); const void* ptr_tmp = reinterpret_cast(&tmp); double score = *reinterpret_cast(ptr_tmp); ZSetsScoreKey zsets_score_key(key, version, score, member); - batch.Delete(handles_[2], zsets_score_key.Encode()); + batch->Delete(2, zsets_score_key.Encode()); del_cnt++; statistic++; } @@ -1472,13 +1495,13 @@ Status 
RedisZSets::ZRemrangebylex(const Slice& key, const Slice& min, const Slic return Status::InvalidArgument("zset size overflow"); } parsed_zsets_meta_value.ModifyCount(-del_cnt); - batch.Put(handles_[0], key, meta_value); + batch->Put(0, key, meta_value); *ret = del_cnt; } } else { return s; } - s = db_->Write(default_write_options_, &batch); + s = batch->Commit(callback); UpdateSpecificKeyStatistics(key.ToString(), statistic); return s; } diff --git a/src/storage/src/redis_zsets.h b/src/storage/src/redis_zsets.h index 1a3ccb9b72..c0d50bdb6d 100644 --- a/src/storage/src/redis_zsets.h +++ b/src/storage/src/redis_zsets.h @@ -29,42 +29,48 @@ class RedisZSets : public Redis { Status ScanKeys(const std::string& pattern, std::vector* keys) override; Status PKPatternMatchDelWithRemoveKeys(const DataType& data_type, const std::string& pattern, int64_t* ret, std::vector* remove_keys, const int64_t& max_count) override; // ZSets Commands - Status ZAdd(const Slice& key, const std::vector& score_members, int32_t* ret); + Status ZAdd(const Slice& key, const std::vector& score_members, int32_t* ret, + CommitCallback callback = nullptr); Status ZCard(const Slice& key, int32_t* card); Status ZCount(const Slice& key, double min, double max, bool left_close, bool right_close, int32_t* ret); - Status ZIncrby(const Slice& key, const Slice& member, double increment, double* ret); + Status ZIncrby(const Slice& key, const Slice& member, double increment, double* ret, CommitCallback callback = nullptr); Status ZRange(const Slice& key, int32_t start, int32_t stop, std::vector* score_members); Status ZRangeWithTTL(const Slice& key, int32_t start, int32_t stop, std::vector* score_members, int64_t* ttl); Status ZRangebyscore(const Slice& key, double min, double max, bool left_close, bool right_close, int64_t count, int64_t offset, std::vector* score_members); Status ZRank(const Slice& key, const Slice& member, int32_t* rank); - Status ZRem(const Slice& key, const std::vector& members, int32_t* 
ret); - Status ZRemrangebyrank(const Slice& key, int32_t start, int32_t stop, int32_t* ret); - Status ZRemrangebyscore(const Slice& key, double min, double max, bool left_close, bool right_close, int32_t* ret); + Status ZRem(const Slice& key, const std::vector& members, int32_t* ret, + CommitCallback callback = nullptr); + Status ZRemrangebyrank(const Slice& key, int32_t start, int32_t stop, int32_t* ret, + CommitCallback callback = nullptr); + Status ZRemrangebyscore(const Slice& key, double min, double max, bool left_close, bool right_close, int32_t* ret, + CommitCallback callback = nullptr); Status ZRevrange(const Slice& key, int32_t start, int32_t stop, std::vector* score_members); Status ZRevrangebyscore(const Slice& key, double min, double max, bool left_close, bool right_close, int64_t count, int64_t offset, std::vector* score_members); Status ZRevrank(const Slice& key, const Slice& member, int32_t* rank); Status ZScore(const Slice& key, const Slice& member, double* score); Status ZUnionstore(const Slice& destination, const std::vector& keys, const std::vector& weights, - AGGREGATE agg, std::map& value_to_dest, int32_t* ret); + AGGREGATE agg, std::map& value_to_dest, int32_t* ret, + CommitCallback callback = nullptr); Status ZInterstore(const Slice& destination, const std::vector& keys, const std::vector& weights, - AGGREGATE agg, std::vector& value_to_dest, int32_t* ret); + AGGREGATE agg, std::vector& value_to_dest, int32_t* ret, + CommitCallback callback = nullptr); Status ZRangebylex(const Slice& key, const Slice& min, const Slice& max, bool left_close, bool right_close, std::vector* members); Status ZLexcount(const Slice& key, const Slice& min, const Slice& max, bool left_close, bool right_close, int32_t* ret); Status ZRemrangebylex(const Slice& key, const Slice& min, const Slice& max, bool left_close, bool right_close, - int32_t* ret); + int32_t* ret, CommitCallback callback = nullptr); Status ZScan(const Slice& key, int64_t cursor, const std::string& 
pattern, int64_t count, std::vector* score_members, int64_t* next_cursor); Status PKScanRange(const Slice& key_start, const Slice& key_end, const Slice& pattern, int32_t limit, std::vector* keys, std::string* next_key); Status PKRScanRange(const Slice& key_start, const Slice& key_end, const Slice& pattern, int32_t limit, std::vector* keys, std::string* next_key); - Status ZPopMax(const Slice& key, int64_t count, std::vector* score_members); - Status ZPopMin(const Slice& key, int64_t count, std::vector* score_members); + Status ZPopMax(const Slice& key, int64_t count, std::vector* score_members, CommitCallback callback = nullptr); + Status ZPopMin(const Slice& key, int64_t count, std::vector* score_members, CommitCallback callback = nullptr); // Keys Commands Status Expire(const Slice& key, int32_t ttl) override; diff --git a/src/storage/src/storage.cc b/src/storage/src/storage.cc index 3126717859..be3fc01154 100644 --- a/src/storage/src/storage.cc +++ b/src/storage/src/storage.cc @@ -8,8 +8,21 @@ #include +#include #include +#if __has_include() +#include +namespace fs = std::filesystem; +#elif __has_include() +#include +namespace fs = std::experimental::filesystem; +#else +#error "std::filesystem is required" +#endif + +#include "pstd/include/env.h" +#include "rocksdb/utilities/checkpoint.h" #include "scope_snapshot.h" #include "src/lru_cache.h" #include "src/mutex_impl.h" @@ -22,6 +35,9 @@ #include "src/redis_strings.h" #include "src/redis_zsets.h" +// Binlog support for Raft +#include "binlog.pb.h" + namespace storage { Status StorageOptions::ResetOptions(const OptionType& option_type, @@ -79,6 +95,29 @@ Storage::~Storage() { } } +Status Storage::Close() { + if (!is_opened_) { + return Status::OK(); + } + + auto shutdown_db = [](auto& db) { + if (db && db->GetDB()) { + rocksdb::CancelAllBackgroundWork(db->GetDB(), true); + } + db.reset(); + }; + + shutdown_db(strings_db_); + shutdown_db(hashes_db_); + shutdown_db(sets_db_); + shutdown_db(lists_db_); + 
shutdown_db(zsets_db_); + shutdown_db(streams_db_); + + is_opened_.store(false); + return Status::OK(); +} + static std::string AppendSubDirectory(const std::string& db_path, const std::string& sub_db) { if (db_path.back() == '/') { return db_path + sub_db; @@ -87,8 +126,94 @@ static std::string AppendSubDirectory(const std::string& db_path, const std::str } } +static Status CopyDirectoryRecursive(const std::string& src, const std::string& dst) { + std::error_code ec; + if (!fs::exists(src, ec) || !fs::is_directory(src, ec)) { + return Status::NotFound("Source checkpoint directory missing: " + src); + } + + fs::create_directories(dst, ec); + if (ec) { + return Status::IOError("Failed to create target directory: " + dst + ", reason: " + ec.message()); + } + + const fs::path src_path(src); + const fs::path dst_path(dst); + + for (fs::recursive_directory_iterator it(src_path, ec), end; it != end && !ec; ++it) { + auto relative = fs::relative(it->path(), src_path, ec); + if (ec) { + break; + } + fs::path target = dst_path / relative; + + if (it->is_directory()) { + fs::create_directories(target, ec); + } else if (it->is_regular_file()) { + fs::create_directories(target.parent_path(), ec); + if (!ec) { + fs::copy_file(it->path(), target, fs::copy_options::overwrite_existing, ec); + } + } else if (it->is_symlink()) { + auto link_target = fs::read_symlink(it->path(), ec); + if (!ec) { + fs::create_directories(target.parent_path(), ec); + if (!ec) { + fs::create_symlink(link_target, target, ec); + } + } + } + + if (ec) { + break; + } + } + + if (ec) { + return Status::IOError("Failed to copy checkpoint data from " + src + " to " + dst + ": " + ec.message()); + } + + return Status::OK(); +} + +static Status ReplaceDirectoryWithCheckpoint(const std::string& source_dir, const std::string& target_dir) { + if (!pstd::FileExists(source_dir)) { + return Status::NotFound("Checkpoint source directory missing: " + source_dir); + } + + auto tmp_dir = target_dir + ".tmp"; + if 
(!pstd::DeleteDirIfExist(tmp_dir)) { + return Status::IOError("Failed to remove temporary directory: " + tmp_dir); + } + + const bool target_exists = pstd::FileExists(target_dir); + if (target_exists) { + if (pstd::RenameFile(target_dir, tmp_dir) != 0) { + return Status::IOError("Failed to rename directory " + target_dir); + } + } + + auto copy_status = CopyDirectoryRecursive(source_dir, target_dir); + if (!copy_status.ok()) { + pstd::DeleteDir(target_dir); + if (target_exists && pstd::RenameFile(tmp_dir, target_dir) != 0) { + LOG(WARNING) << "Failed to rollback directory from " << tmp_dir << " to " << target_dir; + } + return copy_status; + } + + if (target_exists && !pstd::DeleteDirIfExist(tmp_dir)) { + LOG(WARNING) << "Failed to cleanup temporary directory: " << tmp_dir; + } + + return Status::OK(); +} + Status Storage::Open(const StorageOptions& storage_options, const std::string& db_path) { mkpath(db_path.c_str(), 0755); + db_path_ = db_path; + open_options_ = storage_options; + open_options_initialized_ = true; strings_db_ = std::make_unique(this, kStrings); Status s = strings_db_->Open(storage_options, AppendSubDirectory(db_path, "strings")); @@ -125,6 +250,11 @@ Status Storage::Open(const StorageOptions& storage_options, const std::string& d if (!s.ok()) { LOG(FATAL) << "open stream db failed, " << s.ToString(); } + + if (storage_options.append_log_function) { + append_log_function_ = storage_options.append_log_function; + LOG(INFO) << "Raft append_log_function registered for storage"; + } is_opened_.store(true); return Status::OK(); @@ -141,10 +271,13 @@ Status Storage::StoreCursorStartKey(const DataType& dtype, int64_t cursor, const } // Strings Commands -Status Storage::Set(const Slice& key, const Slice& value) { return strings_db_->Set(key, value); } +Status Storage::Set(const Slice& key, const Slice& value, CommitCallback callback) { + return strings_db_->Set(key, value, callback); +} -Status Storage::Setxx(const Slice& key, const Slice& value, 
int32_t* ret, const int32_t ttl) { - return strings_db_->Setxx(key, value, ret, ttl); +Status Storage::Setxx(const Slice& key, const Slice& value, int32_t* ret, const int32_t ttl, + CommitCallback callback) { + return strings_db_->Setxx(key, value, ret, ttl, callback); } Status Storage::Get(const Slice& key, std::string* value) { return strings_db_->Get(key, value); } @@ -153,17 +286,19 @@ Status Storage::GetWithTTL(const Slice& key, std::string* value, int64_t* ttl) { return strings_db_->GetWithTTL(key, value, ttl); } -Status Storage::GetSet(const Slice& key, const Slice& value, std::string* old_value) { - return strings_db_->GetSet(key, value, old_value); +Status Storage::GetSet(const Slice& key, const Slice& value, std::string* old_value, CommitCallback callback) { + return strings_db_->GetSet(key, value, old_value, callback); } -Status Storage::SetBit(const Slice& key, int64_t offset, int32_t value, int32_t* ret) { - return strings_db_->SetBit(key, offset, value, ret); +Status Storage::SetBit(const Slice& key, int64_t offset, int32_t value, int32_t* ret, CommitCallback callback) { + return strings_db_->SetBit(key, offset, value, ret, callback); } Status Storage::GetBit(const Slice& key, int64_t offset, int32_t* ret) { return strings_db_->GetBit(key, offset, ret); } -Status Storage::MSet(const std::vector& kvs) { return strings_db_->MSet(kvs); } +Status Storage::MSet(const std::vector& kvs, CommitCallback callback) { + return strings_db_->MSet(kvs, callback); +} Status Storage::MGet(const std::vector& keys, std::vector* vss) { return strings_db_->MGet(keys, vss); @@ -173,22 +308,27 @@ Status Storage::MGetWithTTL(const std::vector& keys, std::vectorMGetWithTTL(keys, vss); } -Status Storage::Setnx(const Slice& key, const Slice& value, int32_t* ret, const int32_t ttl) { - return strings_db_->Setnx(key, value, ret, ttl); +Status Storage::Setnx(const Slice& key, const Slice& value, int32_t* ret, const int32_t ttl, + CommitCallback callback) { + return 
strings_db_->Setnx(key, value, ret, ttl, callback); } -Status Storage::MSetnx(const std::vector& kvs, int32_t* ret) { return strings_db_->MSetnx(kvs, ret); } +Status Storage::MSetnx(const std::vector& kvs, int32_t* ret, CommitCallback callback) { + return strings_db_->MSetnx(kvs, ret, callback); +} -Status Storage::Setvx(const Slice& key, const Slice& value, const Slice& new_value, int32_t* ret, const int32_t ttl) { - return strings_db_->Setvx(key, value, new_value, ret, ttl); +Status Storage::Setvx(const Slice& key, const Slice& value, const Slice& new_value, int32_t* ret, const int32_t ttl, + CommitCallback callback) { + return strings_db_->Setvx(key, value, new_value, ret, ttl, callback); } -Status Storage::Delvx(const Slice& key, const Slice& value, int32_t* ret) { - return strings_db_->Delvx(key, value, ret); +Status Storage::Delvx(const Slice& key, const Slice& value, int32_t* ret, CommitCallback callback) { + return strings_db_->Delvx(key, value, ret, callback); } -Status Storage::Setrange(const Slice& key, int64_t start_offset, const Slice& value, int32_t* ret) { - return strings_db_->Setrange(key, start_offset, value, ret); +Status Storage::Setrange(const Slice& key, int64_t start_offset, const Slice& value, int32_t* ret, + CommitCallback callback) { + return strings_db_->Setrange(key, start_offset, value, ret, callback); } Status Storage::Getrange(const Slice& key, int64_t start_offset, int64_t end_offset, std::string* ret) { @@ -200,8 +340,9 @@ Status Storage::GetrangeWithValue(const Slice& key, int64_t start_offset, int64_ return strings_db_->GetrangeWithValue(key, start_offset, end_offset, ret, value, ttl); } -Status Storage::Append(const Slice& key, const Slice& value, int32_t* ret, int32_t* expired_timestamp_sec, std::string& out_new_value) { - return strings_db_->Append(key, value, ret, expired_timestamp_sec, out_new_value); +Status Storage::Append(const Slice& key, const Slice& value, int32_t* ret, int32_t* expired_timestamp_sec, + std::string& 
out_new_value, CommitCallback callback) { + return strings_db_->Append(key, value, ret, expired_timestamp_sec, out_new_value, callback); } Status Storage::BitCount(const Slice& key, int64_t start_offset, int64_t end_offset, int32_t* ret, bool have_range) { @@ -223,35 +364,44 @@ Status Storage::BitPos(const Slice& key, int32_t bit, int64_t start_offset, int6 return strings_db_->BitPos(key, bit, start_offset, end_offset, ret); } -Status Storage::Decrby(const Slice& key, int64_t value, int64_t* ret) { return strings_db_->Decrby(key, value, ret); } +Status Storage::Decrby(const Slice& key, int64_t value, int64_t* ret, CommitCallback callback) { + return strings_db_->Decrby(key, value, ret, callback); +} -Status Storage::Incrby(const Slice& key, int64_t value, int64_t* ret, int32_t* expired_timestamp_sec) { - return strings_db_->Incrby(key, value, ret, expired_timestamp_sec); +Status Storage::Incrby(const Slice& key, int64_t value, int64_t* ret, int32_t* expired_timestamp_sec, + CommitCallback callback) { + return strings_db_->Incrby(key, value, ret, expired_timestamp_sec, callback); } -Status Storage::Incrbyfloat(const Slice& key, const Slice& value, std::string* ret, int32_t* expired_timestamp_sec) { - return strings_db_->Incrbyfloat(key, value, ret, expired_timestamp_sec); +Status Storage::Incrbyfloat(const Slice& key, const Slice& value, std::string* ret, int32_t* expired_timestamp_sec, + CommitCallback callback) { + return strings_db_->Incrbyfloat(key, value, ret, expired_timestamp_sec, callback); } -Status Storage::Setex(const Slice& key, const Slice& value, int32_t ttl) { return strings_db_->Setex(key, value, ttl); } +Status Storage::Setex(const Slice& key, const Slice& value, int32_t ttl, CommitCallback callback) { + return strings_db_->Setex(key, value, ttl, callback); +} Status Storage::Strlen(const Slice& key, int32_t* len) { return strings_db_->Strlen(key, len); } -Status Storage::PKSetexAt(const Slice& key, const Slice& value, int32_t timestamp) { - return 
strings_db_->PKSetexAt(key, value, timestamp); +Status Storage::PKSetexAt(const Slice& key, const Slice& value, int32_t timestamp, CommitCallback callback) { + return strings_db_->PKSetexAt(key, value, timestamp, callback); } // Hashes Commands -Status Storage::HSet(const Slice& key, const Slice& field, const Slice& value, int32_t* res) { - return hashes_db_->HSet(key, field, value, res); +Status Storage::HSet(const Slice& key, const Slice& field, const Slice& value, int32_t* res, + CommitCallback callback) { + return hashes_db_->HSet(key, field, value, res, callback); } Status Storage::HGet(const Slice& key, const Slice& field, std::string* value) { return hashes_db_->HGet(key, field, value); } -Status Storage::HMSet(const Slice& key, const std::vector& fvs) { return hashes_db_->HMSet(key, fvs); } +Status Storage::HMSet(const Slice& key, const std::vector& fvs, CommitCallback callback) { + return hashes_db_->HMSet(key, fvs, callback); +} Status Storage::HMGet(const Slice& key, const std::vector& fields, std::vector* vss) { return hashes_db_->HMGet(key, fields, vss); @@ -267,8 +417,9 @@ Status Storage::HKeys(const Slice& key, std::vector* fields) { retu Status Storage::HVals(const Slice& key, std::vector* values) { return hashes_db_->HVals(key, values); } -Status Storage::HSetnx(const Slice& key, const Slice& field, const Slice& value, int32_t* ret) { - return hashes_db_->HSetnx(key, field, value, ret); +Status Storage::HSetnx(const Slice& key, const Slice& field, const Slice& value, int32_t* ret, + CommitCallback callback) { + return hashes_db_->HSetnx(key, field, value, ret, callback); } Status Storage::HLen(const Slice& key, int32_t* ret) { return hashes_db_->HLen(key, ret); } @@ -279,16 +430,17 @@ Status Storage::HStrlen(const Slice& key, const Slice& field, int32_t* len) { Status Storage::HExists(const Slice& key, const Slice& field) { return hashes_db_->HExists(key, field); } -Status Storage::HIncrby(const Slice& key, const Slice& field, int64_t value, 
int64_t* ret) { - return hashes_db_->HIncrby(key, field, value, ret); +Status Storage::HIncrby(const Slice& key, const Slice& field, int64_t value, int64_t* ret, CommitCallback callback) { + return hashes_db_->HIncrby(key, field, value, ret, callback); } -Status Storage::HIncrbyfloat(const Slice& key, const Slice& field, const Slice& by, std::string* new_value) { - return hashes_db_->HIncrbyfloat(key, field, by, new_value); +Status Storage::HIncrbyfloat(const Slice& key, const Slice& field, const Slice& by, std::string* new_value, CommitCallback callback) { + return hashes_db_->HIncrbyfloat(key, field, by, new_value, callback); } -Status Storage::HDel(const Slice& key, const std::vector& fields, int32_t* ret) { - return hashes_db_->HDel(key, fields, ret); +Status Storage::HDel(const Slice& key, const std::vector& fields, int32_t* ret, + CommitCallback callback) { + return hashes_db_->HDel(key, fields, ret, callback); } Status Storage::HScan(const Slice& key, int64_t cursor, const std::string& pattern, int64_t count, @@ -314,8 +466,9 @@ Status Storage::PKHRScanRange(const Slice& key, const Slice& field_start, const } // Sets Commands -Status Storage::SAdd(const Slice& key, const std::vector& members, int32_t* ret) { - return sets_db_->SAdd(key, members, ret); +Status Storage::SAdd(const Slice& key, const std::vector& members, int32_t* ret, + CommitCallback callback) { + return sets_db_->SAdd(key, members, ret, callback); } Status Storage::SCard(const Slice& key, int32_t* ret) { return sets_db_->SCard(key, ret); } @@ -324,16 +477,18 @@ Status Storage::SDiff(const std::vector& keys, std::vectorSDiff(keys, members); } -Status Storage::SDiffstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret) { - return sets_db_->SDiffstore(destination, keys, value_to_dest, ret); +Status Storage::SDiffstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret, + CommitCallback callback) { + return 
sets_db_->SDiffstore(destination, keys, value_to_dest, ret, callback); } Status Storage::SInter(const std::vector& keys, std::vector* members) { return sets_db_->SInter(keys, members); } -Status Storage::SInterstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret) { - return sets_db_->SInterstore(destination, keys, value_to_dest, ret); +Status Storage::SInterstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret, + CommitCallback callback) { + return sets_db_->SInterstore(destination, keys, value_to_dest, ret, callback); } Status Storage::SIsmember(const Slice& key, const Slice& member, int32_t* ret) { @@ -348,29 +503,30 @@ Status Storage::SMembersWithTTL(const Slice& key, std::vector* memb return sets_db_->SMembersWithTTL(key, members, ttl); } -Status Storage::SMove(const Slice& source, const Slice& destination, const Slice& member, int32_t* ret) { - return sets_db_->SMove(source, destination, member, ret); +Status Storage::SMove(const Slice& source, const Slice& destination, const Slice& member, int32_t* ret, CommitCallback callback) { + return sets_db_->SMove(source, destination, member, ret, callback); } -Status Storage::SPop(const Slice& key, std::vector* members, int64_t count) { - Status status = sets_db_->SPop(key, members, count); - return status; +Status Storage::SPop(const Slice& key, std::vector* members, int64_t count, CommitCallback callback) { + return sets_db_->SPop(key, members, count, callback); } Status Storage::SRandmember(const Slice& key, int32_t count, std::vector* members) { return sets_db_->SRandmember(key, count, members); } -Status Storage::SRem(const Slice& key, const std::vector& members, int32_t* ret) { - return sets_db_->SRem(key, members, ret); +Status Storage::SRem(const Slice& key, const std::vector& members, int32_t* ret, + CommitCallback callback) { + return sets_db_->SRem(key, members, ret, callback); } Status Storage::SUnion(const 
std::vector& keys, std::vector* members) { return sets_db_->SUnion(keys, members); } -Status Storage::SUnionstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret) { - return sets_db_->SUnionstore(destination, keys, value_to_dest, ret); +Status Storage::SUnionstore(const Slice& destination, const std::vector& keys, std::vector& value_to_dest, int32_t* ret, + CommitCallback callback) { + return sets_db_->SUnionstore(destination, keys, value_to_dest, ret, callback); } Status Storage::SScan(const Slice& key, int64_t cursor, const std::string& pattern, int64_t count, @@ -378,12 +534,14 @@ Status Storage::SScan(const Slice& key, int64_t cursor, const std::string& patte return sets_db_->SScan(key, cursor, pattern, count, members, next_cursor); } -Status Storage::LPush(const Slice& key, const std::vector& values, uint64_t* ret) { - return lists_db_->LPush(key, values, ret); +Status Storage::LPush(const Slice& key, const std::vector& values, uint64_t* ret, + CommitCallback callback) { + return lists_db_->LPush(key, values, ret, callback); } -Status Storage::RPush(const Slice& key, const std::vector& values, uint64_t* ret) { - return lists_db_->RPush(key, values, ret); +Status Storage::RPush(const Slice& key, const std::vector& values, uint64_t* ret, + CommitCallback callback) { + return lists_db_->RPush(key, values, ret, callback); } Status Storage::LRange(const Slice& key, int64_t start, int64_t stop, std::vector* ret) { @@ -394,51 +552,63 @@ Status Storage::LRangeWithTTL(const Slice& key, int64_t start, int64_t stop, std return lists_db_->LRangeWithTTL(key, start, stop, ret, ttl); } -Status Storage::LTrim(const Slice& key, int64_t start, int64_t stop) { return lists_db_->LTrim(key, start, stop); } +Status Storage::LTrim(const Slice& key, int64_t start, int64_t stop, CommitCallback callback) { + return lists_db_->LTrim(key, start, stop, callback); +} Status Storage::LLen(const Slice& key, uint64_t* len) { return 
lists_db_->LLen(key, len); } -Status Storage::LPop(const Slice& key, int64_t count, std::vector* elements) { return lists_db_->LPop(key, count, elements); } +Status Storage::LPop(const Slice& key, int64_t count, std::vector* elements, CommitCallback callback) { + return lists_db_->LPop(key, count, elements, callback); +} -Status Storage::RPop(const Slice& key, int64_t count, std::vector* elements) { return lists_db_->RPop(key, count, elements); } +Status Storage::RPop(const Slice& key, int64_t count, std::vector* elements, CommitCallback callback) { + return lists_db_->RPop(key, count, elements, callback); +} Status Storage::LIndex(const Slice& key, int64_t index, std::string* element) { return lists_db_->LIndex(key, index, element); } Status Storage::LInsert(const Slice& key, const BeforeOrAfter& before_or_after, const std::string& pivot, - const std::string& value, int64_t* ret) { - return lists_db_->LInsert(key, before_or_after, pivot, value, ret); + const std::string& value, int64_t* ret, CommitCallback callback) { + return lists_db_->LInsert(key, before_or_after, pivot, value, ret, callback); } -Status Storage::LPushx(const Slice& key, const std::vector& values, uint64_t* len) { - return lists_db_->LPushx(key, values, len); +Status Storage::LPushx(const Slice& key, const std::vector& values, uint64_t* len, + CommitCallback callback) { + return lists_db_->LPushx(key, values, len, callback); } -Status Storage::RPushx(const Slice& key, const std::vector& values, uint64_t* len) { - return lists_db_->RPushx(key, values, len); +Status Storage::RPushx(const Slice& key, const std::vector& values, uint64_t* len, + CommitCallback callback) { + return lists_db_->RPushx(key, values, len, callback); } -Status Storage::LRem(const Slice& key, int64_t count, const Slice& value, uint64_t* ret) { - return lists_db_->LRem(key, count, value, ret); +Status Storage::LRem(const Slice& key, int64_t count, const Slice& value, uint64_t* ret, + CommitCallback callback) { + return 
lists_db_->LRem(key, count, value, ret, callback); } -Status Storage::LSet(const Slice& key, int64_t index, const Slice& value) { return lists_db_->LSet(key, index, value); } +Status Storage::LSet(const Slice& key, int64_t index, const Slice& value, CommitCallback callback) { + return lists_db_->LSet(key, index, value, callback); +} -Status Storage::RPoplpush(const Slice& source, const Slice& destination, std::string* element) { - return lists_db_->RPoplpush(source, destination, element); +Status Storage::RPoplpush(const Slice& source, const Slice& destination, std::string* element, CommitCallback callback) { + return lists_db_->RPoplpush(source, destination, element, callback); } -Status Storage::ZPopMax(const Slice& key, const int64_t count, std::vector* score_members) { - return zsets_db_->ZPopMax(key, count, score_members); +Status Storage::ZPopMax(const Slice& key, const int64_t count, std::vector* score_members, CommitCallback callback) { + return zsets_db_->ZPopMax(key, count, score_members, callback); } -Status Storage::ZPopMin(const Slice& key, const int64_t count, std::vector* score_members) { - return zsets_db_->ZPopMin(key, count, score_members); +Status Storage::ZPopMin(const Slice& key, const int64_t count, std::vector* score_members, CommitCallback callback) { + return zsets_db_->ZPopMin(key, count, score_members, callback); } -Status Storage::ZAdd(const Slice& key, const std::vector& score_members, int32_t* ret) { - return zsets_db_->ZAdd(key, score_members, ret); +Status Storage::ZAdd(const Slice& key, const std::vector& score_members, int32_t* ret, + CommitCallback callback) { + return zsets_db_->ZAdd(key, score_members, ret, callback); } Status Storage::ZCard(const Slice& key, int32_t* ret) { return zsets_db_->ZCard(key, ret); } @@ -447,8 +617,8 @@ Status Storage::ZCount(const Slice& key, double min, double max, bool left_close return zsets_db_->ZCount(key, min, max, left_close, right_close, ret); } -Status Storage::ZIncrby(const Slice& key, 
const Slice& member, double increment, double* ret) { - return zsets_db_->ZIncrby(key, member, increment, ret); +Status Storage::ZIncrby(const Slice& key, const Slice& member, double increment, double* ret, CommitCallback callback) { + return zsets_db_->ZIncrby(key, member, increment, ret, callback); } Status Storage::ZRange(const Slice& key, int32_t start, int32_t stop, std::vector* score_members) { @@ -475,17 +645,19 @@ Status Storage::ZRank(const Slice& key, const Slice& member, int32_t* rank) { return zsets_db_->ZRank(key, member, rank); } -Status Storage::ZRem(const Slice& key, const std::vector& members, int32_t* ret) { - return zsets_db_->ZRem(key, members, ret); +Status Storage::ZRem(const Slice& key, const std::vector& members, int32_t* ret, + CommitCallback callback) { + return zsets_db_->ZRem(key, members, ret, callback); } -Status Storage::ZRemrangebyrank(const Slice& key, int32_t start, int32_t stop, int32_t* ret) { - return zsets_db_->ZRemrangebyrank(key, start, stop, ret); +Status Storage::ZRemrangebyrank(const Slice& key, int32_t start, int32_t stop, int32_t* ret, + CommitCallback callback) { + return zsets_db_->ZRemrangebyrank(key, start, stop, ret, callback); } Status Storage::ZRemrangebyscore(const Slice& key, double min, double max, bool left_close, bool right_close, - int32_t* ret) { - return zsets_db_->ZRemrangebyscore(key, min, max, left_close, right_close, ret); + int32_t* ret, CommitCallback callback) { + return zsets_db_->ZRemrangebyscore(key, min, max, left_close, right_close, ret, callback); } Status Storage::ZRevrangebyscore(const Slice& key, double min, double max, bool left_close, bool right_close, @@ -513,13 +685,15 @@ Status Storage::ZScore(const Slice& key, const Slice& member, double* ret) { } Status Storage::ZUnionstore(const Slice& destination, const std::vector& keys, - const std::vector& weights, const AGGREGATE agg, std::map& value_to_dest, int32_t* ret) { - return zsets_db_->ZUnionstore(destination, keys, weights, agg, 
value_to_dest, ret); + const std::vector& weights, const AGGREGATE agg, std::map& value_to_dest, int32_t* ret, + CommitCallback callback) { + return zsets_db_->ZUnionstore(destination, keys, weights, agg, value_to_dest, ret, callback); } Status Storage::ZInterstore(const Slice& destination, const std::vector& keys, - const std::vector& weights, const AGGREGATE agg, std::vector& value_to_dest, int32_t* ret) { - return zsets_db_->ZInterstore(destination, keys, weights, agg, value_to_dest, ret); + const std::vector& weights, const AGGREGATE agg, std::vector& value_to_dest, int32_t* ret, + CommitCallback callback) { + return zsets_db_->ZInterstore(destination, keys, weights, agg, value_to_dest, ret, callback); } Status Storage::ZRangebylex(const Slice& key, const Slice& min, const Slice& max, bool left_close, bool right_close, @@ -533,8 +707,8 @@ Status Storage::ZLexcount(const Slice& key, const Slice& min, const Slice& max, } Status Storage::ZRemrangebylex(const Slice& key, const Slice& min, const Slice& max, bool left_close, bool right_close, - int32_t* ret) { - return zsets_db_->ZRemrangebylex(key, min, max, left_close, right_close, ret); + int32_t* ret, CommitCallback callback) { + return zsets_db_->ZRemrangebylex(key, min, max, left_close, right_close, ret, callback); } Status Storage::ZScan(const Slice& key, int64_t cursor, const std::string& pattern, int64_t count, @@ -1509,7 +1683,7 @@ void Storage::ScanDatabase(const DataType& type) { } // HyperLogLog -Status Storage::PfAdd(const Slice& key, const std::vector& values, bool* update) { +Status Storage::PfAdd(const Slice& key, const std::vector& values, bool* update, CommitCallback callback) { *update = false; if (values.size() >= kMaxKeys) { return Status::InvalidArgument("Invalid the number of key"); @@ -1536,7 +1710,7 @@ Status Storage::PfAdd(const Slice& key, const std::vector& values, if (previous != now || (s.IsNotFound() && values.empty())) { *update = true; } - s = strings_db_->Set(key, result); + s = 
strings_db_->Set(key, result, callback); return s; } @@ -1573,7 +1747,7 @@ Status Storage::PfCount(const std::vector& keys, int64_t* result) { return Status::OK(); } -Status Storage::PfMerge(const std::vector& keys, std::string& value_to_dest) { +Status Storage::PfMerge(const std::vector& keys, std::string& value_to_dest, CommitCallback callback) { if (keys.size() >= kMaxKeys || keys.empty()) { return Status::InvalidArgument("Invalid the number of key"); } @@ -1605,7 +1779,7 @@ Status Storage::PfMerge(const std::vector& keys, std::string& value HyperLogLog log(kPrecision, registers); result = first_log.Merge(log); } - s = strings_db_->Set(keys[0], result); + s = strings_db_->Set(keys[0], result, callback); value_to_dest = std::move(result); return s; } @@ -2055,4 +2229,340 @@ void Storage::DisableWal(const bool is_wal_disable) { zsets_db_->SetWriteWalOptions(is_wal_disable); } +rocksdb::Status Storage::OnBinlogWrite(const ::pikiwidb::Binlog& binlog, uint64_t log_index) { + rocksdb::WriteBatch batch; + + // Check if there are any entries + if (binlog.entries().empty()) { + return rocksdb::Status::OK(); + } + + // Get the data type from the first entry (assuming all entries have the same data type) + ::pikiwidb::DataType data_type = binlog.entries(0).data_type(); + + Redis* redis_db = nullptr; + switch (data_type) { + case ::pikiwidb::DataType::kStrings: + redis_db = strings_db_.get(); + break; + case ::pikiwidb::DataType::kHashes: + redis_db = hashes_db_.get(); + break; + case ::pikiwidb::DataType::kLists: + redis_db = lists_db_.get(); + break; + case ::pikiwidb::DataType::kSets: + redis_db = sets_db_.get(); + break; + case ::pikiwidb::DataType::kZSets: + redis_db = zsets_db_.get(); + break; + case ::pikiwidb::DataType::kStreams: + redis_db = streams_db_.get(); + break; + default: + LOG(WARNING) << "Unknown data type: " << static_cast(data_type); + return rocksdb::Status::InvalidArgument("Unknown data type"); + + } + + if (!redis_db) { + LOG(ERROR) << "Redis DB is 
null for data type: " << static_cast(data_type); + return rocksdb::Status::NotFound("Redis DB not found"); + } + + rocksdb::DB* db = redis_db->GetDB(); + if (!db) { + LOG(ERROR) << "RocksDB instance is null for data type: " << static_cast(data_type); + return rocksdb::Status::NotFound("RocksDB instance not found"); + } + + const auto& handles = redis_db->GetHandles(); + auto seqno = redis_db->GetDB()->GetLatestSequenceNumber(); + + for (const auto& entry : binlog.entries()) { + uint32_t cf_idx = entry.cf_idx(); + + // Check if restarting and log already applied + if (redis_db->IsApplied(cf_idx, log_index)) [[unlikely]] { + // If the starting phase is over, the log must not have been applied + // If the starting phase is not over and the log has been applied, skip it. + LOG(WARNING) << "Log " << log_index << " has been applied"; + continue; + } + + rocksdb::ColumnFamilyHandle* cf_handle = nullptr; + + if (data_type == ::pikiwidb::DataType::kStrings) { + if (cf_idx != 0) { + LOG(WARNING) << "Strings type should use cf_idx=0, got cf_idx=" << cf_idx; + } + } else { + if (cf_idx >= handles.size()) { + LOG(ERROR) << "Invalid cf_idx " << cf_idx << " for data type " + << static_cast(data_type) << ", available cf count: " << handles.size(); + return rocksdb::Status::InvalidArgument("Invalid column family index"); + } + cf_handle = handles[cf_idx]; + } + + switch (entry.op_type()) { + case ::pikiwidb::OperateType::kPut: + if (cf_handle) { + batch.Put(cf_handle, entry.key(), entry.value()); + } else { + batch.Put(entry.key(), entry.value()); + } + break; + + case ::pikiwidb::OperateType::kDelete: + if (cf_handle) { + batch.Delete(cf_handle, entry.key()); + } else { + batch.Delete(entry.key()); + } + break; + + default: + LOG(WARNING) << "Unknown operate type: " << static_cast(entry.op_type()); + continue; + } + + // Update applied log index for this column family + redis_db->UpdateAppliedLogIndexOfColumnFamily(cf_idx, log_index, ++seqno); + } + + auto first_seqno = 
redis_db->GetDB()->GetLatestSequenceNumber() + 1; + + rocksdb::WriteOptions write_options; + write_options.disableWAL = true; + + // Commit the batch + rocksdb::Status s = redis_db->GetDB()->Write(write_options, &batch); + + if (!s.ok()) { + LOG(ERROR) << "Failed to apply binlog batch: " << s.ToString(); + return s; + } + + // Update log index mapping with actual sequence number + redis_db->UpdateLogIndex(log_index, first_seqno); + + return rocksdb::Status::OK(); +} + +Status Storage::LoadFromCheckpoint(const std::string& checkpoint_path) { + if (!open_options_initialized_) { + return Status::Corruption("Storage options are not initialized"); + } + + if (!pstd::FileExists(checkpoint_path)) { + return Status::NotFound("Checkpoint path does not exist: " + checkpoint_path); + } + + auto cancel_bg = [](const auto& db) { + if (db && db->GetDB()) { + rocksdb::CancelAllBackgroundWork(db->GetDB(), true); + } + }; + + cancel_bg(strings_db_); + cancel_bg(hashes_db_); + cancel_bg(sets_db_); + cancel_bg(lists_db_); + cancel_bg(zsets_db_); + cancel_bg(streams_db_); + + strings_db_.reset(); + hashes_db_.reset(); + sets_db_.reset(); + lists_db_.reset(); + zsets_db_.reset(); + streams_db_.reset(); + is_opened_.store(false); + + auto checkpoint_tasks = LoadCheckpoint(checkpoint_path, db_path_); + for (auto& task : checkpoint_tasks) { + auto status = task.get(); + if (!status.ok()) { + return status; + } + } + + strings_db_ = std::make_unique(this, kStrings); + Status s = strings_db_->Open(open_options_, AppendSubDirectory(db_path_, STRINGS_DB)); + if (!s.ok()) { + return s; + } + + hashes_db_ = std::make_unique(this, kHashes); + s = hashes_db_->Open(open_options_, AppendSubDirectory(db_path_, HASHES_DB)); + if (!s.ok()) { + return s; + } + + lists_db_ = std::make_unique(this, kLists); + s = lists_db_->Open(open_options_, AppendSubDirectory(db_path_, LISTS_DB)); + if (!s.ok()) { + return s; + } + + sets_db_ = std::make_unique(this, kSets); + s = sets_db_->Open(open_options_, 
AppendSubDirectory(db_path_, SETS_DB)); + if (!s.ok()) { + return s; + } + + zsets_db_ = std::make_unique(this, kZSets); + s = zsets_db_->Open(open_options_, AppendSubDirectory(db_path_, ZSETS_DB)); + if (!s.ok()) { + return s; + } + + streams_db_ = std::make_unique(this, kStreams); + s = streams_db_->Open(open_options_, AppendSubDirectory(db_path_, STREAMS_DB)); + if (!s.ok()) { + return s; + } + + is_opened_.store(true); + return Status::OK(); +} + +std::vector> Storage::CreateCheckpoint(const std::string& checkpoint_path) { + if (!is_opened_) { + return {}; + } + + if (pstd::FileExists(checkpoint_path) && !pstd::DeleteDirIfExist(checkpoint_path)) { + return {}; + } + if (mkpath(checkpoint_path.c_str(), 0755) != 0) {。、b + return {}; + } + + std::vector> result; + result.reserve(6); + + static const std::vector kDbTypes = {STRINGS_DB, HASHES_DB, LISTS_DB, SETS_DB, ZSETS_DB, STREAMS_DB}; + for (const auto& type : kDbTypes) { + auto task = std::async(std::launch::async, &Storage::CreateCheckpointInternal, this, checkpoint_path, type); + result.push_back(std::move(task)); + } + + return result; +} + +Status Storage::CreateCheckpointInternal(const std::string& checkpoint_path, const std::string& db_name) { + Redis* db = nullptr; + if (db_name == STRINGS_DB) { + db = strings_db_.get(); + } else if (db_name == HASHES_DB) { + db = hashes_db_.get(); + } else if (db_name == LISTS_DB) { + db = lists_db_.get(); + } else if (db_name == SETS_DB) { + db = sets_db_.get(); + } else if (db_name == ZSETS_DB) { + db = zsets_db_.get(); + } else if (db_name == STREAMS_DB) { + db = streams_db_.get(); + } + + if (db == nullptr) { + return Status::OK(); + } + + std::string db_checkpoint_path = checkpoint_path + "/" + db_name; + std::string tmp_checkpoint_path = db_checkpoint_path + ".tmp"; + + if (!pstd::DeleteDirIfExist(tmp_checkpoint_path)) { + return Status::IOError("Failed to remove temporary checkpoint directory: " + tmp_checkpoint_path); + } + + rocksdb::Checkpoint* checkpoint = 
nullptr; + auto s = rocksdb::Checkpoint::Create(db->GetDB(), &checkpoint); + if (!s.ok()) { + return Status::IOError("Create checkpoint object failed: " + s.ToString()); + } + + std::unique_ptr guard(checkpoint); + s = checkpoint->CreateCheckpoint(tmp_checkpoint_path); + if (!s.ok()) { + pstd::DeleteDirIfExist(tmp_checkpoint_path); + return Status::IOError("Create checkpoint failed: " + s.ToString()); + } + + if (!pstd::DeleteDirIfExist(db_checkpoint_path)) { + pstd::DeleteDirIfExist(tmp_checkpoint_path); + return Status::IOError("Failed to clean checkpoint directory: " + db_checkpoint_path); + } + + if (pstd::RenameFile(tmp_checkpoint_path, db_checkpoint_path) != 0) { + pstd::DeleteDirIfExist(tmp_checkpoint_path); + return Status::IOError("Failed to finalize checkpoint directory: " + tmp_checkpoint_path); + } + + LOG(INFO) << "CreateCheckpoint: Successfully created checkpoint for " << db_name << " at: " << db_checkpoint_path; + return Status::OK(); +} + +std::vector> Storage::LoadCheckpoint(const std::string& checkpoint_sub_path, + const std::string& db_sub_path) { + static const std::vector kDbTypes = {STRINGS_DB, HASHES_DB, SETS_DB, LISTS_DB, ZSETS_DB, STREAMS_DB}; + std::vector> result; + result.reserve(kDbTypes.size()); + + for (const auto& db_type : kDbTypes) { + auto task = std::async(std::launch::async, &Storage::LoadCheckpointInternal, this, checkpoint_sub_path, + db_sub_path, db_type); + result.push_back(std::move(task)); + } + return result; +} + +Status Storage::LoadCheckpointInternal(const std::string& checkpoint_sub_path, const std::string& db_sub_path, + const std::string& db_type) { + auto source_dir = checkpoint_sub_path + "/" + db_type; + if (!pstd::FileExists(source_dir)) { + LOG(INFO) << "Checkpoint directory for " << db_type << " not found, skip replacing"; + return Status::OK(); + } + + auto target_dir = AppendSubDirectory(db_sub_path, db_type); + auto status = ReplaceDirectoryWithCheckpoint(source_dir, target_dir); + if (!status.ok()) { + 
LOG(ERROR) << "Failed to load checkpoint for " << db_type << ": " << status.ToString(); + } + return status; +} + +uint64_t Storage::GetSmallestFlushedLogIndex() { + uint64_t max_log_index = 0; + + std::vector dbs = { + strings_db_.get(), + hashes_db_.get(), + lists_db_.get(), + sets_db_.get(), + zsets_db_.get(), + streams_db_.get() + }; + + for (auto* db : dbs) { + if (db) { + auto [smallest_applied_log_index_cf, smallest_applied_log_index, + smallest_flushed_log_index_cf, smallest_flushed_log_index, + smallest_flushed_seqno] = db->GetLogIndexOfColumnFamilies().GetSmallestLogIndex(-1); + + // Use the maximum of all smallest_flushed_log_index as the recovery point + if (smallest_flushed_log_index != std::numeric_limits::max()) { + max_log_index = std::max(max_log_index, static_cast(smallest_flushed_log_index)); + } + } + } + + return max_log_index; +} + } // namespace storage