impl(bigtable): add QueryPlan refresh with simple tests

scotthart · scotthart · commit a5d627de7ac8 · 2025-11-03T12:59:08.000-05:00
diff --git a/google/cloud/bigtable/BUILD.bazel b/google/cloud/bigtable/BUILD.bazel
@@ -114,8 +114,11 @@ cc_library(
     name = test.replace("/", "_").replace(".cc", ""),
     srcs = [test],
     local_defines = select({
-        ":metrics_enabled": ["GOOGLE_CLOUD_CPP_BIGTABLE_WITH_OTEL_METRICS"],
-        "//conditions:default": [],
+        ":metrics_enabled": [
+            "GOOGLE_CLOUD_CPP_BIGTABLE_WITH_OTEL_METRICS",
+            "GOOGLE_CLOUD_CPP_BIGTABLE_QUERY_PLAN_REFRESH_ASSERT",
+        ],
+        "//conditions:default": ["GOOGLE_CLOUD_CPP_BIGTABLE_QUERY_PLAN_REFRESH_ASSERT"],
     }),
     deps = [
         ":bigtable_client_testing",
diff --git a/google/cloud/bigtable/internal/query_plan.cc b/google/cloud/bigtable/internal/query_plan.cc
@@ -14,38 +14,162 @@
 
 #include "google/cloud/bigtable/internal/query_plan.h"
 #include "google/cloud/completion_queue.h"
+#include "google/cloud/internal/time_utils.h"
 #include <google/bigtable/v2/data.pb.h>
 
 namespace google {
 namespace cloud {
 namespace bigtable_internal {
 GOOGLE_CLOUD_CPP_INLINE_NAMESPACE_BEGIN
+namespace {
+auto constexpr kRefreshDeadlineOffset = 120;
+}  // namespace
 
 std::shared_ptr<QueryPlan> QueryPlan::Create(
     CompletionQueue cq, google::bigtable::v2::PrepareQueryResponse response,
-    RefreshFn fn) {
-  auto plan = std::shared_ptr<QueryPlan>(
-      new QueryPlan(std::move(cq), std::move(response), std::move(fn)));
+    RefreshFn fn, std::shared_ptr<Clock> clock) {
+  auto plan = std::shared_ptr<QueryPlan>(new QueryPlan(
+      std::move(cq), std::move(clock), std::move(fn), std::move(response)));
   plan->Initialize();
   return plan;
 }
 
-bool QueryPlan::IsExpired() { return false; }
+void QueryPlan::Initialize() {
+  std::unique_lock<std::mutex> lock(mu_);
+  ScheduleRefresh(lock);
+}
+
+// ScheduleRefresh should only be called after updating response_.
+void QueryPlan::ScheduleRefresh(std::unique_lock<std::mutex> const&) {
+  if (!response_.ok()) return;
+  // We want to start the refresh process before the query plan expires.
+  auto refresh_deadline =
+      internal::ToChronoTimePoint(response_->valid_until()) -
+      std::chrono::seconds(kRefreshDeadlineOffset);
+  std::weak_ptr<QueryPlan> plan = shared_from_this();
+  refresh_timer_ =
+      cq_.MakeDeadlineTimer(refresh_deadline)
+          .then([plan](future<StatusOr<std::chrono::system_clock::time_point>>
+                           result) {
+            if (result.get().ok()) {
+              if (auto p = plan.lock()) {
+                p->ExpiredRefresh();
+              }
+            }
+          });
+}
+
+bool QueryPlan::IsRefreshing(std::unique_lock<std::mutex> const&) const {
+  return state_ == RefreshState::kBegin || state_ == RefreshState::kPending;
+}
+
+void QueryPlan::ExpiredRefresh() {
+  {
+    std::unique_lock<std::mutex> lock(mu_);
+    if (!(IsRefreshing(lock))) {
+      if (response_.ok()) old_query_plan_id_ = response_->prepared_query();
+      state_ = RefreshState::kBegin;
+    }
+  }
+  RefreshQueryPlan(RefreshMode::kExpired);
+}
+
+void QueryPlan::Invalidate(Status status,
+                           std::string const& invalid_query_plan_id) {
+  {
+    std::unique_lock<std::mutex> lock(mu_);
+    // We want to avoid a late arrival causing a refresh of an already refreshed
+    // query plan, so we track what the previous plan id was.
+    if (!IsRefreshing(lock) && old_query_plan_id_ != invalid_query_plan_id) {
+      old_query_plan_id_ = invalid_query_plan_id;
+      state_ = RefreshState::kBegin;
+    }
+  }
+  RefreshQueryPlan(RefreshMode::kInvalidated, std::move(status));
+}
 
-StatusOr<std::string> QueryPlan::prepared_query() const {
-  std::lock_guard<std::mutex> lock(mu_);
-  if (IsExpired()) {
-    return Status(StatusCode::kUnavailable, "Query plan has expired");
+void QueryPlan::RefreshQueryPlan(RefreshMode mode, Status error) {
+  {
+    std::unique_lock<std::mutex> lock_1(mu_);
+#ifdef GOOGLE_CLOUD_CPP_BIGTABLE_QUERY_PLAN_REFRESH_ASSERT
+    assert(waiting_threads_ >= 0);
+#endif
+    ++waiting_threads_;
+    cond_.wait(lock_1, [this] { return state_ != RefreshState::kPending; });
+    --waiting_threads_;
+#ifdef GOOGLE_CLOUD_CPP_BIGTABLE_QUERY_PLAN_REFRESH_ASSERT
+    assert(waiting_threads_ >= 0);
+#endif
+    if (state_ == RefreshState::kDone) return;
+    if (mode == RefreshMode::kInvalidated) response_ = std::move(error);
+    state_ = RefreshState::kPending;
+  }
+  auto response = refresh_fn_().get();
+  bool done = false;
+  {
+    std::unique_lock<std::mutex> lock_2(mu_);
+    response_ = std::move(response);
+    if (response_.ok()) {
+      state_ = RefreshState::kDone;
+      done = true;
+      // If we have to refresh an invalidated query plan, cancel any existing
+      // timer before starting a new one.
+      refresh_timer_.cancel();
+      ScheduleRefresh(lock_2);
+    } else {
+      // If there are no waiting threads that could call the refresh_fn, then
+      // we need to accept that the refresh is in a failed state and wait for
+      // some new event that would start this refresh process anew.
+      //
+      // If there are waiting threads, then we want to try again to get a
+      // refreshed query plan, but we want to avoid a stampede of refresh RPCs
+      // so we only notify one of the waiting threads.
+#ifdef GOOGLE_CLOUD_CPP_BIGTABLE_QUERY_PLAN_REFRESH_ASSERT
+      assert(waiting_threads_ >= 0);
+#endif
+      done = waiting_threads_ == 0;
+      // Waiters block until state_ != kPending, so a retry must go back to
+      // kBegin; leaving kPending here would strand the notified thread.
+      state_ = done ? RefreshState::kDone : RefreshState::kBegin;
+    }
+  }
+  if (done) {
+    cond_.notify_all();
+  } else {
+    cond_.notify_one();
   }
-  return response_.prepared_query();
 }
 
-StatusOr<google::bigtable::v2::ResultSetMetadata> QueryPlan::metadata() const {
-  std::lock_guard<std::mutex> lock(mu_);
-  if (IsExpired()) {
-    return Status(StatusCode::kUnavailable, "Query plan has expired");
+StatusOr<QueryPlan::ResponseData> QueryPlan::response_data() {
+  std::unique_lock<std::mutex> lock(mu_);
+  if (IsRefreshing(lock)) {
+    if (response_.ok()) {
+      return QueryPlan::ResponseData{response_->prepared_query(),
+                                     response_->metadata()};
+    }
+    lock.unlock();
+    RefreshQueryPlan(RefreshMode::kAlreadyRefreshing);
+    lock.lock();
+  }
+  // A failed refresh may leave state_ != kDone; never dereference an error.
+  if (!response_.ok()) {
+    return response_.status();
   }
-  return response_.metadata();
+
+  return QueryPlan::ResponseData{response_->prepared_query(),
+                                 response_->metadata()};
+}
+
+StatusOr<std::string> QueryPlan::prepared_query() {
+  auto data = response_data();
+  if (!data.ok()) return std::move(data).status();  // moves, no Status copy
+  return std::move(data->prepared_query);
+}
+
+StatusOr<google::bigtable::v2::ResultSetMetadata> QueryPlan::metadata() {
+  auto data = response_data();
+  if (!data.ok()) return std::move(data).status();  // moves, no Status copy
+  return std::move(data->metadata);
 }
 
 GOOGLE_CLOUD_CPP_INLINE_NAMESPACE_END
diff --git a/google/cloud/bigtable/internal/query_plan.h b/google/cloud/bigtable/internal/query_plan.h
@@ -18,6 +18,7 @@
 
 #include "google/cloud/bigtable/version.h"
 #include "google/cloud/completion_queue.h"
+#include "google/cloud/internal/clock.h"
 #include <google/bigtable/v2/bigtable.pb.h>
 #include <string>
 #include <utility>
@@ -30,46 +31,86 @@ GOOGLE_CLOUD_CPP_INLINE_NAMESPACE_BEGIN
 class QueryPlan : public std::enable_shared_from_this<QueryPlan> {
  public:
   // Typically, a lambda capturing the original PrepareQueryRequest and
-  // DataConnection pointer necessary to call the PrepareQuery RPC.
-  using RefreshFn = std::function<google::bigtable::v2::PrepareQueryResponse()>;
+  // DataConnection pointer necessary to call the AsyncPrepareQuery RPC.
+  using RefreshFn = std::function<
+      future<StatusOr<google::bigtable::v2::PrepareQueryResponse>>()>;
+
+  using Clock = ::google::cloud::internal::SystemClock;
 
   // Calls the constructor and then Initialize.
   static std::shared_ptr<QueryPlan> Create(
       CompletionQueue cq, google::bigtable::v2::PrepareQueryResponse response,
-      RefreshFn fn);
+      RefreshFn fn, std::shared_ptr<Clock> clock = std::make_shared<Clock>());
+
+  // Invalidates the current QueryPlan and triggers a refresh.
+  void Invalidate(Status status, std::string const& invalid_query_plan_id);
+
+  struct ResponseData {
+    std::string prepared_query;
+    google::bigtable::v2::ResultSetMetadata metadata;
+  };
+
+  // Accessor for the prepared_query and metadata fields in response_.
+  // Triggers a refresh if needed.
+  StatusOr<ResponseData> response_data();
 
-  // Accessor for the prepared_query field in response_.
-  StatusOr<std::string> prepared_query() const;
+  GOOGLE_CLOUD_CPP_DEPRECATED("Use response_data() instead")
+  StatusOr<std::string> prepared_query();
 
-  // Accessor for the metadata field in  response_.
-  StatusOr<google::bigtable::v2::ResultSetMetadata> metadata() const;
+  GOOGLE_CLOUD_CPP_DEPRECATED("Use response_data() instead")
+  StatusOr<google::bigtable::v2::ResultSetMetadata> metadata();
 
  private:
-  QueryPlan(CompletionQueue cq,
-            google::bigtable::v2::PrepareQueryResponse response, RefreshFn fn)
+  QueryPlan(CompletionQueue cq, std::shared_ptr<Clock> clock, RefreshFn fn,
+            google::bigtable::v2::PrepareQueryResponse response)
       : cq_(std::move(cq)),
-        response_(std::move(response)),
-        fn_(std::move(fn)) {}
-  static bool IsExpired();
+        clock_(std::move(clock)),
+        refresh_fn_(std::move(fn)),
+        response_(std::move(response)) {}
+
+  bool IsRefreshing(std::unique_lock<std::mutex> const&) const;
 
   // Performs the first call to ScheduleRefresh and any other initialization not
   // possible in the constructor.
-  void Initialize() {}
+  void Initialize();
 
   // Calls MakeDeadlineTimer on the CompletionQueue with a continuation lambda
   // capturing a std::weak_ptr to this that calls RefreshQueryPlan.
-  void ScheduleRefresh() {}
+  void ScheduleRefresh(std::unique_lock<std::mutex> const&);
 
+  enum class RefreshMode { kExpired, kInvalidated, kAlreadyRefreshing };
   // Performs the synchronization around calling RefreshFn and updating
   // response_.
-  void RefreshQueryPlan() {}
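+  // `error` is stored in response_ before the refresh RPC, but only when
+  // `mode` is kInvalidated; it is ignored for the other modes.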
+  void RefreshQueryPlan(RefreshMode mode, Status error = {});
+
+  void ExpiredRefresh();
+
+  // State machine where the only valid transitions are:
+  //   kDone -> kBegin
+  //   kBegin -> kPending
+  //   kPending -> kDone
+  // When refreshing the same previous query plan.
+  enum class RefreshState {
+    kBegin,    // waiting for a future thread to refresh response_
+    kPending,  // waiting for an active thread to refresh response_
+    kDone,     // response_ has been refreshed
+  };
+  RefreshState state_ = RefreshState::kDone;
 
   CompletionQueue cq_;
+  std::shared_ptr<Clock> clock_;
+  RefreshFn refresh_fn_;
   future<void> refresh_timer_;
   mutable std::mutex mu_;
   std::condition_variable cond_;
-  google::bigtable::v2::PrepareQueryResponse response_;  // GUARDED_BY(mu_)
-  RefreshFn fn_;
+  // waiting_threads_ is only a snapshot, but it helps us reduce the number of
+  // RPCs in flight to refresh the same query plan.
+  int waiting_threads_ = 0;
+  std::string old_query_plan_id_;
+  StatusOr<google::bigtable::v2::PrepareQueryResponse>
+      response_;  // GUARDED_BY(mu_)
 };
 
 GOOGLE_CLOUD_CPP_INLINE_NAMESPACE_END
diff --git a/google/cloud/bigtable/internal/query_plan_test.cc b/google/cloud/bigtable/internal/query_plan_test.cc