Merge branch 'master' into master

pervazea · web-flow · commit f3722345bc64 · 2018-02-05T12:43:46.000-05:00
diff --git a/src/brain/query_logger.cpp b/src/brain/query_logger.cpp
@@ -11,22 +11,40 @@
 //===----------------------------------------------------------------------===//
 
 #include "brain/query_logger.h"
+
 #include "catalog/query_history_catalog.h"
 #include "concurrency/transaction_context.h"
 #include "concurrency/transaction_manager_factory.h"
-#include "parser/pg_query.h"
 
 namespace peloton {
 namespace brain {
 
+QueryLogger::Fingerprint::Fingerprint(const std::string &query)
+    : query_(query),
+      fingerprint_(""),
+      fingerprint_result_(pg_query_fingerprint(query.c_str())) {
+  if (fingerprint_result_.hexdigest != nullptr) {
+    fingerprint_ = fingerprint_result_.hexdigest;
+  }
+}
+
+QueryLogger::Fingerprint::~Fingerprint() {
+  pg_query_free_fingerprint_result(fingerprint_result_);
+}
+
 void QueryLogger::LogQuery(std::string query_string, uint64_t timestamp) {
   auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance();
-  auto txn = txn_manager.BeginTransaction();
-  std::string fingerprint = pg_query_fingerprint(query_string.c_str()).hexdigest;
+  auto *txn = txn_manager.BeginTransaction();
+
+  // Perform fingerprint
+  Fingerprint fingerprint{query_string};
 
-  catalog::QueryHistoryCatalog::GetInstance()->InsertQueryHistory(
-      query_string, fingerprint, timestamp, nullptr, txn);
+  // Log query + fingerprint
+  auto &query_history_catalog = catalog::QueryHistoryCatalog::GetInstance();
+  query_history_catalog.InsertQueryHistory(
+      query_string, fingerprint.GetFingerprint(), timestamp, nullptr, txn);
 
+  // We're done
   txn_manager.CommitTransaction(txn);
 }
 
diff --git a/src/catalog/query_history_catalog.cpp b/src/catalog/query_history_catalog.cpp
@@ -6,25 +6,23 @@
 //
 // Identification: src/catalog/query_history_catalog.cpp
 //
-// Copyright (c) 2015-18, Carnegie Mellon University Database Group
+// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
 //
 //===----------------------------------------------------------------------===//
 
 #include "catalog/query_history_catalog.h"
 
 #include "catalog/catalog.h"
-#include "executor/logical_tile.h"
-#include "parser/pg_query.h"
 #include "storage/data_table.h"
 #include "type/value_factory.h"
 
 namespace peloton {
 namespace catalog {
 
-QueryHistoryCatalog *QueryHistoryCatalog::GetInstance(
+QueryHistoryCatalog &QueryHistoryCatalog::GetInstance(
     concurrency::TransactionContext *txn) {
   static QueryHistoryCatalog query_history_catalog{txn};
-  return &query_history_catalog;
+  return query_history_catalog;
 }
 
 QueryHistoryCatalog::QueryHistoryCatalog(concurrency::TransactionContext *txn)
@@ -36,22 +34,23 @@ QueryHistoryCatalog::QueryHistoryCatalog(concurrency::TransactionContext *txn)
                       "timestamp      TIMESTAMP NOT NULL);",
                       txn) {}
 
-QueryHistoryCatalog::~QueryHistoryCatalog() {}
+QueryHistoryCatalog::~QueryHistoryCatalog() = default;
 
-bool QueryHistoryCatalog::InsertQueryHistory(const std::string &query_string, 
-                                  std::string &fingerprint, uint64_t timestamp,
-                                  type::AbstractPool *pool,
-                                  concurrency::TransactionContext *txn) {
+bool QueryHistoryCatalog::InsertQueryHistory(
+    const std::string &query_string, const std::string &fingerprint,
+    uint64_t timestamp, type::AbstractPool *pool,
+    concurrency::TransactionContext *txn) {
   std::unique_ptr<storage::Tuple> tuple(
       new storage::Tuple(catalog_table_->GetSchema(), true));
 
-  auto val0 = type::ValueFactory::GetVarcharValue(query_string, pool);
-  auto val1 = type::ValueFactory::GetVarcharValue(fingerprint, pool);
+  auto val0 = type::ValueFactory::GetVarcharValue(query_string);
+  auto val1 = type::ValueFactory::GetVarcharValue(fingerprint);
   auto val2 = type::ValueFactory::GetTimestampValue(timestamp);
 
-  tuple->SetValue(ColumnId::QUERY_STRING, val0, pool);
-  tuple->SetValue(ColumnId::FINGERPRINT, val1, pool);
-  tuple->SetValue(ColumnId::TIMESTAMP, val2, pool);
+  tuple->SetValue(ColumnId::QUERY_STRING, val0,
+                  pool != nullptr ? pool : &pool_);
+  tuple->SetValue(ColumnId::FINGERPRINT, val1, pool != nullptr ? pool : &pool_);
+  tuple->SetValue(ColumnId::TIMESTAMP, val2, pool != nullptr ? pool : &pool_);
 
   // Insert the tuple
   return InsertTuple(std::move(tuple), txn);
diff --git a/src/include/brain/query_logger.h b/src/include/brain/query_logger.h
@@ -12,8 +12,11 @@
 
 #pragma once
 
+#include <cstdint>
 #include <string>
 
+#include "parser/pg_query.h"
+
 namespace peloton {
 namespace brain {
 
@@ -25,8 +28,28 @@ class QueryLogger {
  public:
   QueryLogger() = default;
 
-  /*
+  class Fingerprint {
+   public:
+    /// Constructor
+    explicit Fingerprint(const std::string &query);
+
+    /// Destructor
+    ~Fingerprint();
+
+    /// Accessors for the original query string and its fingerprint
+    const std::string &GetQueryString() { return query_; }
+    const std::string &GetFingerprint() { return fingerprint_; }
+
+   private:
+    // Query text, its fingerprint digest, and the raw pg_query result
+    std::string query_;
+    std::string fingerprint_;
+    PgQueryFingerprintResult fingerprint_result_;
+  };
+
+  /**
    * @brief This function logs the query into query_history_catalog
+   *
    * @param the sql string corresponding to the query
    * @param timestamp of the transaction that executed the query
    */
diff --git a/src/include/catalog/query_history_catalog.h b/src/include/catalog/query_history_catalog.h
@@ -6,7 +6,7 @@
 //
 // Identification: src/include/catalog/query_history_catalog.h
 //
-// Copyright (c) 2015-18, Carnegie Mellon University Database Group
+// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
 //
 //===----------------------------------------------------------------------===//
 
@@ -23,6 +23,7 @@
 #pragma once
 
 #include "catalog/abstract_catalog.h"
+#include "type/ephemeral_pool.h"
 
 #define QUERY_HISTORY_CATALOG_NAME "pg_query_history"
 
@@ -34,14 +35,14 @@ class QueryHistoryCatalog : public AbstractCatalog {
   ~QueryHistoryCatalog();
 
   // Global Singleton
-  static QueryHistoryCatalog *GetInstance(
+  static QueryHistoryCatalog &GetInstance(
       concurrency::TransactionContext *txn = nullptr);
 
   //===--------------------------------------------------------------------===//
   // write Related API
   //===--------------------------------------------------------------------===//
   bool InsertQueryHistory(const std::string &query_string,
-                          std::string &fingerprint, uint64_t timestamp,
+                          const std::string &fingerprint, uint64_t timestamp,
                           type::AbstractPool *pool,
                           concurrency::TransactionContext *txn);
 
@@ -54,6 +55,8 @@ class QueryHistoryCatalog : public AbstractCatalog {
  private:
   QueryHistoryCatalog(concurrency::TransactionContext *txn);
 
+  // Pool to use for variable length strings
+  type::EphemeralPool pool_;
 };
 
 }  // namespace catalog
diff --git a/src/optimizer/README.md b/src/optimizer/README.md
@@ -0,0 +1,58 @@
+## Optimizer Overview
+
+The optimizer follows the [Cascade](http://15721.courses.cs.cmu.edu/spring2018/papers/15-optimizer1/graefe-ieee1995.pdf) framework. It takes an annotated parse tree as input, goes through multiple optimization phases, and outputs an execution plan. The major phases are:
+* Parse tree to logical operator tree transformation.
+* Predicate push-down, which pushes predicates to the lowest possible operator to evaluate.
+* Unnesting, which turns arbitrary correlated subqueries into logical join operators.
+* Logical transformation, which enumerates all possible join orders.
+* Stats derivation, which derives stats needed to compute the cost for each group.
+* Physical implementation, which enumerates all possible implementations for a logical operator and costs them, e.g. hash join vs. nested-loop join.
+* Property enforcing, which adds missing properties describing the output format, e.g. sort order.
+* Operator to plan transformation, which turns the best physical operator tree into an execution plan.
+
+The rewrite phase consists of predicate push-down and unnesting; they will run once separately before later optimization phases. Transformation, stats derivation, physical implementation and property enforcing will not be separated: a new logical operator tree generated from a transformation rule is going to be implemented as a physical operator and costed immediately. This allows us to do pruning based on the cost to avoid enumerating inefficient plans. Suppose we are calculating the cost for an operator tree `((A join B join C) join D)`. If the cost of only joining the intermediate table `(A join B join C)` and the base table `D` already exceeds the cost of another operator tree in the same group, say, `((A join B) join (C join D))`, we'll do the pruning by avoiding join order enumeration for `(A join B join C)`. If we do transformation and implementation separately, the pruning is not possible, because when we're costing an operator tree after the implementation phase, the join orders of all its child groups have already been enumerated in the transformation phase.
+
+The entrance of the optimizer is `Optimizer::BuildPelotonPlanTree()` in [`optimizer.cpp`](https://github.com/chenboy/peloton/blob/optimizer_doc/src/optimizer/optimizer.cpp). We'll cover each phase in the following sections.
+
+## Parse Tree Transformation
+
+The first step in the optimizer is to transform a peloton parse tree into a logical operator tree which follows relational algebra. This is implemented in [`query_to_operator_transformer.cpp`](https://github.com/chenboy/peloton/blob/optimizer_doc/src/optimizer/query_to_operator_transformer.cpp). Most of the transformations are trivial since the parse tree is mostly structurally similar to a relational algebra tree. There are two things worth mentioning:
+* First, we extract conjunction expressions into a vector of expressions, annotate them with the table aliases that occur in the expression, and use separate predicate operators to store them. This allows us to implement rule-based predicate push-down much more easily.
+* Second, we transform correlated subqueries into `dependent join` and `mark join` introduced in [this paper](http://btw2017.informatik.uni-stuttgart.de/slidesandpapers/F1-10-37/paper_web.pdf) from Hyper. In the unnesting phase we'll transform dependent join into regular join operators.
+
+## Rewrite Phase
+
+The rewrite phase includes heuristic-based optimization passes that will be run once. For extensibility, we implement these passes as rule-based optimization. These passes assume the transformed tree is always better than the original tree so when a rule is applied, the new pattern will always replace the old one. 
+
+We implemented two different rewrite frameworks, top-down rewrite and bottom-up rewrite, in [`optimizer_task.cpp`](https://github.com/chenboy/peloton/blob/optimizer_doc/src/optimizer/optimizer_task.cpp). Both of them are designed for a single optimization pass. Basically, a top-down pass will start from the root operator, apply a set of rewrite rules until saturated, then move to the lower level to apply rules for those operators, whereas a bottom-up pass will first recursively apply rules for the current operator's children, then apply rules for the current operator. If any rule triggers for an operator in a bottom-up pass, we'll apply rules for its children again, since the children may have changed so rewrite rules may be applicable to them.
+
+The difference between top-down and bottom-up rewrite is that top-down rewrite may be less expressive, but is usually more efficient. Rewrite phases that could be done in a top-down pass probably could also be implemented using the bottom-up framework. For optimizations with a top-down nature, e.g. predicate push-down, a top-down pass will usually be more efficient since it only applies a set of rules for each operator once, whereas the bottom-up framework may apply rules for each operator multiple times, because when a rule is applied for an operator, we need to recursively apply rules for all its children.
+
+People who want to add new rewrite passes should specify a set of rules, pick a rewrite framework (top-down or bottom-up rewrite), and push the rewrite task to the task queue.
+
+Predicate push-down is a top-down rewrite pass. It pushes predicates through operators, and when a predicate cannot be pushed down further, the predicate is combined with the underlying operator.
+
+Unnesting is a bottom-up pass that eliminates dependent join. It uses a bunch of techniques mentioned in Patrick's report.
+
+## Cascade Style Optimization
+
+After the rewrite phase we'll get a logical operator tree. The next step is to feed the logical operator tree into a Cascade style query optimizer to generate the lowest cost operator tree. The implementation basically follows the [Columbia Optimizer paper](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.54.1153&rep=rep1&type=pdf); we'll add more details in the documentation, but for now please just take the paper as reference. The tasks are implemented in [`optimizer_task.cpp`](https://github.com/chenboy/peloton/blob/optimizer_doc/src/optimizer/optimizer_task.cpp).
+
+There's one task `DeriveStats` that is not mentioned in the Columbia paper. We follow the [Orca paper](http://15721.courses.cs.cmu.edu/spring2018/papers/15-optimizer1/p337-soliman.pdf) to derive stats for each group on the fly. When a new group is generated, we'll recursively collect stats for the columns used in the root operator's predicate (when a new group is generated, there's only one expression in the group, thus only one root operator), compute stats for the root group and cache those stats in the group so that we only derive stats for columns we need, which is efficient for both time and space.
+
+The design of property enforcing also follows Orca rather than Columbia. We'll add enforcers after applying physical rules.
+
+## Operator to plan transformation
+
+When all the optimizations are done, we'll pick the lowest cost operator tree and use it to generate an execution plan.
+
+We'll first derive the output columns, which are a vector of expressions, for the root group based on the query's select list, then we'll recursively derive output columns for each operator (implemented in [`input_column_deriver.cpp`](https://github.com/chenboy/peloton/blob/optimizer_doc/src/optimizer/input_column_deriver.cpp)) based on columns used in each operator. Finally, we generate peloton plan nodes using operators in the best operator tree and the input/output column pairs for these operators. What we do here is basically setting column offsets for the plan nodes, e.g. output column offset, sort column's offset and column offset in the predicates (implemented in [`plan_generator.cpp`](https://github.com/chenboy/peloton/blob/optimizer_doc/src/optimizer/plan_generator.cpp)).
+
+## WIP
+
+There is still a lot of interesting work that needs to be implemented, including:
+* Join order enumeration. 
+* Expression rewrite, my current thought is it should be done in the binder after annotating expressions or in the optimizer before predicate push-down.
+* Implement sampling-based stats derivation and cost calculation
+* Support unnesting arbitrary queries so that we can support a wider range of queries in TPC-H. This would need the codegen engine to support `semi join`, `anti-semi join`, `mark join`, `single join`.
+* More documentation.
diff --git a/test/brain/query_logger_test.cpp b/test/brain/query_logger_test.cpp
@@ -6,43 +6,45 @@
 //
 // Identification: test/brain/query_logger_test.cpp
 //
-// Copyright (c) 2015-16, Carnegie Mellon University Database Group
+// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
 //
 //===----------------------------------------------------------------------===//
 
 #include "common/harness.h"
+
+#include "brain/query_logger.h"
 #include "sql/testing_sql_util.h"
 #include "settings/settings_manager.h"
-#include "parser/pg_query.h"
-
-using std::vector;
-using std::string;
 
 namespace peloton {
 namespace test {
 
 class QueryLoggerTests : public PelotonTest {
  protected:
-  virtual void SetUp() override {
+  void SetUp() override {
     settings::SettingsManager::SetBool(settings::SettingId::brain, true);
     PelotonInit::Initialize();
 
     // query to check that logging is done
     select_query_ =
         "SELECT query_string, fingerprint FROM pg_catalog.pg_query_history;";
-    select_query_fingerprint_ =
-        pg_query_fingerprint(select_query_.c_str()).hexdigest;
+
+    brain::QueryLogger::Fingerprint fingerprint{select_query_};
+    select_query_fingerprint_ = fingerprint.GetFingerprint();
+
     wait_time_ = 2;
   }
 
-  virtual void TearDown() override { PelotonInit::Shutdown(); }
+  void TearDown() override { PelotonInit::Shutdown(); }
 
   // Executes the given query and then checks if the queries that are executed
   // till now are actually logged
-  void TestSimpleUtil(string const &test_query,
-                      vector<std::string> &expected_result) {
-    string test_query_fingerprint =
-        pg_query_fingerprint(test_query.c_str()).hexdigest;
+  void TestSimpleUtil(std::string const &test_query,
+                      std::vector<std::string> &expected_result) {
+    brain::QueryLogger::Fingerprint fingerprint{test_query};
+
+    std::string test_query_fingerprint = fingerprint.GetFingerprint();
+
     expected_result.push_back(test_query + "|" + test_query_fingerprint);
     TestingSQLUtil::ExecuteSQLQuery(test_query.c_str());
 
@@ -57,14 +59,16 @@ class QueryLoggerTests : public PelotonTest {
   }
 
   // Executes the given query and then checks if the queries that are executed
-  // till now are actually logged only when the transaction commits. Otherwise
+  // until now are actually logged only when the transaction commits. Otherwise
   // stores to queries for checking this later when commit happens.
-  void TestTransactionUtil(string const &test_query,
-                           vector<std::string> &expected_result,
+  void TestTransactionUtil(std::string const &test_query,
+                           std::vector<std::string> &expected_result,
                            bool committed) {
-    static vector<std::string> temporary_expected_result;
-    string test_query_fingerprint =
-        pg_query_fingerprint(test_query.c_str()).hexdigest;
+    static std::vector<std::string> temporary_expected_result;
+
+    brain::QueryLogger::Fingerprint fingerprint{test_query};
+    std::string test_query_fingerprint = fingerprint.GetFingerprint();
+
     temporary_expected_result.push_back(test_query + "|" +
                                         test_query_fingerprint);
     TestingSQLUtil::ExecuteSQLQuery(test_query.c_str());
@@ -100,15 +104,20 @@ class QueryLoggerTests : public PelotonTest {
   }
 
  protected:
-  string select_query_;  // fixed query to check the queries logged in the table
-  string select_query_fingerprint_;  // fingerprint for the fixed query
-  int wait_time_;  // time to wait in seconds for the query to log into the
-                   // table
+  // fixed query to check the queries logged in the table
+  std::string select_query_;
+
+  // fingerprint for the fixed query
+  std::string select_query_fingerprint_;
+
+  // time to wait in seconds for the query to log into the table
+  int wait_time_;
 };
 
 // Testing the functionality of query logging
 TEST_F(QueryLoggerTests, QueriesTest) {
-  vector<std::string> expected_result;  // used to store the expected result
+  // used to store the expected result
+  std::vector<std::string> expected_result;
 
   // create the table and do some inserts and check
   TestSimpleUtil("CREATE TABLE test(a INT);", expected_result);