Skip to content
This repository was archived by the owner on Sep 27, 2019. It is now read-only.

Commit f372234

Browse files
authored
Merge branch 'master' into master
2 parents dd2cdfd + 366bcf0 commit f372234

File tree

6 files changed

+158
-48
lines changed

6 files changed

+158
-48
lines changed

src/brain/query_logger.cpp

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,22 +11,40 @@
1111
//===----------------------------------------------------------------------===//
1212

1313
#include "brain/query_logger.h"
14+
1415
#include "catalog/query_history_catalog.h"
1516
#include "concurrency/transaction_context.h"
1617
#include "concurrency/transaction_manager_factory.h"
17-
#include "parser/pg_query.h"
1818

1919
namespace peloton {
2020
namespace brain {
2121

22+
QueryLogger::Fingerprint::Fingerprint(const std::string &query)
23+
: query_(query),
24+
fingerprint_(""),
25+
fingerprint_result_(pg_query_fingerprint(query.c_str())) {
26+
if (fingerprint_result_.hexdigest != nullptr) {
27+
fingerprint_ = fingerprint_result_.hexdigest;
28+
}
29+
}
30+
31+
QueryLogger::Fingerprint::~Fingerprint() {
32+
pg_query_free_fingerprint_result(fingerprint_result_);
33+
}
34+
2235
void QueryLogger::LogQuery(std::string query_string, uint64_t timestamp) {
2336
auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance();
24-
auto txn = txn_manager.BeginTransaction();
25-
std::string fingerprint = pg_query_fingerprint(query_string.c_str()).hexdigest;
37+
auto *txn = txn_manager.BeginTransaction();
38+
39+
// Perform fingerprint
40+
Fingerprint fingerprint{query_string};
2641

27-
catalog::QueryHistoryCatalog::GetInstance()->InsertQueryHistory(
28-
query_string, fingerprint, timestamp, nullptr, txn);
42+
// Log query + fingerprint
43+
auto &query_history_catalog = catalog::QueryHistoryCatalog::GetInstance();
44+
query_history_catalog.InsertQueryHistory(
45+
query_string, fingerprint.GetFingerprint(), timestamp, nullptr, txn);
2946

47+
// We're done
3048
txn_manager.CommitTransaction(txn);
3149
}
3250

src/catalog/query_history_catalog.cpp

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,25 +6,23 @@
66
//
77
// Identification: src/catalog/query_history_catalog.cpp
88
//
9-
// Copyright (c) 2015-18, Carnegie Mellon University Database Group
9+
// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
1010
//
1111
//===----------------------------------------------------------------------===//
1212

1313
#include "catalog/query_history_catalog.h"
1414

1515
#include "catalog/catalog.h"
16-
#include "executor/logical_tile.h"
17-
#include "parser/pg_query.h"
1816
#include "storage/data_table.h"
1917
#include "type/value_factory.h"
2018

2119
namespace peloton {
2220
namespace catalog {
2321

24-
QueryHistoryCatalog *QueryHistoryCatalog::GetInstance(
22+
QueryHistoryCatalog &QueryHistoryCatalog::GetInstance(
2523
concurrency::TransactionContext *txn) {
2624
static QueryHistoryCatalog query_history_catalog{txn};
27-
return &query_history_catalog;
25+
return query_history_catalog;
2826
}
2927

3028
QueryHistoryCatalog::QueryHistoryCatalog(concurrency::TransactionContext *txn)
@@ -36,22 +34,23 @@ QueryHistoryCatalog::QueryHistoryCatalog(concurrency::TransactionContext *txn)
3634
"timestamp TIMESTAMP NOT NULL);",
3735
txn) {}
3836

39-
QueryHistoryCatalog::~QueryHistoryCatalog() {}
37+
QueryHistoryCatalog::~QueryHistoryCatalog() = default;
4038

41-
bool QueryHistoryCatalog::InsertQueryHistory(const std::string &query_string,
42-
std::string &fingerprint, uint64_t timestamp,
43-
type::AbstractPool *pool,
44-
concurrency::TransactionContext *txn) {
39+
bool QueryHistoryCatalog::InsertQueryHistory(
40+
const std::string &query_string, const std::string &fingerprint,
41+
uint64_t timestamp, type::AbstractPool *pool,
42+
concurrency::TransactionContext *txn) {
4543
std::unique_ptr<storage::Tuple> tuple(
4644
new storage::Tuple(catalog_table_->GetSchema(), true));
4745

48-
auto val0 = type::ValueFactory::GetVarcharValue(query_string, pool);
49-
auto val1 = type::ValueFactory::GetVarcharValue(fingerprint, pool);
46+
auto val0 = type::ValueFactory::GetVarcharValue(query_string);
47+
auto val1 = type::ValueFactory::GetVarcharValue(fingerprint);
5048
auto val2 = type::ValueFactory::GetTimestampValue(timestamp);
5149

52-
tuple->SetValue(ColumnId::QUERY_STRING, val0, pool);
53-
tuple->SetValue(ColumnId::FINGERPRINT, val1, pool);
54-
tuple->SetValue(ColumnId::TIMESTAMP, val2, pool);
50+
tuple->SetValue(ColumnId::QUERY_STRING, val0,
51+
pool != nullptr ? pool : &pool_);
52+
tuple->SetValue(ColumnId::FINGERPRINT, val1, pool != nullptr ? pool : &pool_);
53+
tuple->SetValue(ColumnId::TIMESTAMP, val2, pool != nullptr ? pool : &pool_);
5554

5655
// Insert the tuple
5756
return InsertTuple(std::move(tuple), txn);

src/include/brain/query_logger.h

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,11 @@
1212

1313
#pragma once
1414

15+
#include <cstdint>
1516
#include <string>
1617

18+
#include "parser/pg_query.h"
19+
1720
namespace peloton {
1821
namespace brain {
1922

@@ -25,8 +28,28 @@ class QueryLogger {
2528
public:
2629
QueryLogger() = default;
2730

28-
/*
31+
class Fingerprint {
32+
public:
33+
/// Constructor
34+
explicit Fingerprint(const std::string &query);
35+
36+
/// Destructor
37+
~Fingerprint();
38+
39+
/// Get original string
40+
const std::string &GetQueryString() { return query_; }
41+
const std::string &GetFingerprint() { return fingerprint_; }
42+
43+
private:
44+
// Accessors
45+
std::string query_;
46+
std::string fingerprint_;
47+
PgQueryFingerprintResult fingerprint_result_;
48+
};
49+
50+
/**
2951
* @brief This function logs the query into query_history_catalog
52+
*
3053
* @param the sql string corresponding to the query
3154
* @param timestamp of the transaction that executed the query
3255
*/

src/include/catalog/query_history_catalog.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
//
77
// Identification: src/include/catalog/query_history_catalog.h
88
//
9-
// Copyright (c) 2015-18, Carnegie Mellon University Database Group
9+
// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
1010
//
1111
//===----------------------------------------------------------------------===//
1212

@@ -23,6 +23,7 @@
2323
#pragma once
2424

2525
#include "catalog/abstract_catalog.h"
26+
#include "type/ephemeral_pool.h"
2627

2728
#define QUERY_HISTORY_CATALOG_NAME "pg_query_history"
2829

@@ -34,14 +35,14 @@ class QueryHistoryCatalog : public AbstractCatalog {
3435
~QueryHistoryCatalog();
3536

3637
// Global Singleton
37-
static QueryHistoryCatalog *GetInstance(
38+
static QueryHistoryCatalog &GetInstance(
3839
concurrency::TransactionContext *txn = nullptr);
3940

4041
//===--------------------------------------------------------------------===//
4142
// write Related API
4243
//===--------------------------------------------------------------------===//
4344
bool InsertQueryHistory(const std::string &query_string,
44-
std::string &fingerprint, uint64_t timestamp,
45+
const std::string &fingerprint, uint64_t timestamp,
4546
type::AbstractPool *pool,
4647
concurrency::TransactionContext *txn);
4748

@@ -54,6 +55,8 @@ class QueryHistoryCatalog : public AbstractCatalog {
5455
private:
5556
QueryHistoryCatalog(concurrency::TransactionContext *txn);
5657

58+
// Pool to use for variable length strings
59+
type::EphemeralPool pool_;
5760
};
5861

5962
} // namespace catalog

src/optimizer/README.md

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
## Optimizer Overview
2+
3+
The optimizer follows the [Cascade](http://15721.courses.cs.cmu.edu/spring2018/papers/15-optimizer1/graefe-ieee1995.pdf) framework, it takes an annotated parse tree as input, goes through multiple optimization phases, and output an execution plan. The major phases are:
4+
* Parse tree to logical operator tree transformation.
5+
* Predicate push-down, which pushes predicates to the lowest possible operator to evaluate.
6+
* Unnesting, which turns arbitary correlated subqeries into logical join operator.
7+
* Logical transformation, which enumerates all possible join orders.
8+
* Stats derivation, which derives stats needed to compute the cost for each group.
9+
* Phyisical implementation, which enumerates all possible implementation for a logical operator and cost them, e.g. hash join v.s. nested-loop join
10+
* Property enforcing, which adds missing properties descirbing the output format, e.g. sort order.
11+
* Operator to plan transformation, which turns the best physical operator tree into an execution plan.
12+
13+
The rewrite phase consists of predicate push-down and unnesting, they will run once separately before later optimization phases. Transformation, stats derivation, physical implementation and property enforcing will not be separated, a new logical operator tree generated from a transformation rule is going to be implemented as a physical operator and cost immediately, this allows us to do pruning based on the cost to avoid enumerating inefficient plans. Consider we are calculating cost for an operator tree `((A join B join C) join D)`. If the cost of only joining the intermidiate table `(A join B join C)` and the base table `D` already exceeded the cost of another operator tree in the same group, say, `((A join B) join (C join D))`, we'll do the pruning by avoiding join order enumeration for `(A join B join C)`. If we do transformation and implementation separately, the pruning is not possible, because when we're costing an operator tree after the implementation phase, the join order of all it's child groups have already been enumerated in the transformation phase.
14+
15+
The entrance of the optimizer is `Optimizer::BuildPelotonPlanTree()` in [`optimizer.cpp`](https://github.com/chenboy/peloton/blob/optimizer_doc/src/optimizer/optimizer.cpp). We'll cover each phase in the following sections.
16+
17+
## Parse Tree Transformation
18+
19+
The first step in the optimizer is to transform a peloton parse tree into a logical operator tree which follows relational algreba. This is implemented in [`query_to_operator_transformer.cpp`](https://github.com/chenboy/peloton/blob/optimizer_doc/src/optimizer/query_to_operator_transformer.cpp). Most of the transformations are trivial since the parse tree is mostly structurally similar to a relational algreba tree. There are two things worth mentioning:
20+
* First, we extract conjunction expressions into a vector of expressions, annotate them with the table aliases occurred in the expression, and uses a separate predicate operators to store them. This will allow us to implement rule-based predicate push-down much easier.
21+
* Second, we transform correlated subqueries into `dependent join` and `mark join` intrudoced in [this paper](http://btw2017.informatik.uni-stuttgart.de/slidesandpapers/F1-10-37/paper_web.pdf) from Hyper, in the unnesting phase we'll transform dependent join into regular join operators.
22+
23+
## Rewrite Phase
24+
25+
The rewrite phase includes heuristic-based optimization passes that will be run once. For extensibility, we implement these passes as rule-based optimization. These passes assume the transformed tree is always better than the original tree so when a rule is applied, the new pattern will always replace the old one.
26+
27+
We implemented two different rewrite frameworks, top-down rewrite and bottom-up rewrite in [`optimizer_task.cpp`](https://github.com/chenboy/peloton/blob/optimizer_doc/src/optimizer/optimizer_task.cpp). Both of them are designed for a single optimization pass. Basically, a top-down pass will start from the root operator, apply a set of rewrite rules until saturated, then move to lower level to apply rules for those operators, whereas a bottom-up pass will first recursively apply rules for current operator's children, then apply rules for the current operator. If any rule triggers for an operator in a bottom-up pass, we'll apply rules for its child again, since the children may have changed so rewrite rules may be applicable to them.
28+
29+
The difference between top-down and bottom-up rewrite is top-down rewrite may have less ability of expression, but usually is more efficient. Rewrite phases that could be done in a top-down pass probably could also be implemented using bottom-up framework. For optimizations with top-down nature, e.g. predicate push-down, a top-down pass usually will be more efficient since it only applies a set of rules for each operator once, but bottom-up framework may apply for each operator multiple time because when a rule is applied for an operator, we need to recurisively apply rules for all its children.
30+
31+
For people want to add new rewrite passes, they should specify a set of rules, pick a rewrite framework from top-down rewrite and bottom-up rewrite and push the rewrite task to the task queue.
32+
33+
Predicate push-down is a top-down rewrite pass, what it does is to push predicate throgh operators and when the predicate cannot be further push-down, the predicate is combined with the underlying operator.
34+
35+
Unnesting is a bottom-up pass that eliminates dependent join. It uses a bunch of techniques mentioned in Patrick's report.
36+
37+
## Cascade Style Optimization
38+
39+
After the rewrite phase we'll get a logical operator tree, the next step is to feed the logical operator tree into a Cascade style query optimizer to generate the lowest cost operator tree. The implementation basically follows the [Columbia Optimizer paper](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.54.1153&rep=rep1&type=pdf), we'll add more details in the documentation, for now please just take the paper as reference. The tasks are implemented in [`optimizer_task.cpp`](https://github.com/chenboy/peloton/blob/optimizer_doc/src/optimizer/optimizer_task.cpp).
40+
41+
There's one task `DeriveStats` that is not mentioned in the Columnbia paper. We follow the [Orca paper](http://15721.courses.cs.cmu.edu/spring2018/papers/15-optimizer1/p337-soliman.pdf) to derive stats for each group on the fly. When a new group is generated, we'll recursively collect stats for the column used in the root operator's predicate (When a new group is generated, there's only one expression in the group, thus only one root operator), compute stats for the root group and cache those stats in the group so that we only derive stats for columns we need, which is efficient for both time and space.
42+
43+
The design of property enforcing also follows Orca rather than Columbia. We'll add enforcers after applying physical rules.
44+
45+
## Operator to plan transformation
46+
47+
When all the optimizations are done, we'll pick the lowest cost operator tree and use it to generate an execution plan.
48+
49+
We'll first derive the output column, which are a vector of expressions, for the root group based on the query's select list, then we'll recursively derive output columns for each operator (implemented in [`input_column_deriver.cpp`](https://github.com/chenboy/peloton/blob/optimizer_doc/src/optimizer/input_column_deriver.cpp)) based on columns used in each operator. At last, we generate peloton plan node using operators in the best operator tree and the input/output column pairs for these operators. What we do here is basically setting column offset for the plan nodes, e.g. output column offset, sort column's offset and column offset in the predicates (implemented in [`plan_generator.cpp`](https://github.com/chenboy/peloton/blob/optimizer_doc/src/optimizer/plan_generator.cpp)).
50+
51+
## WIP
52+
53+
There are still a lot of interesting work needed to be implemented, including:
54+
* Join order enumeration.
55+
* Expression rewrite, my current thought is it should be done in the binder after annotating expressions or in the optimizer before predicate push-down.
56+
* Implement sampling-based stats derivation and cost calculation
57+
* Support unnesting arbitary queries so that we can support a wider range of queries in TPC-H. This would need the codegen engine to support `semi join`, `anti-semi join`, `mark join`, `single join`.
58+
* More documentations.

test/brain/query_logger_test.cpp

Lines changed: 33 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -6,43 +6,45 @@
66
//
77
// Identification: test/brain/query_logger_test.cpp
88
//
9-
// Copyright (c) 2015-16, Carnegie Mellon University Database Group
9+
// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
1010
//
1111
//===----------------------------------------------------------------------===//
1212

1313
#include "common/harness.h"
14+
15+
#include "brain/query_logger.h"
1416
#include "sql/testing_sql_util.h"
1517
#include "settings/settings_manager.h"
16-
#include "parser/pg_query.h"
17-
18-
using std::vector;
19-
using std::string;
2018

2119
namespace peloton {
2220
namespace test {
2321

2422
class QueryLoggerTests : public PelotonTest {
2523
protected:
26-
virtual void SetUp() override {
24+
void SetUp() override {
2725
settings::SettingsManager::SetBool(settings::SettingId::brain, true);
2826
PelotonInit::Initialize();
2927

3028
// query to check that logging is done
3129
select_query_ =
3230
"SELECT query_string, fingerprint FROM pg_catalog.pg_query_history;";
33-
select_query_fingerprint_ =
34-
pg_query_fingerprint(select_query_.c_str()).hexdigest;
31+
32+
brain::QueryLogger::Fingerprint fingerprint{select_query_};
33+
select_query_fingerprint_ = fingerprint.GetFingerprint();
34+
3535
wait_time_ = 2;
3636
}
3737

38-
virtual void TearDown() override { PelotonInit::Shutdown(); }
38+
void TearDown() override { PelotonInit::Shutdown(); }
3939

4040
// Executes the given query and then checks if the queries that are executed
4141
// till now are actually logged
42-
void TestSimpleUtil(string const &test_query,
43-
vector<std::string> &expected_result) {
44-
string test_query_fingerprint =
45-
pg_query_fingerprint(test_query.c_str()).hexdigest;
42+
void TestSimpleUtil(std::string const &test_query,
43+
std::vector<std::string> &expected_result) {
44+
brain::QueryLogger::Fingerprint fingerprint{test_query};
45+
46+
std::string test_query_fingerprint = fingerprint.GetFingerprint();
47+
4648
expected_result.push_back(test_query + "|" + test_query_fingerprint);
4749
TestingSQLUtil::ExecuteSQLQuery(test_query.c_str());
4850

@@ -57,14 +59,16 @@ class QueryLoggerTests : public PelotonTest {
5759
}
5860

5961
// Executes the given query and then checks if the queries that are executed
60-
// till now are actually logged only when the transaction commits. Otherwise
62+
// until now are actually logged only when the transaction commits. Otherwise
6163
// stores to queries for checking this later when commit happens.
62-
void TestTransactionUtil(string const &test_query,
63-
vector<std::string> &expected_result,
64+
void TestTransactionUtil(std::string const &test_query,
65+
std::vector<std::string> &expected_result,
6466
bool committed) {
65-
static vector<std::string> temporary_expected_result;
66-
string test_query_fingerprint =
67-
pg_query_fingerprint(test_query.c_str()).hexdigest;
67+
static std::vector<std::string> temporary_expected_result;
68+
69+
brain::QueryLogger::Fingerprint fingerprint{test_query};
70+
std::string test_query_fingerprint = fingerprint.GetFingerprint();
71+
6872
temporary_expected_result.push_back(test_query + "|" +
6973
test_query_fingerprint);
7074
TestingSQLUtil::ExecuteSQLQuery(test_query.c_str());
@@ -100,15 +104,20 @@ class QueryLoggerTests : public PelotonTest {
100104
}
101105

102106
protected:
103-
string select_query_; // fixed query to check the queries logged in the table
104-
string select_query_fingerprint_; // fingerprint for the fixed query
105-
int wait_time_; // time to wait in seconds for the query to log into the
106-
// table
107+
// fixed query to check the queries logged in the table
108+
std::string select_query_;
109+
110+
// fingerprint for the fixed query
111+
std::string select_query_fingerprint_;
112+
113+
// time to wait in seconds for the query to log into the table
114+
int wait_time_;
107115
};
108116

109117
// Testing the functionality of query logging
110118
TEST_F(QueryLoggerTests, QueriesTest) {
111-
vector<std::string> expected_result; // used to store the expected result
119+
// used to store the expected result
120+
std::vector<std::string> expected_result;
112121

113122
// create the table and do some inserts and check
114123
TestSimpleUtil("CREATE TABLE test(a INT);", expected_result);

0 commit comments

Comments
 (0)