Skip to content
This repository was archived by the owner on Sep 27, 2019. It is now read-only.

Commit 078a438

Browse files
pmenonapavlo
authored and committed
Support COPY for CSV files (#1371)
* Add more information to CopyStatement. Cleaned up includes. * Add CSVScan node to ToString and FromString. Removed BINARY external format for now. * Move COPY from DDL to DML processing. COPY now goes through planner/optimization. * Propagate external file information * Removed unused serialization stuff from plan nodes * Codegen can now have constant generic/opaque bytes in module * When no columns specified during copy, all columns are inserted * Added function to throw exception with ill-formatted input string when converting to number * Removed serialization * Added input functions in preparation to read table data from files * All SQL types must now provide an input function to convert a string into a SQL type * Added test for value integrity * First take at CSV Scan translator * Fix after rebase * file api * CSV scanner reads lines * Process CSV line in scanner * Free memory when re-allocating line buffer * Added memcmp to codegen interface. Renamed CallPrintf() to Printf(). * Cleaned up CSV scan translator. Added null checking. * Moved TupleRuntime::CreateVarlen() into ValuesRuntime::WriteVarlen(). Better code organization and clearer name. * Added error handling for long columns. Added null-terminator byte for when read-buffers are copied to line-buffers. * Added inputs for decimal types * Moved type-specific functions into function namespace * REALLY simple Date support * Compile fixes for GCC 6+ * Get string inputs working * Beefed up tests * Simple CSV scan test * Updated optimize to continue support for old/weird/strange AF copy executor * Extracted implementation into CPP file for plan node * Propagate file options through optimization. * Added codegen.cpp to source validator whitelist, since we have the ability to call printf() from codegen for debug. * Beefed up overflow checks in NumericRuntime. * Fixed tests.
* Fixes after rebase * Simple function to convert tuple to string CSV * Fix void* -> i8* conversion * More tests * Address reviews * Revert "Removed serialization" This reverts commit d055ff9. * Revert "Removed unused serialization stuff from plan nodes" This reverts commit 74427c7. * Beefed up tests, which caught more bugs * Fix tests * Reducing copying overhead for columns, constraints and loop variables during CheckConstraints(). We were spending 50% of our time here during bulk insertions into wide tables due to unnecessary copying!
1 parent 9e5be9b commit 078a438

File tree

133 files changed

+4282
-1057
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

133 files changed

+4282
-1057
lines changed

script/validators/source_validator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,12 +58,12 @@
5858
"src/network/protocol.cpp",
5959
"src/include/common/macros.h",
6060
"src/common/stack_trace.cpp",
61-
"src/include/parser/sql_scanner.h", # There is a free() in comments
6261
"src/include/index/bloom_filter.h",
6362
"src/include/index/compact_ints_key.h",
6463
"src/include/index/bwtree.h",
6564
"src/codegen/util/oa_hash_table.cpp",
66-
"src/codegen/util/cc_hash_table.cpp"
65+
"src/codegen/util/cc_hash_table.cpp",
66+
"src/codegen/codegen.cpp", # We allow calling printf() from codegen for debugging
6767
]
6868

6969
## ==============================================

src/binder/bind_node_visitor.cpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,19 @@ void BindNodeVisitor::Visit(parser::DeleteStatement *node) {
166166
}
167167

168168
void BindNodeVisitor::Visit(parser::LimitDescription *) {}
169-
void BindNodeVisitor::Visit(parser::CopyStatement *) {}
169+
170+
// Binds a COPY statement. Replaces context_ with a newly created
// BinderContext, then takes one of two paths:
//  - table given: visits the table reference and asks the context to generate
//    expressions for all of its columns into node->select_list;
//  - no table: visits node->select_stmt instead.
void BindNodeVisitor::Visit(parser::CopyStatement *node) {
171+
context_ = std::make_shared<BinderContext>(nullptr);
172+
if (node->table != nullptr) {
173+
node->table->Accept(this);
174+
175+
// If the table is given, we're either writing or reading all columns
176+
context_->GenerateAllColumnExpressions(node->select_list);
177+
} else {
178+
// NOTE(review): select_stmt is dereferenced without a null check; this
// presumably relies on the parser guaranteeing that a COPY without a table
// always carries a SELECT -- confirm against the parser.
node->select_stmt->Accept(this);
179+
}
180+
}
181+
170182
void BindNodeVisitor::Visit(parser::CreateFunctionStatement *) {}
171183
void BindNodeVisitor::Visit(parser::CreateStatement *node) {
172184
node->TryBindDatabaseName(default_database_name_);

src/catalog/abstract_catalog.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#include "executor/plan_executor.h"
3636
#include "executor/seq_scan_executor.h"
3737
#include "executor/update_executor.h"
38+
#include "expression/constant_value_expression.h"
3839

3940
#include "storage/database.h"
4041
#include "storage/storage_manager.h"

src/catalog/catalog.cpp

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
#include "codegen/code_context.h"
3131
#include "concurrency/transaction_manager_factory.h"
3232
#include "function/date_functions.h"
33-
#include "function/decimal_functions.h"
33+
#include "function/numeric_functions.h"
3434
#include "function/old_engine_string_functions.h"
3535
#include "function/timestamp_functions.h"
3636
#include "index/index_factory.h"
@@ -1283,43 +1283,43 @@ void Catalog::InitializeFunctions() {
12831283
AddBuiltinFunction("abs", {type::TypeId::DECIMAL}, type::TypeId::DECIMAL,
12841284
internal_lang, "Abs",
12851285
function::BuiltInFuncType{
1286-
OperatorId::Abs, function::DecimalFunctions::_Abs},
1286+
OperatorId::Abs, function::NumericFunctions::_Abs},
12871287
txn);
12881288
AddBuiltinFunction(
12891289
"sqrt", {type::TypeId::TINYINT}, type::TypeId::DECIMAL, internal_lang,
12901290
"Sqrt",
12911291
function::BuiltInFuncType{OperatorId::Sqrt,
1292-
function::DecimalFunctions::Sqrt},
1292+
function::NumericFunctions::Sqrt},
12931293
txn);
12941294
AddBuiltinFunction(
12951295
"sqrt", {type::TypeId::SMALLINT}, type::TypeId::DECIMAL,
12961296
internal_lang, "Sqrt",
12971297
function::BuiltInFuncType{OperatorId::Sqrt,
1298-
function::DecimalFunctions::Sqrt},
1298+
function::NumericFunctions::Sqrt},
12991299
txn);
13001300
AddBuiltinFunction(
13011301
"sqrt", {type::TypeId::INTEGER}, type::TypeId::DECIMAL, internal_lang,
13021302
"Sqrt",
13031303
function::BuiltInFuncType{OperatorId::Sqrt,
1304-
function::DecimalFunctions::Sqrt},
1304+
function::NumericFunctions::Sqrt},
13051305
txn);
13061306
AddBuiltinFunction(
13071307
"sqrt", {type::TypeId::BIGINT}, type::TypeId::DECIMAL, internal_lang,
13081308
"Sqrt",
13091309
function::BuiltInFuncType{OperatorId::Sqrt,
1310-
function::DecimalFunctions::Sqrt},
1310+
function::NumericFunctions::Sqrt},
13111311
txn);
13121312
AddBuiltinFunction(
13131313
"sqrt", {type::TypeId::DECIMAL}, type::TypeId::DECIMAL, internal_lang,
13141314
"Sqrt",
13151315
function::BuiltInFuncType{OperatorId::Sqrt,
1316-
function::DecimalFunctions::Sqrt},
1316+
function::NumericFunctions::Sqrt},
13171317
txn);
13181318
AddBuiltinFunction(
13191319
"floor", {type::TypeId::DECIMAL}, type::TypeId::DECIMAL,
13201320
internal_lang, "Floor",
13211321
function::BuiltInFuncType{OperatorId::Floor,
1322-
function::DecimalFunctions::_Floor},
1322+
function::NumericFunctions::_Floor},
13231323
txn);
13241324

13251325
/**
@@ -1328,126 +1328,126 @@ void Catalog::InitializeFunctions() {
13281328
AddBuiltinFunction("abs", {type::TypeId::TINYINT}, type::TypeId::TINYINT,
13291329
internal_lang, "Abs",
13301330
function::BuiltInFuncType{
1331-
OperatorId::Abs, function::DecimalFunctions::_Abs},
1331+
OperatorId::Abs, function::NumericFunctions::_Abs},
13321332
txn);
13331333

13341334
AddBuiltinFunction("abs", {type::TypeId::SMALLINT},
13351335
type::TypeId::SMALLINT, internal_lang, "Abs",
13361336
function::BuiltInFuncType{
1337-
OperatorId::Abs, function::DecimalFunctions::_Abs},
1337+
OperatorId::Abs, function::NumericFunctions::_Abs},
13381338
txn);
13391339

13401340
AddBuiltinFunction("abs", {type::TypeId::INTEGER}, type::TypeId::INTEGER,
13411341
internal_lang, "Abs",
13421342
function::BuiltInFuncType{
1343-
OperatorId::Abs, function::DecimalFunctions::_Abs},
1343+
OperatorId::Abs, function::NumericFunctions::_Abs},
13441344
txn);
13451345

13461346
AddBuiltinFunction("abs", {type::TypeId::BIGINT}, type::TypeId::BIGINT,
13471347
internal_lang, "Abs",
13481348
function::BuiltInFuncType{
1349-
OperatorId::Abs, function::DecimalFunctions::_Abs},
1349+
OperatorId::Abs, function::NumericFunctions::_Abs},
13501350
txn);
13511351

13521352
AddBuiltinFunction(
13531353
"floor", {type::TypeId::INTEGER}, type::TypeId::DECIMAL,
13541354
internal_lang, "Floor",
13551355
function::BuiltInFuncType{OperatorId::Floor,
1356-
function::DecimalFunctions::_Floor},
1356+
function::NumericFunctions::_Floor},
13571357
txn);
13581358
AddBuiltinFunction(
13591359
"floor", {type::TypeId::BIGINT}, type::TypeId::DECIMAL, internal_lang,
13601360
"Floor",
13611361
function::BuiltInFuncType{OperatorId::Floor,
1362-
function::DecimalFunctions::_Floor},
1362+
function::NumericFunctions::_Floor},
13631363
txn);
13641364
AddBuiltinFunction(
13651365
"floor", {type::TypeId::TINYINT}, type::TypeId::DECIMAL,
13661366
internal_lang, "Floor",
13671367
function::BuiltInFuncType{OperatorId::Floor,
1368-
function::DecimalFunctions::_Floor},
1368+
function::NumericFunctions::_Floor},
13691369
txn);
13701370
AddBuiltinFunction(
13711371
"floor", {type::TypeId::SMALLINT}, type::TypeId::DECIMAL,
13721372
internal_lang, "Floor",
13731373
function::BuiltInFuncType{OperatorId::Floor,
1374-
function::DecimalFunctions::_Floor},
1374+
function::NumericFunctions::_Floor},
13751375
txn);
13761376
AddBuiltinFunction(
13771377
"round", {type::TypeId::DECIMAL}, type::TypeId::DECIMAL,
13781378
internal_lang, "Round",
13791379
function::BuiltInFuncType{OperatorId::Round,
1380-
function::DecimalFunctions::_Round},
1380+
function::NumericFunctions::_Round},
13811381
txn);
13821382

13831383
AddBuiltinFunction(
13841384
"ceil", {type::TypeId::DECIMAL}, type::TypeId::DECIMAL, internal_lang,
13851385
"Ceil",
13861386
function::BuiltInFuncType{OperatorId::Ceil,
1387-
function::DecimalFunctions::_Ceil},
1387+
function::NumericFunctions::_Ceil},
13881388
txn);
13891389

13901390
AddBuiltinFunction(
13911391
"ceil", {type::TypeId::TINYINT}, type::TypeId::DECIMAL, internal_lang,
13921392
"Ceil",
13931393
function::BuiltInFuncType{OperatorId::Ceil,
1394-
function::DecimalFunctions::_Ceil},
1394+
function::NumericFunctions::_Ceil},
13951395
txn);
13961396

13971397
AddBuiltinFunction(
13981398
"ceil", {type::TypeId::SMALLINT}, type::TypeId::DECIMAL,
13991399
internal_lang, "Ceil",
14001400
function::BuiltInFuncType{OperatorId::Ceil,
1401-
function::DecimalFunctions::_Ceil},
1401+
function::NumericFunctions::_Ceil},
14021402
txn);
14031403

14041404
AddBuiltinFunction(
14051405
"ceil", {type::TypeId::INTEGER}, type::TypeId::DECIMAL, internal_lang,
14061406
"Ceil",
14071407
function::BuiltInFuncType{OperatorId::Ceil,
1408-
function::DecimalFunctions::_Ceil},
1408+
function::NumericFunctions::_Ceil},
14091409
txn);
14101410

14111411
AddBuiltinFunction(
14121412
"ceil", {type::TypeId::BIGINT}, type::TypeId::DECIMAL, internal_lang,
14131413
"Ceil",
14141414
function::BuiltInFuncType{OperatorId::Ceil,
1415-
function::DecimalFunctions::_Ceil},
1415+
function::NumericFunctions::_Ceil},
14161416
txn);
14171417

14181418
AddBuiltinFunction(
14191419
"ceiling", {type::TypeId::DECIMAL}, type::TypeId::DECIMAL,
14201420
internal_lang, "Ceil",
14211421
function::BuiltInFuncType{OperatorId::Ceil,
1422-
function::DecimalFunctions::_Ceil},
1422+
function::NumericFunctions::_Ceil},
14231423
txn);
14241424

14251425
AddBuiltinFunction(
14261426
"ceiling", {type::TypeId::TINYINT}, type::TypeId::DECIMAL,
14271427
internal_lang, "Ceil",
14281428
function::BuiltInFuncType{OperatorId::Ceil,
1429-
function::DecimalFunctions::_Ceil},
1429+
function::NumericFunctions::_Ceil},
14301430
txn);
14311431

14321432
AddBuiltinFunction(
14331433
"ceiling", {type::TypeId::SMALLINT}, type::TypeId::DECIMAL,
14341434
internal_lang, "Ceil",
14351435
function::BuiltInFuncType{OperatorId::Ceil,
1436-
function::DecimalFunctions::_Ceil},
1436+
function::NumericFunctions::_Ceil},
14371437
txn);
14381438

14391439
AddBuiltinFunction(
14401440
"ceiling", {type::TypeId::INTEGER}, type::TypeId::DECIMAL,
14411441
internal_lang, "Ceil",
14421442
function::BuiltInFuncType{OperatorId::Ceil,
1443-
function::DecimalFunctions::_Ceil},
1443+
function::NumericFunctions::_Ceil},
14441444
txn);
14451445

14461446
AddBuiltinFunction(
14471447
"ceiling", {type::TypeId::BIGINT}, type::TypeId::DECIMAL,
14481448
internal_lang, "Ceil",
14491449
function::BuiltInFuncType{OperatorId::Ceil,
1450-
function::DecimalFunctions::_Ceil},
1450+
function::NumericFunctions::_Ceil},
14511451
txn);
14521452

14531453
/**

src/codegen/buffering_consumer.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,15 @@ WrappedTuple &WrappedTuple::operator=(const WrappedTuple &o) {
4040
return *this;
4141
}
4242

43+
// Renders this tuple as a single comma-separated line: the ToString() output
// of every value in tuple_, joined by "," with no leading or trailing
// separator and no line terminator. An empty tuple yields an empty string.
// NOTE(review): values are appended verbatim -- no quoting or escaping is
// applied here, so a value whose text contains a comma is not distinguishable
// in the output.
std::string WrappedTuple::ToCSV() const {
44+
std::string ret;
45+
for (uint32_t i = 0; i < tuple_.size(); i++) {
46+
// Emit the separator before every element except the first
if (i != 0) ret.append(",");
47+
ret.append(tuple_[i].ToString());
48+
}
49+
return ret;
50+
}
51+
4352
//===----------------------------------------------------------------------===//
4453
// BufferTuple() Proxy
4554
//===----------------------------------------------------------------------===//

src/codegen/codegen.cpp

Lines changed: 73 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,30 @@ llvm::Constant *CodeGen::ConstDouble(double val) const {
5959
return llvm::ConstantFP::get(DoubleType(), val);
6060
}
6161

62-
llvm::Constant *CodeGen::ConstString(const std::string &s) const {
62+
llvm::Value *CodeGen::ConstString(const std::string &str_val,
63+
const std::string &name) const {
6364
// Strings are treated as arrays of bytes
64-
auto *str = llvm::ConstantDataArray::getString(GetContext(), s);
65-
return new llvm::GlobalVariable(GetModule(), str->getType(), true,
66-
llvm::GlobalValue::InternalLinkage, str,
67-
"str");
65+
auto *str = llvm::ConstantDataArray::getString(GetContext(), str_val);
66+
auto *global_var =
67+
new llvm::GlobalVariable(GetModule(), str->getType(), true,
68+
llvm::GlobalValue::InternalLinkage, str, name);
69+
return GetBuilder().CreateInBoundsGEP(global_var, {Const32(0), Const32(0)});
70+
}
71+
72+
// Embeds an arbitrary byte sequence into the module as a constant global.
//
// data   - start of the bytes to embed; read as `length` uint8_t elements
// length - number of bytes to read from `data`
// name   - name given to the created global variable
//
// Returns the value produced by an in-bounds GEP with indices {0, 0} on the
// new global, i.e. a pointer to its first byte.
llvm::Value *CodeGen::ConstGenericBytes(const void *data, uint32_t length,
73+
const std::string &name) const {
74+
// Create the constant data array that wraps the input data
75+
llvm::ArrayRef<uint8_t> elements{reinterpret_cast<const uint8_t *>(data),
76+
length};
77+
auto *arr = llvm::ConstantDataArray::get(GetContext(), elements);
78+
79+
// Create a global variable for the data
80+
auto *global_var =
81+
new llvm::GlobalVariable(GetModule(), arr->getType(), true,
82+
llvm::GlobalValue::InternalLinkage, arr, name);
83+
84+
// Return a pointer to the first element
85+
return GetBuilder().CreateInBoundsGEP(global_var, {Const32(0), Const32(0)});
6886
}
6987

7088
llvm::Constant *CodeGen::Null(llvm::Type *type) const {
@@ -75,11 +93,6 @@ llvm::Constant *CodeGen::NullPtr(llvm::PointerType *type) const {
7593
return llvm::ConstantPointerNull::get(type);
7694
}
7795

78-
llvm::Value *CodeGen::ConstStringPtr(const std::string &s) const {
79-
auto &ir_builder = GetBuilder();
80-
return ir_builder.CreateConstInBoundsGEP2_32(nullptr, ConstString(s), 0, 0);
81-
}
82-
8396
llvm::Value *CodeGen::AllocateVariable(llvm::Type *type,
8497
const std::string &name) {
8598
// To allocate a variable, a function must be under construction
@@ -135,26 +148,68 @@ llvm::Value *CodeGen::CallFunc(llvm::Value *fn,
135148
return GetBuilder().CreateCall(fn, args);
136149
}
137150

138-
llvm::Value *CodeGen::CallPrintf(const std::string &format,
139-
const std::vector<llvm::Value *> &args) {
151+
llvm::Value *CodeGen::Printf(const std::string &format,
152+
const std::vector<llvm::Value *> &args) {
140153
auto *printf_fn = LookupBuiltin("printf");
141154
if (printf_fn == nullptr) {
155+
#if GCC_AT_LEAST_6
156+
// In newer GCC versions (i.e., GCC 6+), function attributes are part of the
157+
// type system and are attached to the function signature. For example, printf()
158+
// comes with the "noexcept" attribute. Moreover, GCC 6+ will complain when
159+
// attributes attached to a function (e.g., noexcept()) are not used at
160+
// their call-site. Below, we use decltype(printf) to get the C/C++ function
161+
// type of printf(...), but we discard the attributes since we don't need
162+
// them. Hence, on GCC 6+, compilation will fail without adding the
163+
// "-Wignored-attributes" flag. So, we add it here only.
164+
#pragma GCC diagnostic push
165+
#pragma GCC diagnostic ignored "-Wignored-attributes"
166+
#endif
142167
printf_fn = RegisterBuiltin(
143-
"printf", llvm::TypeBuilder<int(char *, ...), false>::get(GetContext()),
168+
"printf", llvm::TypeBuilder<decltype(printf), false>::get(GetContext()),
144169
reinterpret_cast<void *>(printf));
170+
#if GCC_AT_LEAST_6
171+
#pragma GCC diagnostic pop
172+
#endif
145173
}
146-
auto &ir_builder = code_context_.GetBuilder();
147-
auto *format_str =
148-
ir_builder.CreateGEP(ConstString(format), {Const32(0), Const32(0)});
149174

150175
// Collect all the arguments into a vector
151-
std::vector<llvm::Value *> printf_args{format_str};
176+
std::vector<llvm::Value *> printf_args = {ConstString(format, "format")};
152177
printf_args.insert(printf_args.end(), args.begin(), args.end());
153178

154-
// Call the function
179+
// Call printf()
155180
return CallFunc(printf_fn, printf_args);
156181
}
157182

183+
// Generates a call to memcmp(ptr1, ptr2, len) and returns the value of that
// call. The "memcmp" builtin is looked up first and only registered (bound to
// the host's memcmp) when the lookup finds nothing.
llvm::Value *CodeGen::Memcmp(llvm::Value *ptr1, llvm::Value *ptr2,
184+
llvm::Value *len) {
185+
static constexpr char kMemcmpFnName[] = "memcmp";
186+
auto *memcmp_fn = LookupBuiltin(kMemcmpFnName);
187+
if (memcmp_fn == nullptr) {
188+
#if GCC_AT_LEAST_6
189+
// In newer GCC versions (i.e., GCC 6+), function attributes are part of the
190+
// type system and are attached to the function signature. For example, memcmp()
191+
// comes with the "throw()" attribute, among many others. Moreover, GCC 6+ will
192+
// complain when attributes attached to a function are not used at their
193+
// call-site. Below, we use decltype(memcmp) to get the C/C++ function type
194+
// of memcmp(...), but we discard the attributes since we don't need them.
195+
// Hence, on GCC 6+, the compiler reports this via "-Wignored-attributes".
196+
// We suppress that diagnostic around this one statement only.
197+
#pragma GCC diagnostic push
198+
#pragma GCC diagnostic ignored "-Wignored-attributes"
199+
#endif
200+
memcmp_fn = RegisterBuiltin(
201+
kMemcmpFnName,
202+
llvm::TypeBuilder<decltype(memcmp), false>::get(GetContext()),
203+
reinterpret_cast<void *>(memcmp));
204+
#if GCC_AT_LEAST_6
205+
#pragma GCC diagnostic pop
206+
#endif
207+
}
208+
209+
// Call memcmp()
210+
return CallFunc(memcmp_fn, {ptr1, ptr2, len});
211+
}
212+
158213
llvm::Value *CodeGen::Sqrt(llvm::Value *val) {
159214
llvm::Function *sqrt_func = llvm::Intrinsic::getDeclaration(
160215
&GetModule(), llvm::Intrinsic::sqrt, val->getType());

0 commit comments

Comments
 (0)