diff --git a/velox/type/CMakeLists.txt b/velox/type/CMakeLists.txt index 8e251dac4b09..d615e5823ba4 100644 --- a/velox/type/CMakeLists.txt +++ b/velox/type/CMakeLists.txt @@ -18,6 +18,9 @@ endif() add_subdirectory(parser) add_subdirectory(tz) add_subdirectory(fbhive) +if(VELOX_ENABLE_CLP_CONNECTOR) + add_subdirectory(fbclp) +endif() velox_add_library( velox_type diff --git a/velox/type/fbclp/CMakeLists.txt b/velox/type/fbclp/CMakeLists.txt new file mode 100644 index 000000000000..7e9301a4218e --- /dev/null +++ b/velox/type/fbclp/CMakeLists.txt @@ -0,0 +1,45 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +if(${VELOX_ENABLE_CLP_CONNECTOR}) + if(${VELOX_BUILD_TESTING}) + add_subdirectory(tests) + endif() + + bison_target( + ClpTypeParser ClpTypeParser.yy + ${CMAKE_CURRENT_BINARY_DIR}/ClpTypeParser.yy.cc + DEFINES_FILE ${CMAKE_CURRENT_BINARY_DIR}/ClpTypeParser.yy.h + COMPILE_FLAGS "-Werror -Wno-deprecated") + + flex_target( + ClpTypeParserScanner ClpTypeParser.ll + ${CMAKE_CURRENT_BINARY_DIR}/ClpScanner.cpp + COMPILE_FLAGS "-Cf --prefix=veloxtpclp") + + add_flex_bison_dependency(ClpTypeParserScanner ClpTypeParser) + + if(VELOX_MONO_LIBRARY) + add_custom_target( + velox_type_fbclp_parser_gen_src + DEPENDS ${BISON_ClpTypeParser_OUTPUTS} + ${FLEX_ClpTypeParserScanner_OUTPUTS}) + add_dependencies(velox velox_type_fbclp_parser_gen_src) + endif() + velox_add_library(velox_type_fbclp_parser ${BISON_ClpTypeParser_OUTPUTS} + ${FLEX_ClpTypeParserScanner_OUTPUTS} ClpParserUtil.cpp) + velox_include_directories(velox_type_fbclp_parser + PRIVATE ${PROJECT_BINARY_DIR} ${FLEX_INCLUDE_DIRS}) + velox_link_libraries(velox_type_fbclp_parser velox_common_base) +endif() diff --git a/velox/type/fbclp/ClpParserUtil.cpp b/velox/type/fbclp/ClpParserUtil.cpp new file mode 100644 index 000000000000..46f02c3cadea --- /dev/null +++ b/velox/type/fbclp/ClpParserUtil.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include "velox/type/Type.h" + +namespace facebook::velox::type::fbclp { + +TypePtr typeFromString( + const std::string& type, + bool failIfNotRegistered = true) { + auto upper = type; + std::transform(upper.begin(), upper.end(), upper.begin(), ::toupper); + if (upper == "INT") { + upper = "INTEGER"; + } else if (upper == "DOUBLE PRECISION") { + upper = "DOUBLE"; + } + auto inferredType = getType(upper, {}); + if (failIfNotRegistered == true && inferredType == nullptr) { + VELOX_UNSUPPORTED("Failed to parse type [{}]. Type not registered.", type); + } + return inferredType; +} + +TypePtr customTypeWithChildren( + const std::string& name, + const std::vector& children) { + std::vector params; + params.reserve(children.size()); + for (auto& child : children) { + params.emplace_back(child); + } + auto type = getType(name, params); + VELOX_CHECK_NOT_NULL( + type, "Failed to parse custom type with children [{}]", name); + return type; +} + +std::pair> inferTypeWithSpaces( + std::vector& words, + bool cannotHaveFieldName = false) { + VELOX_CHECK_GE(words.size(), 2); + const auto& fieldName = words[0]; + const auto allWords = folly::join(" ", words); + // Fail if cannotHaveFieldName = true. + auto type = typeFromString(allWords, cannotHaveFieldName); + if (type) { + return std::make_pair("", type); + } + return std::make_pair( + fieldName, typeFromString(allWords.substr(fieldName.size() + 1))); +} + +} // namespace facebook::velox::type::fbclp diff --git a/velox/type/fbclp/ClpParserUtil.h b/velox/type/fbclp/ClpParserUtil.h new file mode 100644 index 000000000000..527d72e95385 --- /dev/null +++ b/velox/type/fbclp/ClpParserUtil.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include "velox/type/Type.h" + +namespace facebook::velox::type::fbclp { + +/// Normalize Presto types such as INT and DOUBLE PRECISION and convert to Velox +/// type. +TypePtr typeFromString( + const std::string& type, + bool failIfNotRegistered = true); + +TypePtr customTypeWithChildren( + const std::string& name, + const std::vector& children); + +/// Convert words with spaces to a Velox type. +/// First check if all the words are a Velox type. +/// Then check if the first word is a field name and the remaining words are a +/// Velox type. If cannotHaveFieldName = true, then all words must be a Velox +/// type. +std::pair inferTypeWithSpaces( + std::vector& words, + bool cannotHaveFieldName = false); + +} // namespace facebook::velox::type::fbclp diff --git a/velox/type/fbclp/ClpScanner.h b/velox/type/fbclp/ClpScanner.h new file mode 100644 index 000000000000..945c9f5f0286 --- /dev/null +++ b/velox/type/fbclp/ClpScanner.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include "velox/common/base/Exceptions.h" +#include "velox/type/Type.h" + +namespace facebook::velox::type::fbclp { + +class ClpScanner : public yyFlexLexer { + public: + ClpScanner( + std::istream& arg_yyin, + std::ostream& arg_yyout, + TypePtr& outputType, + const std::string_view input) + : yyFlexLexer(&arg_yyin, &arg_yyout), + outputType_(outputType), + input_(input){}; + int lex(ClpParser::semantic_type* yylval); + + void setType(TypePtr type) { + outputType_ = std::move(type); + } + + // Store input to print it as part of the error message. + std::string_view input() { + return input_; + } + + private: + TypePtr& outputType_; + const std::string_view input_; +}; + +} // namespace facebook::velox::type::fbclp diff --git a/velox/type/fbclp/ClpTypeParser.h b/velox/type/fbclp/ClpTypeParser.h new file mode 100644 index 000000000000..9b09960247d8 --- /dev/null +++ b/velox/type/fbclp/ClpTypeParser.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include "velox/type/Type.h" + +namespace facebook::velox::type::fbclp { + +/// Parses a type string in Presto format to Velox type. 
+/// Example type strings: +/// row(col0 bigint, varchar) +/// array(bigint) +/// map(bigint, array(bigint)) +/// function(bigint,bigint,bigint) +/// The parsing is case-insensitive. i.e. 'Row' and 'row' are equal. +/// Field names for rows are optional. +/// Quoted field names are supported. +/// All custom types need to be registered. An error is thrown otherwise. +/// Uses the Type::getType API to convert a string to Velox type. +/// This CLP variant additionally supports unquoted field names containing +/// '-', '@', '#', '$', and '\\'. +TypePtr parseClpType(const std::string& typeText); + +} // namespace facebook::velox::type::fbclp diff --git a/velox/type/fbclp/ClpTypeParser.ll b/velox/type/fbclp/ClpTypeParser.ll new file mode 100644 index 000000000000..f72943ae66c2 --- /dev/null +++ b/velox/type/fbclp/ClpTypeParser.ll @@ -0,0 +1,77 @@ +%{ +#include +#include + +#include "velox/type/fbclp/ClpTypeParser.yy.h" // @manual +#include "velox/type/fbclp/ClpScanner.h" +#define YY_DECL int facebook::velox::type::fbclp::ClpScanner::lex(facebook::velox::type::fbclp::ClpParser::semantic_type *yylval) +%} + +%option c++ noyywrap noyylineno nodefault caseless + +A [A|a] +B [B|b] +C [C|c] +D [D|d] +E [E|e] +F [F|f] +G [G|g] +H [H|h] +I [I|i] +J [J|j] +K [K|k] +L [L|l] +M [M|m] +O [O|o] +P [P|p] +R [R|r] +S [S|s] +T [T|t] +U [U|u] +W [W|w] +X [X|x] +Y [Y|y] +Z [Z|z] + +WORD ([[:alpha:][:alnum:]_\-@#\$\\]*) +/* Quoted names accept the same special characters as WORD, plus whitespace. */ +QUOTED_ID (['"'][[:alnum:][:space:]_\-@#\$\\]*['"']) +NUMBER ([[:digit:]]+) +VARIABLE (VARCHAR|VARBINARY) + +%% + +"(" return ClpParser::token::LPAREN; +")" return ClpParser::token::RPAREN; +"," return ClpParser::token::COMMA; +(ARRAY) return ClpParser::token::ARRAY; +(MAP) return ClpParser::token::MAP; +(FUNCTION) return ClpParser::token::FUNCTION; +(DECIMAL) return ClpParser::token::DECIMAL; +(ROW) return ClpParser::token::ROW; +{VARIABLE} yylval->build(YYText()); return ClpParser::token::VARIABLE; +{NUMBER} yylval->build(folly::to(YYText())); return ClpParser::token::NUMBER; 
+{WORD} yylval->build(YYText()); return ClpParser::token::WORD; +{QUOTED_ID} yylval->build(YYText()); return ClpParser::token::QUOTED_ID; +<> return ClpParser::token::YYEOF; +. /* no action on unmatched input */ + +%% + +int yyFlexLexer::yylex() { + throw std::runtime_error("Bad call to yyFlexLexer::yylex()"); +} + +#include "velox/type/fbclp/ClpTypeParser.h" + +facebook::velox::TypePtr facebook::velox::type::fbclp::parseClpType(const std::string& typeText) +{ + std::istringstream is(typeText); + std::ostringstream os; + facebook::velox::TypePtr type; + facebook::velox::type::fbclp::ClpScanner scanner{is, os, type, typeText}; + facebook::velox::type::fbclp::ClpParser parser{ &scanner }; + parser.parse(); + VELOX_CHECK(type, "Failed to parse type [{}]", typeText); + return type; +} diff --git a/velox/type/fbclp/ClpTypeParser.yy b/velox/type/fbclp/ClpTypeParser.yy new file mode 100644 index 000000000000..f606791799dc --- /dev/null +++ b/velox/type/fbclp/ClpTypeParser.yy @@ -0,0 +1,156 @@ +%{ +#include +#include "velox/common/base/Exceptions.h" +#include "velox/type/Type.h" +#include "velox/type/fbclp/ClpParserUtil.h" +%} +%require "3.0.4" +%language "C++" + +%define parser_class_name {ClpParser} +%define api.namespace {facebook::velox::type::fbclp} +%define api.value.type variant +%parse-param {ClpScanner* scanner} +%define parse.error verbose + +%code requires +{ + namespace facebook::velox::type::fbclp { + class ClpScanner; + } // namespace facebook::velox::type::fbclp + namespace facebook::velox { + class Type; + } // namespace facebook::velox + struct RowArguments { + std::vector names; + std::vector> types; + }; +} // %code requires + +%code +{ + #include + #define yylex(x) scanner->lex(x) +} + +%token LPAREN RPAREN COMMA ARRAY MAP ROW FUNCTION DECIMAL +%token WORD VARIABLE QUOTED_ID +%token NUMBER +%token YYEOF 0 + +%nterm > type type_single_word +%nterm > special_type function_type decimal_type row_type array_type map_type variable_type 
custom_type_with_children +%nterm type_list_opt_names +%nterm >> type_list +%nterm >> named_type +%nterm > type_with_spaces +%nterm field_name + +%start type_spec + +%% + +/* The grammar entry point. */ +type_spec : type { scanner->setType($1); } + | error { yyerrok; } + ; + +type : type_single_word { $$ = $1; } + | type_with_spaces { $$ = inferTypeWithSpaces($1, true).second; } + ; + +type_single_word : WORD { $$ = typeFromString($1); } // Handles most primitive types (e.g. bigint, etc). + | special_type { $$ = $1; } + +special_type : array_type { $$ = $1; } + | map_type { $$ = $1; } + | row_type { $$ = $1; } + | function_type { $$ = $1; } + | variable_type { $$ = $1; } + | decimal_type { $$ = $1; } + | custom_type_with_children { $$ = $1; } + +/* + * Types with spaces have at least two words. They are joined in an + * std::vector here, and resolved by `inferTypeWithSpaces()`. The first + * word is special to allow for tokens such as "map", "array", etc, to + * be used as field names. + */ +type_with_spaces : type_with_spaces WORD { $1.push_back($2); $$ = std::move($1); } + | field_name WORD { $$.push_back($1); $$.push_back($2); } + ; + +/* List of allowed field names. */ +field_name : WORD { $$ = $1; } + | ARRAY { $$ = "array"; } + | MAP { $$ = "map"; } + | FUNCTION { $$ = "function"; } + | DECIMAL { $$ = "decimal"; } + | ROW { $$ = "row"; } + | VARIABLE { $$ = $1; } + ; + +/* + * Varchar and varbinary have an optional `(int)` + * e.g. both `varchar` and `varchar(4)` are valid. 
+ */ +variable_type : VARIABLE LPAREN NUMBER RPAREN { $$ = typeFromString($1); } + | VARIABLE { $$ = typeFromString($1); } + ; + +decimal_type : DECIMAL LPAREN NUMBER COMMA NUMBER RPAREN { $$ = DECIMAL($3, $5); } + ; + +array_type : ARRAY LPAREN type RPAREN { $$ = ARRAY($3); } + ; + +map_type : MAP LPAREN type COMMA type RPAREN { $$ = MAP($3, $5); } + ; + +function_type : FUNCTION LPAREN type_list RPAREN { auto returnType = $3.back(); $3.pop_back(); + $$ = FUNCTION(std::move($3), returnType); } + +row_type : ROW LPAREN type_list_opt_names RPAREN { $$ = ROW(std::move($3.names), std::move($3.types)); } + ; + +custom_type_with_children : WORD LPAREN type_list RPAREN { $$ = customTypeWithChildren($1, $3); } + +/* Consecutive list of types, separated by a comma. */ +type_list : type { $$.push_back($1); } + | type_list COMMA type { $1.push_back($3); $$ = std::move($1); } + ; + +/* + * Consecutive list of types which can optionally have a "name". + * Only allowed inside row definitions. + */ +type_list_opt_names : type_list_opt_names COMMA named_type { $1.names.push_back($3.first); + $1.types.push_back($3.second); + $$.names = std::move($1.names); + $$.types = std::move($1.types); } + | named_type { $$.names.push_back($1.first); $$.types.push_back($1.second); } + ; + +/* + * Named type is a type definition with an optional name. The name can be + * quoted. Since types with spaces are allowed, there is potential ambiguity + * in definitions with multiple words, for example: + * + * > my type + * + * Is "my" the name and "type" the type, or "my type" is the type name? We first + * check if there is a type matching all words ("my type"), and if not, check if + * there is a type matching all but the first word ("type") and assume the first + * ("my") to be the field name. See `inferTypeWithSpaces()`. 
+ */ +named_type : type_single_word { $$ = std::make_pair("", $1); } + | field_name special_type { $$ = std::make_pair($1, $2); } + | type_with_spaces { $$ = inferTypeWithSpaces($1, false); } + | QUOTED_ID type { $1.erase(0, 1); $1.pop_back(); $$ = std::make_pair($1, $2); } // Remove the quotes. + ; + +%% + +void facebook::velox::type::fbclp::ClpParser::error(const std::string& msg) { + VELOX_UNSUPPORTED("Failed to parse type [{}]. {}", scanner->input(), msg); +} diff --git a/velox/type/fbclp/tests/CMakeLists.txt b/velox/type/fbclp/tests/CMakeLists.txt new file mode 100644 index 000000000000..73ccfb4b4542 --- /dev/null +++ b/velox/type/fbclp/tests/CMakeLists.txt @@ -0,0 +1,25 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_executable(velox_type_fbclp_parser_test ClpTypeParserTest.cpp) + +add_test(NAME velox_type_fbclp_parser_test COMMAND velox_type_fbclp_parser_test) + +target_link_libraries( + velox_type_fbclp_parser_test + velox_type_fbclp_parser + velox_type + GTest::gtest + GTest::gtest_main + GTest::gmock) diff --git a/velox/type/fbclp/tests/ClpTypeParserTest.cpp b/velox/type/fbclp/tests/ClpTypeParserTest.cpp new file mode 100644 index 000000000000..b617cbde7d65 --- /dev/null +++ b/velox/type/fbclp/tests/ClpTypeParserTest.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "velox/common/base/tests/GTestUtils.h" +#include "velox/type/fbclp/ClpTypeParser.h" + +namespace facebook::velox::type::fbclp { + +class TypeParserTest : public ::testing::Test {}; + +TEST_F(TypeParserTest, rowTypeWithSpecialChars) { + ASSERT_EQ( + *parseClpType( + "row($dollar$sign$ bigint,-da-sh- varchar,#ha#sh# varchar, @a@t@ varchar, \\sla\\sh\\ varchar)"), + *ROW( + {"$dollar$sign$", "-da-sh-", "#ha#sh#", "@a@t@", "\\sla\\sh\\"}, + {BIGINT(), VARCHAR(), VARCHAR(), VARCHAR(), VARCHAR()})); + ASSERT_EQ( + *parseClpType( + "row(\"$dollar$sign$\" bigint,\"-da-sh-\" varchar,\"#ha#sh#\" varchar,\"@a@t@\" varchar,\"\\sla\\sh\\\" varchar)"), + *ROW( + {"$dollar$sign$", "-da-sh-", "#ha#sh#", "@a@t@", "\\sla\\sh\\"}, + {BIGINT(), VARCHAR(), VARCHAR(), VARCHAR(), VARCHAR()})); +} + +} // namespace facebook::velox::type::fbclp