Skip to content

Commit 7e299a6

Browse files
Yifeng-Wangfacebook-github-bot
authored andcommitted
feat: Add Spark varchar_type_write_side_check function (facebookincubator#13034)
Summary: This PR adds the `varchar_type_write_side_check` function. This function trims trailing space characters (ASCII 32) from input string and checks if it fits within the specified length limit, throwing exceptions if the string exceeds the limit after trimming or if the specified limit is not greater than 0. Examples: ``` varchar_type_write_side_check("abc ", 3) -- "abc" varchar_type_write_side_check("abcd", 3) -- VeloxUserError: "Exceeds allowed length limitation: '3'" varchar_type_write_side_check(" ", 0) -- VeloxUserError: "The length limit must be greater than 0." ``` **Note:** This PR is a splitted work of PR facebookincubator#12773, as outlined in PR facebookincubator#12772. Pull Request resolved: facebookincubator#13034 Reviewed By: Yuhta Differential Revision: D75963922 Pulled By: kKPulla fbshipit-source-id: 34bba67e95645ab45f2493fd30f6635a6b3cf9a0
1 parent f0e336a commit 7e299a6

File tree

9 files changed

+254
-0
lines changed

9 files changed

+254
-0
lines changed

velox/docs/functions/spark/string.rst

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -438,3 +438,20 @@ String Functions
438438
Returns string with all characters changed to uppercase. ::
439439

440440
SELECT upper('SparkSql'); -- SPARKSQL
441+
442+
.. spark:function:: varchar_type_write_side_check(string, limit) -> varchar
443+
444+
Removes trailing space characters (ASCII 32) that exceed the length ``limit`` from the end of input ``string``. ``limit`` is the maximum length of characters that can be allowed.
445+
Throws exception when ``string`` still exceeds ``limit`` after trimming trailing spaces or when ``limit`` is not greater than 0.
446+
Empty strings are returned as-is since they always satisfy any length ``limit`` greater than 0.
447+
Note: This function is not directly callable in Spark SQL, but internally used for length check when writing string type columns. ::
448+
449+
-- Function call examples (this function is not directly callable in Spark SQL).
450+
varchar_type_write_side_check("abc", 3) -- "abc"
451+
varchar_type_write_side_check("abc ", 3) -- "abc"
452+
varchar_type_write_side_check("abcd", 3) -- VeloxUserError: "Exceeds allowed length limitation: 3"
453+
varchar_type_write_side_check("中国", 3) -- "中国"
454+
varchar_type_write_side_check("中文中国", 3) -- VeloxUserError: "Exceeds allowed length limitation: 3"
455+
varchar_type_write_side_check(" ", 0) -- VeloxUserError: "The length limit must be greater than 0."
456+
varchar_type_write_side_check("", 3) -- ""
457+

velox/functions/sparksql/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ velox_add_library(
1717
velox_functions_spark_impl
1818
ArrayGetFunction.cpp
1919
ArraySort.cpp
20+
CharVarcharUtils.cpp
2021
Comparisons.cpp
2122
ConcatWs.cpp
2223
DecimalArithmetic.cpp
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#include "velox/functions/sparksql/CharVarcharUtils.h"
17+
18+
namespace facebook::velox::functions::sparksql {
19+
20+
/// Trims trailing ASCII spaces (0x20) from `inputStr` so that it fits within
/// `limit` characters and points `output` at the retained prefix of
/// `inputStr` (no bytes are copied).
///
/// Only as many spaces as are needed to reach `limit` are removed, i.e. at
/// most `numChars - limit`.
///
/// @param output Receives a view over the retained prefix of `inputStr`.
/// @param inputStr The string to check.
/// @param numChars Length of `inputStr` in characters, as computed by the
/// caller. Must be greater than `limit`.
/// @param limit Maximum allowed length in characters.
/// @throws VeloxUserError if `numChars` is not greater than `limit`, or if
/// the string is still longer than `limit` after trimming.
void trimTrailingSpaces(
    exec::StringWriter& output,
    StringView inputStr,
    int32_t numChars,
    uint32_t limit) {
  // Subtract in a wide signed type. `numChars - limit` on int32_t/uint32_t is
  // evaluated as uint32_t and wraps to a huge positive value whenever
  // numChars < limit, which would make the check below pass vacuously.
  const int64_t numTailSpacesToTrim =
      static_cast<int64_t>(numChars) - static_cast<int64_t>(limit);
  VELOX_USER_CHECK_GT(numTailSpacesToTrim, 0);

  // Walk back from the end, one byte per ASCII space, removing no more than
  // the excess and never stepping before the start of the input.
  const auto inputBegin = inputStr.begin();
  auto trimmedEnd = inputStr.end();
  for (int64_t remaining = numTailSpacesToTrim;
       remaining > 0 && trimmedEnd != inputBegin &&
       stringImpl::isAsciiSpace(*(trimmedEnd - 1));
       --remaining) {
    --trimmedEnd;
  }

  // Each removed space is exactly one byte and one character, so the number
  // of removed bytes can be subtracted from the character count directly.
  const auto trimmedSize = numChars - std::distance(trimmedEnd, inputStr.end());

  VELOX_USER_CHECK_LE(
      trimmedSize, limit, "Exceeds allowed length limitation: {}", limit);
  output.setNoCopy(
      StringView(inputStr.data(), std::distance(inputBegin, trimmedEnd)));
}
42+
43+
} // namespace facebook::velox::functions::sparksql
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#pragma once
17+
18+
#include <string>
19+
20+
#include "velox/expression/StringWriter.h"
21+
#include "velox/functions/lib/string/StringImpl.h"
22+
23+
namespace facebook::velox::functions::sparksql {
24+
25+
/// Trims trailing ASCII space characters (0x20) from `inputStr`
26+
/// to ensure its length does not exceed the specified Unicode string length
27+
/// `limit` (must be greater than 0) in characters. Throws an exception if the
28+
/// string still exceeds `limit` after trimming.
/// `numChars` is the length of `inputStr` in characters; callers are expected
/// to pass a value greater than `limit`. `output` is set to a view over the
/// retained prefix of `inputStr`.
29+
void trimTrailingSpaces(
30+
exec::StringWriter& output,
31+
StringView inputStr,
32+
int32_t numChars,
33+
uint32_t limit);
34+
35+
} // namespace facebook::velox::functions::sparksql
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#pragma once
17+
18+
#include <string>
19+
20+
#include "velox/functions/Macros.h"
21+
#include "velox/functions/lib/string/StringImpl.h"
22+
#include "velox/functions/sparksql/CharVarcharUtils.h"
23+
24+
namespace facebook::velox::functions::sparksql {
25+
26+
/// Ensures that `input` fits within the specified length `limit` in
27+
/// characters. If the length of `input` exceeds `limit`, trailing spaces
28+
/// are trimmed to fit within `limit`. If the length of `input` is less than
29+
/// or equal to `limit`, it is returned as-is. Throws an exception if the
30+
/// trimmed string still exceeds `limit` or if `limit` is not greater than 0.
31+
/// This function will trim at most (length of `input` - `limit`) space
32+
/// characters (ASCII 32) from the end of `input`.
33+
template <typename T>
34+
struct VarcharTypeWriteSideCheckFunction {
35+
VELOX_DEFINE_FUNCTION_TYPES(T);
36+
37+
// Results refer to strings in the first argument.
38+
static constexpr int32_t reuse_strings_from_arg = 0;
39+
40+
// ASCII input always produces ASCII result.
41+
static constexpr bool is_default_ascii_behavior = true;
42+
43+
// General entry point: forwards to doCall with isAscii = false.
FOLLY_ALWAYS_INLINE void call(
44+
out_type<Varchar>& result,
45+
const arg_type<Varchar>& input,
46+
int32_t limit) {
47+
doCall<false>(result, input, limit);
48+
}
49+
50+
// ASCII entry point: forwards to doCall with isAscii = true.
FOLLY_ALWAYS_INLINE void callAscii(
51+
out_type<Varchar>& result,
52+
const arg_type<Varchar>& input,
53+
int32_t limit) {
54+
doCall<true>(result, input, limit);
55+
}
56+
57+
private:
58+
// Shared implementation of call() and callAscii(). `isAscii` is forwarded to
// stringImpl::length when counting the characters of `input`.
template <bool isAscii>
59+
FOLLY_ALWAYS_INLINE void doCall(
60+
out_type<Varchar>& result,
61+
const arg_type<Varchar>& input,
62+
int32_t limit) {
63+
// Rejects zero as well as negative limits.
VELOX_USER_CHECK_GT(limit, 0, "The length limit must be greater than 0.");
64+
65+
auto numCharacters = stringImpl::length<isAscii>(input);
66+
if (numCharacters <= limit) {
67+
// Already within the limit: the result refers to the input unchanged.
result.setNoCopy(input);
68+
} else {
69+
// Too long: trim trailing spaces down to `limit`, or throw if that is not
// enough.
trimTrailingSpaces(result, input, numCharacters, limit);
70+
}
71+
}
72+
};
73+
74+
} // namespace facebook::velox::functions::sparksql

velox/functions/sparksql/registration/RegisterString.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "velox/functions/sparksql/Split.h"
2525
#include "velox/functions/sparksql/String.h"
2626
#include "velox/functions/sparksql/StringToMap.h"
27+
#include "velox/functions/sparksql/VarcharTypeWriteSideCheck.h"
2728

2829
namespace facebook::velox::functions {
2930
void registerSparkStringFunctions(const std::string& prefix) {
@@ -161,6 +162,11 @@ void registerStringFunctions(const std::string& prefix) {
161162
prefix + "lower",
162163
SparkLowerFunction::signatures(),
163164
std::make_unique<SparkLowerFunction>());
165+
registerFunction<
166+
VarcharTypeWriteSideCheckFunction,
167+
Varchar,
168+
Varchar,
169+
int32_t>({prefix + "varchar_type_write_side_check"});
164170
}
165171
} // namespace sparksql
166172
} // namespace facebook::velox::functions

velox/functions/sparksql/tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ add_executable(
6868
UnscaledValueFunctionTest.cpp
6969
UpperLowerTest.cpp
7070
UuidTest.cpp
71+
VarcharTypeWriteSideCheckTest.cpp
7172
XxHash64Test.cpp)
7273

7374
add_test(velox_functions_spark_test velox_functions_spark_test)

velox/functions/sparksql/tests/StringTest.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1055,5 +1055,6 @@ TEST_F(StringTest, empty2Null) {
10551055
EXPECT_EQ(empty2Null(""), std::nullopt);
10561056
EXPECT_EQ(empty2Null("abc"), "abc");
10571057
}
1058+
10581059
} // namespace
10591060
} // namespace facebook::velox::functions::sparksql::test
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#include "velox/common/base/tests/GTestUtils.h"
17+
#include "velox/functions/sparksql/tests/SparkFunctionBaseTest.h"
18+
19+
namespace facebook::velox::functions::sparksql::test {
20+
namespace {
21+
22+
// Test fixture for varchar_type_write_side_check; adds nothing on top of
// SparkFunctionBaseTest.
class VarcharTypeWriteSideCheckTest : public SparkFunctionBaseTest {};
23+
24+
// Exercises varchar_type_write_side_check through the expression evaluator:
// inputs within the limit, inputs with trailing spaces, non-ASCII inputs,
// inputs that are too long, null input, and non-positive limits.
// NOTE(review): some literals below may have had runs of spaces collapsed when
// this page was captured; confirm them against the repository.
TEST_F(VarcharTypeWriteSideCheckTest, varcharTypeWriteSideCheck) {
25+
// Evaluates the function once with `input` as c0 and `limit` as c1.
const auto varcharTypeWriteSideCheck =
26+
[&](const std::optional<std::string>& input,
27+
const std::optional<int32_t>& limit) {
28+
return evaluateOnce<std::string>(
29+
"varchar_type_write_side_check(c0, c1)", input, limit);
30+
};
31+
32+
// Basic cases - string length <= limit.
33+
EXPECT_EQ(varcharTypeWriteSideCheck("abc", 3), "abc");
34+
EXPECT_EQ(varcharTypeWriteSideCheck("ab", 3), "ab");
35+
EXPECT_EQ(varcharTypeWriteSideCheck("", 5), "");
36+
37+
// Cases with trailing spaces.
38+
// Edge cases - input string is longer than limit but trims to exactly limit.
39+
EXPECT_EQ(varcharTypeWriteSideCheck("abc ", 3), "abc");
40+
EXPECT_EQ(varcharTypeWriteSideCheck("abc ", 4), "abc ");
41+
EXPECT_EQ(varcharTypeWriteSideCheck("abc ", 5), "abc ");
42+
43+
// Unicode string cases with trailing spaces.
44+
EXPECT_EQ(varcharTypeWriteSideCheck("世界 ", 2), "世界");
45+
EXPECT_EQ(varcharTypeWriteSideCheck("世界", 2), "世界");
46+
47+
// Error cases - string length > limit even after trimming trailing spaces.
48+
VELOX_ASSERT_USER_THROW(
49+
varcharTypeWriteSideCheck("abcd", 3),
50+
"Exceeds allowed length limitation: 3");
51+
VELOX_ASSERT_USER_THROW(
52+
varcharTypeWriteSideCheck("世界人", 2),
53+
"Exceeds allowed length limitation: 2");
54+
VELOX_ASSERT_USER_THROW(
55+
varcharTypeWriteSideCheck("abc def", 5),
56+
"Exceeds allowed length limitation: 5");
57+
58+
// Null input cases.
59+
EXPECT_EQ(varcharTypeWriteSideCheck(std::nullopt, 5), std::nullopt);
60+
61+
// Edge cases - length limit must be positive
62+
VELOX_ASSERT_USER_THROW(
63+
varcharTypeWriteSideCheck("abc", 0),
64+
"The length limit must be greater than 0.");
65+
VELOX_ASSERT_USER_THROW(
66+
varcharTypeWriteSideCheck("abc", -1),
67+
"The length limit must be greater than 0.");
68+
69+
// Edge cases - input string is all spaces.
70+
EXPECT_EQ(varcharTypeWriteSideCheck(" ", 2), " ");
71+
EXPECT_EQ(varcharTypeWriteSideCheck(" ", 3), " ");
72+
EXPECT_EQ(varcharTypeWriteSideCheck(" ", 1), " ");
73+
}
74+
75+
} // namespace
76+
} // namespace facebook::velox::functions::sparksql::test

0 commit comments

Comments
 (0)