Skip to content
This repository was archived by the owner on Jan 9, 2020. It is now read-only.

Commit c66d64b

Browse files
kevinyu98gatorsmile
authored andcommitted
[SPARK-14878][SQL] Trim characters string function support
#### What changes were proposed in this pull request? This PR enhances the TRIM function support in Spark SQL by allowing the specification of a trim character set. Below is the SQL syntax: ``` SQL <trim function> ::= TRIM <left paren> <trim operands> <right paren> <trim operands> ::= [ [ <trim specification> ] [ <trim character set> ] FROM ] <trim source> <trim source> ::= <character value expression> <trim specification> ::= LEADING | TRAILING | BOTH <trim character set> ::= <characters value expression> ``` or ``` SQL LTRIM (source-exp [, trim-exp]) RTRIM (source-exp [, trim-exp]) ``` Here are links to the documentation of this feature in other mainstream databases. - **Oracle:** [TRIM function](http://docs.oracle.com/cd/B28359_01/olap.111/b28126/dml_functions_2126.htm#OLADM704) - **DB2:** [TRIM scalar function](https://www.ibm.com/support/knowledgecenter/en/SSMKHH_10.0.0/com.ibm.etools.mft.doc/ak05270_.htm) - **MySQL:** [Trim function](http://dev.mysql.com/doc/refman/5.7/en/string-functions.html#function_trim) - **Oracle:** [ltrim](https://docs.oracle.com/cd/B28359_01/olap.111/b28126/dml_functions_2018.htm#OLADM594) - **DB2:** [ltrim](https://www.ibm.com/support/knowledgecenter/en/SSEPEK_11.0.0/sqlref/src/tpc/db2z_bif_ltrim.html) This PR implements the above enhancement. In the implementation, the design principle is to keep the changes to a minimum. Also, the existing trim functions (which handle a special case, i.e., trimming space characters) are kept unchanged for performance reasons. #### How was this patch tested? Unit test cases are added in the following files: - UTF8StringSuite.java - StringExpressionsSuite.scala - sql/SQLQuerySuite.scala - StringFunctionsSuite.scala Author: Kevin Yu <[email protected]> Closes apache#12646 from kevinyu98/spark-14878.
1 parent 3b049ab commit c66d64b

File tree

10 files changed

+554
-32
lines changed

10 files changed

+554
-32
lines changed

common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -511,6 +511,21 @@ public UTF8String trim() {
511511
}
512512
}
513513

514+
/**
515+
* Based on the given trim string, trim this string starting from both ends
516+
* This method searches for each character in the source string, removes the character if it is found
517+
* in the trim string, stops at the first not found. It calls the trimLeft first, then trimRight.
518+
* It returns a new string in which both ends trim characters have been removed.
519+
* @param trimString the trim character string
520+
*/
521+
public UTF8String trim(UTF8String trimString) {
522+
if (trimString != null) {
523+
return trimLeft(trimString).trimRight(trimString);
524+
} else {
525+
return null;
526+
}
527+
}
528+
514529
public UTF8String trimLeft() {
515530
int s = 0;
516531
// skip all of the space (0x20) in the left side
@@ -523,6 +538,40 @@ public UTF8String trimLeft() {
523538
}
524539
}
525540

541+
/**
542+
* Based on the given trim string, trim this string starting from left end
543+
* This method searches each character in the source string starting from the left end, removes the character if it
544+
* is in the trim string, stops at the first character which is not in the trim string, returns the new string.
545+
* @param trimString the trim character string
546+
*/
547+
public UTF8String trimLeft(UTF8String trimString) {
548+
if (trimString == null) return null;
549+
// the searching byte position in the source string
550+
int srchIdx = 0;
551+
// the first beginning byte position of a non-matching character
552+
int trimIdx = 0;
553+
554+
while (srchIdx < numBytes) {
555+
UTF8String searchChar = copyUTF8String(srchIdx, srchIdx + numBytesForFirstByte(this.getByte(srchIdx)) - 1);
556+
int searchCharBytes = searchChar.numBytes;
557+
// try to find the matching for the searchChar in the trimString set
558+
if (trimString.find(searchChar, 0) >= 0) {
559+
trimIdx += searchCharBytes;
560+
} else {
561+
// no matching, exit the search
562+
break;
563+
}
564+
srchIdx += searchCharBytes;
565+
}
566+
567+
if (trimIdx >= numBytes) {
568+
// empty string
569+
return EMPTY_UTF8;
570+
} else {
571+
return copyUTF8String(trimIdx, numBytes - 1);
572+
}
573+
}
574+
526575
public UTF8String trimRight() {
527576
int e = numBytes - 1;
528577
// skip all of the space (0x20) in the right side
@@ -536,6 +585,50 @@ public UTF8String trimRight() {
536585
}
537586
}
538587

588+
/**
589+
* Based on the given trim string, trim this string starting from right end
590+
* This method searches each character in the source string starting from the right end, removes the character if it
591+
* is in the trim string, stops at the first character which is not in the trim string, returns the new string.
592+
* @param trimString the trim character string
593+
*/
594+
public UTF8String trimRight(UTF8String trimString) {
595+
if (trimString == null) return null;
596+
int charIdx = 0;
597+
// number of characters from the source string
598+
int numChars = 0;
599+
// array of character length for the source string
600+
int[] stringCharLen = new int[numBytes];
601+
// array of the first byte position for each character in the source string
602+
int[] stringCharPos = new int[numBytes];
603+
// build the position and length array
604+
while (charIdx < numBytes) {
605+
stringCharPos[numChars] = charIdx;
606+
stringCharLen[numChars] = numBytesForFirstByte(getByte(charIdx));
607+
charIdx += stringCharLen[numChars];
608+
numChars ++;
609+
}
610+
611+
// index trimEnd points to the first no matching byte position from the right side of the source string.
612+
int trimEnd = numBytes - 1;
613+
while (numChars > 0) {
614+
UTF8String searchChar =
615+
copyUTF8String(stringCharPos[numChars - 1], stringCharPos[numChars - 1] + stringCharLen[numChars - 1] - 1);
616+
if (trimString.find(searchChar, 0) >= 0) {
617+
trimEnd -= stringCharLen[numChars - 1];
618+
} else {
619+
break;
620+
}
621+
numChars --;
622+
}
623+
624+
if (trimEnd < 0) {
625+
// empty string
626+
return EMPTY_UTF8;
627+
} else {
628+
return copyUTF8String(0, trimEnd);
629+
}
630+
}
631+
539632
public UTF8String reverse() {
540633
byte[] result = new byte[this.numBytes];
541634

common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -730,4 +730,61 @@ public void testToLong() throws IOException {
730730
assertFalse(negativeInput, UTF8String.fromString(negativeInput).toLong(wrapper));
731731
}
732732
}
733+
734+
@Test
735+
public void trimBothWithTrimString() {
736+
assertEquals(fromString("hello"), fromString(" hello ").trim(fromString(" ")));
737+
assertEquals(fromString("o"), fromString(" hello ").trim(fromString(" hle")));
738+
assertEquals(fromString("h e"), fromString("ooh e ooo").trim(fromString("o ")));
739+
assertEquals(fromString(""), fromString("ooo...oooo").trim(fromString("o.")));
740+
assertEquals(fromString("b"), fromString("%^b[]@").trim(fromString("][@^%")));
741+
742+
assertEquals(EMPTY_UTF8, fromString(" ").trim(fromString(" ")));
743+
744+
assertEquals(fromString("数据砖头"), fromString(" 数据砖头 ").trim());
745+
assertEquals(fromString("数"), fromString("a数b").trim(fromString("ab")));
746+
assertEquals(fromString(""), fromString("a").trim(fromString("a数b")));
747+
assertEquals(fromString(""), fromString("数数 数数数").trim(fromString("数 ")));
748+
assertEquals(fromString("据砖头"), fromString("数]数[数据砖头#数数").trim(fromString("[数]#")));
749+
assertEquals(fromString("据砖头数数 "), fromString("数数数据砖头数数 ").trim(fromString("数")));
750+
}
751+
752+
@Test
753+
public void trimLeftWithTrimString() {
754+
assertEquals(fromString(" hello "), fromString(" hello ").trimLeft(fromString("")));
755+
assertEquals(fromString(""), fromString("a").trimLeft(fromString("a")));
756+
assertEquals(fromString("b"), fromString("b").trimLeft(fromString("a")));
757+
assertEquals(fromString("ba"), fromString("ba").trimLeft(fromString("a")));
758+
assertEquals(fromString(""), fromString("aaaaaaa").trimLeft(fromString("a")));
759+
assertEquals(fromString("trim"), fromString("oabtrim").trimLeft(fromString("bao")));
760+
assertEquals(fromString("rim "), fromString("ooootrim ").trimLeft(fromString("otm")));
761+
762+
assertEquals(EMPTY_UTF8, fromString(" ").trimLeft(fromString(" ")));
763+
764+
assertEquals(fromString("数据砖头 "), fromString(" 数据砖头 ").trimLeft(fromString(" ")));
765+
assertEquals(fromString("数"), fromString("数").trimLeft(fromString("a")));
766+
assertEquals(fromString("a"), fromString("a").trimLeft(fromString("数")));
767+
assertEquals(fromString("砖头数数"), fromString("数数数据砖头数数").trimLeft(fromString("据数")));
768+
assertEquals(fromString("据砖头数数"), fromString(" 数数数据砖头数数").trimLeft(fromString("数 ")));
769+
assertEquals(fromString("据砖头数数"), fromString("aa数数数据砖头数数").trimLeft(fromString("a数砖")));
770+
assertEquals(fromString("$S,.$BR"), fromString(",,,,%$S,.$BR").trimLeft(fromString("%,")));
771+
}
772+
773+
@Test
774+
public void trimRightWithTrimString() {
775+
assertEquals(fromString(" hello "), fromString(" hello ").trimRight(fromString("")));
776+
assertEquals(fromString(""), fromString("a").trimRight(fromString("a")));
777+
assertEquals(fromString("cc"), fromString("ccbaaaa").trimRight(fromString("ba")));
778+
assertEquals(fromString(""), fromString("aabbbbaaa").trimRight(fromString("ab")));
779+
assertEquals(fromString(" he"), fromString(" hello ").trimRight(fromString(" ol")));
780+
assertEquals(fromString("oohell"), fromString("oohellooo../*&").trimRight(fromString("./,&%*o")));
781+
782+
assertEquals(EMPTY_UTF8, fromString(" ").trimRight(fromString(" ")));
783+
784+
assertEquals(fromString(" 数据砖头"), fromString(" 数据砖头 ").trimRight(fromString(" ")));
785+
assertEquals(fromString("数数砖头"), fromString("数数砖头数aa数").trimRight(fromString("a数")));
786+
assertEquals(fromString(""), fromString("数数数据砖ab").trimRight(fromString("数据砖ab")));
787+
assertEquals(fromString("头"), fromString("头a???/").trimRight(fromString("数?/*&^%a")));
788+
assertEquals(fromString("头"), fromString("头数b数数 [").trimRight(fromString(" []数b")));
789+
}
733790
}

sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -580,6 +580,8 @@ primaryExpression
580580
| '(' query ')' #subqueryExpression
581581
| qualifiedName '(' (setQuantifier? argument+=expression (',' argument+=expression)*)? ')'
582582
(OVER windowSpec)? #functionCall
583+
| qualifiedName '(' trimOption=(BOTH | LEADING | TRAILING) argument+=expression
584+
FROM argument+=expression ')' #functionCall
583585
| value=primaryExpression '[' index=valueExpression ']' #subscript
584586
| identifier #columnReference
585587
| base=primaryExpression '.' fieldName=identifier #dereference
@@ -748,6 +750,7 @@ nonReserved
748750
| UNBOUNDED | WHEN
749751
| DATABASE | SELECT | FROM | WHERE | HAVING | TO | TABLE | WITH | NOT | CURRENT_DATE | CURRENT_TIMESTAMP
750752
| DIRECTORY
753+
| BOTH | LEADING | TRAILING
751754
;
752755

753756
SELECT: 'SELECT';
@@ -861,6 +864,9 @@ COMMIT: 'COMMIT';
861864
ROLLBACK: 'ROLLBACK';
862865
MACRO: 'MACRO';
863866
IGNORE: 'IGNORE';
867+
BOTH: 'BOTH';
868+
LEADING: 'LEADING';
869+
TRAILING: 'TRAILING';
864870

865871
IF: 'IF';
866872
POSITION: 'POSITION';

0 commit comments

Comments
 (0)