Skip to content

Commit 7bf9b57

Browse files
Leziakpradeepvaka
authored and committed
Add longest common prefix implementation, documentation and tests
1 parent 073d3f1 commit 7bf9b57

File tree

3 files changed

+49
-0
lines changed

3 files changed

+49
-0
lines changed

presto-docs/src/main/sphinx/functions/string.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,10 @@ String Functions
6666
i.e. the minimum number of single-character edits (insertions,
6767
deletions or substitutions) needed to change ``string1`` into ``string2``.
6868

69+
.. function:: longest_common_prefix(string1, string2) -> varchar
70+
71+
Returns the longest common prefix between ``string1`` and ``string2``.
72+
6973
.. function:: lower(string) -> varchar
7074

7175
Converts ``string`` to lowercase.

presto-main-base/src/main/java/com/facebook/presto/operator/scalar/StringFunctions.java

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -756,6 +756,27 @@ public static Slice rightPad(@SqlType("varchar(x)") Slice text, @SqlType(Standar
756756
return pad(text, targetLength, padString, text.length());
757757
}
758758

759+
@Description("returns the longest common prefix shared by two strings")
760+
@ScalarFunction("longest_common_prefix")
761+
@LiteralParameters({"x", "y"})
762+
@SqlType(StandardTypes.VARCHAR)
763+
public static Slice longestCommonPrefix(@SqlType("varchar(x)") Slice left, @SqlType("varchar(y)") Slice right)
764+
{
765+
int i = 0;
766+
int byteIndex = 0;
767+
int[] leftCodePoints = castToCodePoints(left);
768+
int[] rightCodePoints = castToCodePoints(right);
769+
int leftLength = leftCodePoints.length;
770+
int rightLength = rightCodePoints.length;
771+
772+
while (i < leftLength && i < rightLength && leftCodePoints[i] == rightCodePoints[i]) {
773+
i++;
774+
byteIndex += SliceUtf8.lengthOfCodePointSafe(left, byteIndex);
775+
}
776+
777+
return Slices.wrappedBuffer(left.getBytes(0, byteIndex));
778+
}
779+
759780
@Description("computes Levenshtein distance between two strings")
760781
@ScalarFunction
761782
@LiteralParameters({"x", "y"})

presto-main-base/src/test/java/com/facebook/presto/operator/scalar/TestStringFunctions.java

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,30 @@ public void testCharLength()
164164
assertFunction("LENGTH(CAST('\u4FE1\u5FF5,\u7231,\u5E0C\u671B' AS CHAR(20)))", BIGINT, 20L);
165165
}
166166

167+
@Test
168+
public void testLongestCommonPrefix()
169+
{
170+
assertFunction("LONGEST_COMMON_PREFIX('', '')", VARCHAR, "");
171+
assertFunction("LONGEST_COMMON_PREFIX('', 'hello')", VARCHAR, "");
172+
assertFunction("LONGEST_COMMON_PREFIX('hello', '')", VARCHAR, "");
173+
assertFunction("LONGEST_COMMON_PREFIX('hello', 'hello')", VARCHAR, "hello");
174+
assertFunction("LONGEST_COMMON_PREFIX('hello world', 'hello')", VARCHAR, "hello");
175+
assertFunction("LONGEST_COMMON_PREFIX('hello', 'hello world')", VARCHAR, "hello");
176+
assertFunction("LONGEST_COMMON_PREFIX('hello world', 'hel wold')", VARCHAR, "hel");
177+
assertFunction("LONGEST_COMMON_PREFIX('hel wold', 'hello world')", VARCHAR, "hel");
178+
179+
// Test for non-ASCII
180+
assertFunction("LONGEST_COMMON_PREFIX('\u4FE1\u5FF5,\u7231,\u5E0C\u671B', '')", VARCHAR, "");
181+
assertFunction("LONGEST_COMMON_PREFIX('', '\u4FE1\u5FF5,\u7231,\u5E0C\u671B')", VARCHAR, "");
182+
assertFunction("LONGEST_COMMON_PREFIX('\u4FE1\u5FF5,\u7231,\u5E0C\u671B', '\u4FE1\u5FF5,\u7231,\u5E0C\u671B')", VARCHAR, "\u4FE1\u5FF5,\u7231,\u5E0C\u671B");
183+
assertFunction("LONGEST_COMMON_PREFIX('\u4FE1\u5FF5,\u7221,\u5E0C\u671B', '\u4FE1\u5FF5,\u7231,\u5E0C\u671B')", VARCHAR, "\u4FE1\u5FF5,");
184+
assertFunction("LONGEST_COMMON_PREFIX('hello na\u00EFve world', 'hello na\u00EFve')", VARCHAR, "hello na\u00EFve");
185+
186+
// Test for invalid-utf8 characters
187+
assertInvalidFunction("LONGEST_COMMON_PREFIX('hello world', utf8(from_hex('81')))", "Invalid UTF-8 encoding in characters: �");
188+
assertInvalidFunction("LONGEST_COMMON_PREFIX('hello world', utf8(from_hex('3281')))", "Invalid UTF-8 encoding in characters: 2�");
189+
}
190+
167191
@Test
168192
public void testLevenshteinDistance()
169193
{

0 commit comments

Comments
 (0)