Skip to content

Commit fce2361

Browse files
committed
Refactor splitPart function to reduce memory allocation
1 parent 78613ef commit fce2361

File tree

1 file changed

+77
-15
lines changed

1 file changed

+77
-15
lines changed

pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java

Lines changed: 77 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -551,14 +551,7 @@ public static String[] suffixesWithSuffix(String input, int maxlength, @Nullable
551551
*/
552552
@ScalarFunction
553553
public static String splitPart(String input, String delimiter, int index) {
554-
String[] splitString = StringUtils.splitByWholeSeparator(input, delimiter);
555-
if (index >= 0 && index < splitString.length) {
556-
return splitString[index];
557-
} else if (index < 0 && index >= -splitString.length) {
558-
return splitString[splitString.length + index];
559-
} else {
560-
return "null";
561-
}
554+
return splitPart(input, delimiter, 0, index);
562555
}
563556

564557
/**
@@ -570,14 +563,83 @@ public static String splitPart(String input, String delimiter, int index) {
570563
*/
571564
@ScalarFunction
572565
public static String splitPart(String input, String delimiter, int limit, int index) {
573-
String[] splitString = StringUtils.splitByWholeSeparator(input, delimiter, limit);
574-
if (index >= 0 && index < splitString.length) {
575-
return splitString[index];
576-
} else if (index < 0 && index >= -splitString.length) {
577-
return splitString[splitString.length + index];
578-
} else {
566+
if (delimiter == null || delimiter.isEmpty()) {
579567
return "null";
580568
}
569+
570+
// Normalize limit: non-positive means no limit
571+
int maxParts = (limit <= 0) ? Integer.MAX_VALUE : limit;
572+
573+
int targetIndex = index;
574+
575+
// Handle Negative Index: We must count total tokens first
576+
if (index < 0) {
577+
// Pass -1 as targetIndex to run in "Count Mode"
578+
int totalTokens = scanAndGet(input, delimiter, maxParts, -1, null);
579+
580+
targetIndex = totalTokens + index;
581+
if (targetIndex < 0) {
582+
return "null";
583+
}
584+
}
585+
586+
// Handle Positive Index: Retrieve the token
587+
// We use a 1-element array as a mutable container to avoid allocating a wrapper object
588+
String[] result = new String[1];
589+
scanAndGet(input, delimiter, maxParts, targetIndex, result);
590+
591+
return result[0] != null ? result[0] : "null";
592+
}
593+
594+
/**
595+
* Unified logic to scan tokens.
596+
* If container is null -> Returns token count (Count Mode).
597+
* If container is set -> Extracts the token at targetIndex (Fetch Mode).
598+
*/
599+
private static int scanAndGet(String input, String delimiter, int maxParts, int targetIndex, String[] container) {
600+
int count = 0;
601+
int start = 0;
602+
int len = input.length();
603+
int dLen = delimiter.length();
604+
605+
while (start < len) {
606+
int nextDelim = input.indexOf(delimiter, start);
607+
608+
// Check if this is the last token (End of string OR Hit limit)
609+
if (nextDelim == -1 || (count + 1 == maxParts)) {
610+
if (targetIndex == count && container != null) {
611+
container[0] = input.substring(start);
612+
}
613+
return count + 1; // Return total count (current + 1)
614+
}
615+
616+
// Skip empty tokens (consecutive delimiters)
617+
if (nextDelim == start) {
618+
start += dLen;
619+
continue;
620+
}
621+
622+
// Found a standard token
623+
if (targetIndex == count) {
624+
if (container != null) {
625+
container[0] = input.substring(start, nextDelim);
626+
}
627+
return count; // Found target, return doesn't matter much here but strictly it's 'count'
628+
}
629+
630+
count++;
631+
start = nextDelim + dLen;
632+
}
633+
634+
// Edge Case: Input purely delimiters (e.g. "+++++") or empty
635+
if (count == 0 && len > 0) {
636+
if (targetIndex == 0 && container != null) {
637+
container[0] = "";
638+
}
639+
return 1;
640+
}
641+
642+
return count;
581643
}
582644

583645
/**
@@ -672,7 +734,7 @@ public static int levenshteinDistance(String input1, String input2) {
672734
int cost = (input1.charAt(i - 1) == input2.charAt(j - 1)) ? 0 : 1;
673735
dp[i][j] = Math.min(
674736
Math.min(dp[i - 1][j] + 1, // deletion
675-
dp[i][j - 1] + 1), // insertion
737+
dp[i][j - 1] + 1), // insertion
676738
dp[i - 1][j - 1] + cost // substitution
677739
);
678740
}

0 commit comments

Comments
 (0)