diff --git a/src/org/rascalmpl/library/String.rsc b/src/org/rascalmpl/library/String.rsc index a9fe8ce8ade..73ddc3264ee 100644 --- a/src/org/rascalmpl/library/String.rsc +++ b/src/org/rascalmpl/library/String.rsc @@ -20,7 +20,9 @@ module String extend Exception; import List; +import Map; import ParseTree; +import Set; @synopsis{All functions in this module that have a charset parameter use this as default.} private str DEFAULT_CHARSET = "UTF-8"; @@ -680,4 +682,124 @@ or the indentation. * This function works fine if `indentation` is not spaces or tabs; but it does not make much sense. } @javaClass{org.rascalmpl.library.Prelude} -java str indent(str indentation, str content, bool indentFirstLine=false); \ No newline at end of file +java str indent(str indentation, str content, bool indentFirstLine=false); + +list[str] newLineCharacters = [ + "\u000A", // LF + "\u000B", // VT + "\u000C", // FF + "\u000D", // CR + "\u000D\u000A", // CRLF + "\u0085", // NEL + "\u2028", // LS + "\u2029" // PS +]; + +@synopsis{Comparator to sort strings by length (ascending).} +private bool bySize(str a, str b) = size(a) < size(b); + +@synopsis{Comparator to sort strings by relative position in a reference list.} +private bool(str, str) byIndex(list[str] indices) { + return bool(str a, str b) { + return indexOf(indices, a) < indexOf(indices, b); + }; +} + +@synopsis{Determine the most-used newline character in a string; the candidates with the highest count are passed to `tieBreaker`, ordered as in `lineseps`.} +str mostUsedNewline(str input, list[str] lineseps = newLineCharacters, str(list[str]) tieBreaker = getFirstFrom) { + linesepCounts = (nl: 0 | nl <- lineseps); + for (nl <- sort(lineseps, bySize)) { + int count = size(findAll(input, nl)); + linesepCounts[nl] = count; + // subtract all occurrences of substrings of newline characters that we counted before + for (str snl <- substrings(nl), linesepCounts[snl]?) 
{ + linesepCounts[snl] = linesepCounts[snl] - count; + } + } + + byCount = invert(linesepCounts); + return tieBreaker(sort(byCount[max(domain(byCount))], byIndex(lineseps))); +} + +@synopsis{Split a string into an indentation prefix and the remainder of the string.} +tuple[str indentation, str rest] splitIndentation(/^/) + = ; + +str(str) indentSpacesAsTabs(int tabSize) { + str spaces = ("" | it + " " | _ <- [0..tabSize]); + return str(str line) { + parts = splitIndentation(line); + return ""; + }; +} + +str(str) indentTabsAsSpaces(int tabSize) { + str spaces = ("" | it + " " | _ <- [0..tabSize]); + return str(str line) { + parts = splitIndentation(line); + return ""; + }; +} + +@synopsis{Compute all possible strict substrings of a string.} +@pitfalls{ +* Does not include the empty string. +* Does not include the input string itself. +* The number of substrings is quadratic in the size of the string; expensive to compute. +} +set[str] substrings(str input) + = {input[i..i+l] | int i <- [0..size(input)], int l <- [1..size(input)-i+1]} - input; + +@synopsis{If a string does not end with a newline character, append its most-used newline character.} +str insertFinalNewline(str input, list[str] lineseps = newLineCharacters) + = any(nl <- lineseps, endsWith(input, nl)) + ? 
input + : input + mostUsedNewline(input, lineseps=lineseps) + ; + +@synopsis{Remove all newlines from the end of a string.} +str trimFinalNewlines(str input, list[str] lineseps = newLineCharacters) { + orderedSeps = reverse(sort(lineseps, bySize)); + while (nl <- orderedSeps, endsWith(input, nl)) { + input = input[0..-size(nl)]; + } + return input; +} + +@synopsis{Split a string into one pair of line content and line separator per line.} +list[tuple[str, str]] separateLines(str input, bool includeEmptyLastLine = false, list[str] lineseps = newLineCharacters) { + orderedSeps = reverse(sort(lineseps, bySize)); + + list[tuple[str, str]] lines = []; + int next = 0; + for (int i <- [0..size(input)], i >= next) { + // greedily match line separators (longest first) + if (str nl <- orderedSeps, nl == input[i..i+size(nl)]) { + lines += ; + next = i + size(nl); // skip to the start of the next line + } + } + + // last line + if (next < size(input) || includeEmptyLastLine) { + lines += ; + } + + return lines; +} + +@synopsis{Concatenate a list of line and separator pairs to form a single string.} +str mergeLines(list[tuple[str, str]] lines) + = ("" | it + line + sep | <- lines); + +@synopsis{Process the text of a string per line, maintaining the original newline characters.} +str perLine(str input, str(str) lineFunc, bool includeEmptyLastLine = false, list[str] lineseps = newLineCharacters) + = mergeLines([ | <- separateLines(input, includeEmptyLastLine=includeEmptyLastLine, lineseps=lineseps)]); + +@synopsis{Trim trailing non-newline whitespace from each line in a multi-line string.} +str trimTrailingWhitespace(str input, list[str] lineseps = newLineCharacters) { + str trimLineTrailingWs(/^\s*$/) = nonWhiteSpace; + default str trimLineTrailingWs(/^\s*$/) = ""; + + return perLine(input, trimLineTrailingWs, lineseps=lineseps); +} diff --git a/src/org/rascalmpl/library/lang/rascal/tests/library/String.rsc b/src/org/rascalmpl/library/lang/rascal/tests/library/String.rsc index f806220cb4b..ea53c6e655f 100644 --- 
a/src/org/rascalmpl/library/lang/rascal/tests/library/String.rsc +++ b/src/org/rascalmpl/library/lang/rascal/tests/library/String.rsc @@ -261,3 +261,39 @@ test bool testBase32AllChars1() = testBase32("`1234567890-=~!@#$%^&*"); test bool testBase32AllChars2() = testBase32("()_+qwertyuiop[]\\QWERTYUIOP"); test bool testBase32AllChars3() = testBase32("{}|asdfghjkl;\'ASDFGHJKL:\""); test bool testBase32AllChars4() = testBase32("zxcvbnm,./ZXCVBNM\<\>? "); + +// mostUsedNewline +test bool mostUsedNewlineTestMixed() = mostUsedNewline("\r\n\n\r\n\t\t\t\t") == "\r\n"; +test bool mostUsedNewlineTestTie() = mostUsedNewline("\n\n\r\n\r\n") == "\n"; +test bool mostUsedNewlineTestNone() = mostUsedNewline("abcdefg") == "\n"; +test bool mostUsedNewlineTestGreedy() = mostUsedNewline("\r\n\r\n\n") == "\r\n"; + +// insertFinalNewline +test bool insertFinalNewlineTestSimple() = insertFinalNewline("a\nb") == "a\nb\n"; +test bool insertFinalNewlineTestNoop() = insertFinalNewline("a\nb\n") == "a\nb\n"; +test bool insertFinalNewlineTestMixed() = insertFinalNewline("a\nb\r\n") == "a\nb\r\n"; + +// trimFinalNewlines +test bool trimFinalNewlineTestSimple() = trimFinalNewlines("a\n\n\n") == "a"; +test bool trimFinalNewlineTestEndOnly() = trimFinalNewlines("a\n\n\nb\n\n") == "a\n\n\nb"; +test bool trimFinalNewlineTestWhiteSpace() = trimFinalNewlines("a\n\n\nb\n\n ") == "a\n\n\nb\n\n "; + +// trimTrailingWhitespace +test bool trimTrailingWhitespaceTest() = trimTrailingWhitespace("a \nb\t\n c \n") == "a\nb\n c\n"; + +// perLine +test bool perLineTest() = perLine("a\nb\r\nc\n\r\n", str(str line) { return line + "x"; }) == "ax\nbx\r\ncx\nx\r\n"; + +// separateLines +test bool separateLinesTestSimple() = separateLines("a\nb\r\nc\n\r\n") == [<"a", "\n">, <"b", "\r\n">, <"c", "\n">, <"", "\r\n">]; +test bool separateLinesTestSimpleWithLast() = separateLines("a\nb\r\nc\n\r\n", includeEmptyLastLine=true) == [<"a", "\n">, <"b", "\r\n">, <"c", "\n">, <"", "\r\n">, <"", "">]; +test bool 
separateLinesTestNoFinalNewline() = separateLines("a\nb\r\nc") == [<"a", "\n">, <"b", "\r\n">, <"c", "">]; +test bool separateLinesTestNoFinalNewlineNoEmpty() = separateLines("a\nb\r\nc", includeEmptyLastLine=true) == [<"a", "\n">, <"b", "\r\n">, <"c", "">]; +test bool separateLinesTestOnlyNewlines() = separateLines("\n\r\n\n\r\n") == [<"", "\n">, <"", "\r\n">, <"", "\n">, <"", "\r\n">]; +test bool separateLinesTestNoNewlines() = separateLines("abc") == [<"abc", "">]; + +// substrings +test bool substringsTestEmpty() = substrings("") == {}; +test bool substringsTestSingle() = substrings("a") == {}; +test bool substringsTestTwo() = substrings("ab") == {"a", "b"}; +test bool substringsTestThree() = substrings("abc") == {"a", "b", "c", "ab", "bc"};