Skip to content

Commit 05bc6f0

Browse files
committed
GR-13919: improve the (r/l)strip string nodes for whitespace removal
1 parent a901ca5 commit 05bc6f0

File tree

3 files changed

+203
-7
lines changed

3 files changed

+203
-7
lines changed

graalpython/com.oracle.graal.python.test/src/tests/test_string.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ def __init__(self, value):
1313
def __index__(self):
1414
return self.value
1515

16+
1617
def test_find():
1718
assert "teststring".find("test") == 0
1819
assert "teststring".find("string") == 4
@@ -80,6 +81,7 @@ def test_rfind():
8081
assert s.rfind('ahoj', 16, 20) == 16
8182
assert s.rfind('ahoj', 16, 19) == -1
8283

84+
8385
def test_format():
8486
assert "{}.{}".format("part1", "part2") == "part1.part2"
8587
assert "{0}.{1}".format("part1", "part2") == "part1.part2"
@@ -1023,34 +1025,58 @@ def test_translate():
10231025
else:
10241026
assert False, "should raise"
10251027

1028+
10261029
def test_translate_from_byte_table():
    """str.translate accepts a bytes, bytearray or memoryview translation table."""
    lowercase = bytes(string.ascii_lowercase, 'ascii')
    uppercase = bytes(string.ascii_uppercase, 'ascii')
    table = bytes.maketrans(lowercase, uppercase)

    # the same 256-entry table, offered through three different buffer types
    for candidate in (table, bytearray(table), memoryview(table)):
        assert "ahoj".translate(candidate) == "AHOJ"
10311034

1035+
10321036
def test_tranlslate_from_short_table():
    """Characters whose ordinal is past the end of the table are left untouched."""
    # the table stops after the entry for 'h', so 'o' and 'j' have no mapping
    short_table = b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`ABCDEFGH'
    translated = "ahoj".translate(short_table)
    assert translated == "AHoj"
10351039

1040+
10361041
def test_translate_nonascii_from_byte_table():
    """Non-ASCII characters outside the 256-entry table pass through unchanged."""
    lowercase = bytes(string.ascii_lowercase, 'ascii')
    uppercase = bytes(string.ascii_uppercase, 'ascii')
    table = bytes.maketrans(lowercase, uppercase)

    result = "ačhřožj".translate(table)
    assert result == "AčHřOžJ"
10391044

1045+
10401046
def test_translate_from_long_byte_table():
    """A table longer than 256 entries is indexed directly by the character ordinal."""
    lowercase = bytes(string.ascii_lowercase, 'ascii')
    uppercase = bytes(string.ascii_uppercase, 'ascii')
    # repeating the table 30 times gives ordinals above 255 an entry as well
    table = bytes.maketrans(lowercase, uppercase) * 30

    result = 'ahoj453875287ščřžýáí'.translate(table)
    assert result == 'AHOJ453875287A\rY~ýáí'
10441050

1051+
10451052
def test_splitlines():
    """str.splitlines, called unbound, yields the expected number of lines."""
    expected_counts = (
        ("\n\n", 2),
        ("\n", 1),
        ("a\nb", 2),
    )
    for text, count in expected_counts:
        assert len(str.splitlines(text)) == count
10491056

1057+
10501058
def test_literals():
    # The literal deliberately contains the unrecognized escapes \[ and \];
    # the assertions below require the backslash to stay in the string.
    text = "hello\[world\]"
    assert len(text) == 14

    expected_chars = ((5, "\\"), (6, "["), (12, "\\"), (13, "]"))
    for position, expected in expected_chars:
        assert "hello\[world\]"[position] == expected
1065+
1066+
1067+
def test_strip_whitespace():
    """Check strip/lstrip/rstrip when no explicit character set is given.

    Covers the argument-less form, the explicit ``None`` argument, and the
    boundary cases of whitespace removal: empty and whitespace-only strings,
    the ASCII separator characters, and non-ASCII whitespace.
    """
    assert 'hello' == ' hello '.strip()
    assert 'hello ' == ' hello '.lstrip()
    assert ' hello' == ' hello '.rstrip()
    assert 'hello' == 'hello'.strip()

    b = ' \t\n\r\f\vabc \t\n\r\f\v'
    assert 'abc' == b.strip()
    assert 'abc \t\n\r\f\v' == b.lstrip()
    assert ' \t\n\r\f\vabc' == b.rstrip()

    # strip/lstrip/rstrip with None arg
    assert 'hello' == ' hello '.strip(None)
    assert 'hello ' == ' hello '.lstrip(None)
    assert ' hello' == ' hello '.rstrip(None)
    assert 'hello' == 'hello'.strip(None)

    # empty and whitespace-only inputs must collapse to '' for every variant
    for blank in ('', ' ', ' \t\n\r\f\v'):
        assert '' == blank.strip()
        assert '' == blank.lstrip()
        assert '' == blank.rstrip()

    # a single non-whitespace character must survive untouched
    assert 'x' == 'x'.strip()
    assert 'x' == ' x'.lstrip()
    assert 'x' == 'x '.rstrip()

    # the ASCII separators FS/GS/RS/US (0x1C-0x1F) count as whitespace for str
    sep = '\x1c\x1d\x1e\x1f'
    assert 'abc' == (sep + 'abc' + sep).strip()
    assert 'abc' + sep == (sep + 'abc' + sep).lstrip()
    assert sep + 'abc' == (sep + 'abc' + sep).rstrip()

    # whitespace above the ASCII range is removed by lstrip/rstrip as well
    wide = '\x85\xa0\u1680\u2003\u2028\u2029\u202f\u205f\u3000'
    assert 'abc' + wide == (wide + 'abc' + wide).lstrip()
    assert wide + 'abc' == (wide + 'abc' + wide).rstrip()

    # ZERO WIDTH SPACE (U+200B) is not whitespace and must be kept
    assert '\u200babc' == '\u200babc'.lstrip()
    assert 'abc\u200b' == 'abc\u200b'.rstrip()

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/StringBuiltins.java

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@
7979
import com.oracle.graal.python.builtins.objects.slice.PSlice.SliceInfo;
8080
import com.oracle.graal.python.builtins.objects.str.StringBuiltinsFactory.SpliceNodeGen;
8181
import com.oracle.graal.python.builtins.objects.str.StringBuiltinsFactory.StringLenNodeFactory;
82+
import com.oracle.graal.python.builtins.objects.str.StringUtils.StripKind;
8283
import com.oracle.graal.python.builtins.objects.tuple.PTuple;
8384
import com.oracle.graal.python.nodes.PNodeWithContext;
8485
import com.oracle.graal.python.nodes.SpecialMethodNames;
@@ -1364,7 +1365,7 @@ String strip(String self, String chars) {
13641365
}
13651366

13661367
@SuppressWarnings("unused")
1367-
@Specialization(guards = "isNoValue(chars)")
1368+
@Specialization
13681369
String strip(String self, PNone chars) {
13691370
return self.trim();
13701371
}
@@ -1381,10 +1382,9 @@ String rstrip(String self, String chars) {
13811382
}
13821383

13831384
@SuppressWarnings("unused")
1384-
@Specialization(guards = "isNoValue(chars)")
1385-
@TruffleBoundary
1385+
@Specialization
13861386
String rstrip(String self, PNone chars) {
1387-
return self.replaceAll("\\s+$", "");
1387+
return StringUtils.strip(self, StripKind.RIGHT);
13881388
}
13891389
}
13901390

@@ -1399,10 +1399,9 @@ String rstrip(String self, String chars) {
13991399
}
14001400

14011401
@SuppressWarnings("unused")
1402-
@Specialization(guards = "isNoValue(chars)")
1403-
@TruffleBoundary
1402+
@Specialization
14041403
String rstrip(String self, PNone chars) {
1405-
return self.replaceAll("^\\s+", "");
1404+
return StringUtils.strip(self, StripKind.LEFT);
14061405
}
14071406
}
14081407

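graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/StringUtils.java

Lines changed: 171 additions & 0 deletions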
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
/*
2+
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* The Universal Permissive License (UPL), Version 1.0
6+
*
7+
* Subject to the condition set forth below, permission is hereby granted to any
8+
* person obtaining a copy of this software, associated documentation and/or
9+
* data (collectively the "Software"), free of charge and under any and all
10+
* copyright rights in the Software, and any and all patent rights owned or
11+
* freely licensable by each licensor hereunder covering either (i) the
12+
* unmodified Software as contributed to or provided by such licensor, or (ii)
13+
* the Larger Works (as defined below), to deal in both
14+
*
15+
* (a) the Software, and
16+
*
17+
* (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
18+
* one is included with the Software each a "Larger Work" to which the Software
19+
* is contributed by such licensors),
20+
*
21+
* without restriction, including without limitation the rights to copy, create
22+
* derivative works of, display, perform, and distribute the Software and make,
23+
* use, sell, offer for sale, import, export, have made, and have sold the
24+
* Software and the Larger Work(s), and to sublicense the foregoing rights on
25+
* either these or other terms.
26+
*
27+
* This license is subject to the following condition:
28+
*
29+
* The above copyright notice and either this complete permission notice or at a
30+
* minimum a reference to the UPL must be included in all copies or substantial
31+
* portions of the Software.
32+
*
33+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
34+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
35+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
36+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
37+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
38+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
39+
* SOFTWARE.
40+
*/
41+
package com.oracle.graal.python.builtins.objects.str;
42+
43+
/**
 * Static helpers for classifying characters as whitespace or line breaks and
 * for removing whitespace from the ends of a string.
 */
public final class StringUtils {
    /** Selects which end(s) of the string {@link #strip(String, StripKind)} trims. */
    public enum StripKind {
        LEFT,
        RIGHT,
        BOTH
    }

    /**
     * Lookup table indexed by char value for the range 0..127; an entry of 1
     * marks the character as whitespace, 0 as anything else.
     */
    private static final int[] ASCII_WHITESPACE = new int[128];

    static {
        // 0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
        // 0x0C FORM FEED, 0x0D CARRIAGE RETURN
        for (int code = 0x0009; code <= 0x000D; code++) {
            ASCII_WHITESPACE[code] = 1;
        }
        // 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
        // 0x1F UNIT SEPARATOR, 0x20 SPACE
        for (int code = 0x001C; code <= 0x0020; code++) {
            ASCII_WHITESPACE[code] = 1;
        }
    }

    /**
     * Tells whether {@code ch} is one of the whitespace code points recognized
     * by this class, across the whole char range.
     *
     * @param ch the character to classify
     * @return {@code true} if {@code ch} is whitespace
     */
    public static boolean isUnicodeWhitespace(char ch) {
        // contiguous runs first: TAB..CR, FS..SPACE, EN QUAD..HAIR SPACE
        if (ch >= 0x0009 && ch <= 0x000D) {
            return true;
        }
        if (ch >= 0x001C && ch <= 0x0020) {
            return true;
        }
        if (ch >= 0x2000 && ch <= 0x200A) {
            return true;
        }
        // the remaining isolated code points
        return ch == 0x0085 || ch == 0x00A0 || ch == 0x1680 || ch == 0x2028 || ch == 0x2029 ||
                        ch == 0x202F || ch == 0x205F || ch == 0x3000;
    }

    /**
     * Tells whether {@code ch} is one of the line-break code points recognized
     * by this class.
     *
     * @param ch the character to classify
     * @return {@code true} if {@code ch} is a line break
     */
    public static boolean isUnicodeLineBreak(char ch) {
        if (ch >= 0x000A && ch <= 0x000D) {
            return true;
        }
        if (ch >= 0x001C && ch <= 0x001E) {
            return true;
        }
        return ch == 0x0085 || ch == 0x2028 || ch == 0x2029;
    }

    /**
     * Whitespace test with a table lookup for characters below 128 and a
     * fallback to {@link #isUnicodeWhitespace(char)} for everything else.
     *
     * @param ch the character to classify
     * @return {@code true} if {@code ch} is whitespace
     */
    public static boolean isSpace(char ch) {
        return ch < 128 ? ASCII_WHITESPACE[ch] == 1 : isUnicodeWhitespace(ch);
    }

    /**
     * Removes whitespace, as defined by {@link #isSpace(char)}, from the start
     * and/or the end of {@code str}.
     *
     * @param str the string to trim
     * @param stripKind which end(s) to trim
     * @return the substring of {@code str} left after trimming
     */
    public static String strip(String str, StripKind stripKind) {
        final int length = str.length();

        // index of the first character to keep
        int start = 0;
        if (stripKind != StripKind.RIGHT) {
            while (start < length && isSpace(str.charAt(start))) {
                start++;
            }
        }

        // exclusive end index; never moves below start, so an all-whitespace
        // input yields the empty string
        int end = length;
        if (stripKind != StripKind.LEFT) {
            while (end > start && isSpace(str.charAt(end - 1))) {
                end--;
            }
        }

        return str.substring(start, end);
    }
}

0 commit comments

Comments
 (0)