Skip to content

Commit f36eea8

Browse files
committed
[GR-13919] improve the (r/l)strip string nodes for whitespace removal
PullRequest: graalpython/442
2 parents 6544f55 + 38d1917 commit f36eea8

File tree

3 files changed

+253
-14
lines changed

3 files changed

+253
-14
lines changed

graalpython/com.oracle.graal.python.test/src/tests/test_string.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ def __init__(self, value):
1313
def __index__(self):
1414
return self.value
1515

16+
1617
def test_find():
1718
assert "teststring".find("test") == 0
1819
assert "teststring".find("string") == 4
@@ -80,6 +81,7 @@ def test_rfind():
8081
assert s.rfind('ahoj', 16, 20) == 16
8182
assert s.rfind('ahoj', 16, 19) == -1
8283

84+
8385
def test_format():
8486
assert "{}.{}".format("part1", "part2") == "part1.part2"
8587
assert "{0}.{1}".format("part1", "part2") == "part1.part2"
@@ -1023,34 +1025,74 @@ def test_translate():
10231025
else:
10241026
assert False, "should raise"
10251027

1028+
10261029
def test_translate_from_byte_table():
10271030
table = bytes.maketrans(bytes(string.ascii_lowercase, 'ascii'), bytes(string.ascii_uppercase, 'ascii'))
10281031
assert "ahoj".translate(table) == "AHOJ"
10291032
assert "ahoj".translate(bytearray(table)) == "AHOJ"
10301033
assert "ahoj".translate(memoryview(table)) == "AHOJ"
10311034

1035+
10321036
def test_tranlslate_from_short_table():
10331037
table = b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`ABCDEFGH'
10341038
assert "ahoj".translate(table) == "AHoj"
10351039

1040+
10361041
def test_translate_nonascii_from_byte_table():
10371042
table = bytes.maketrans(bytes(string.ascii_lowercase, 'ascii'), bytes(string.ascii_uppercase, 'ascii'))
10381043
assert "ačhřožj".translate(table) == "AčHřOžJ"
10391044

1045+
10401046
def test_translate_from_long_byte_table():
10411047
table = bytes.maketrans(bytes(string.ascii_lowercase, 'ascii'), bytes(string.ascii_uppercase, 'ascii'))
10421048
table *= 30
10431049
assert 'ahoj453875287ščřžýáí'.translate(table) == 'AHOJ453875287A\rY~ýáí'
10441050

1051+
10451052
def test_splitlines():
10461053
assert len(str.splitlines("\n\n")) == 2
10471054
assert len(str.splitlines("\n")) == 1
10481055
assert len(str.splitlines("a\nb")) == 2
10491056

1057+
10501058
def test_literals():
10511059
s = "hello\[world\]"
10521060
assert len(s) == 14
10531061
assert "hello\[world\]"[5] == "\\"
10541062
assert "hello\[world\]"[6] == "["
10551063
assert "hello\[world\]"[12] == "\\"
10561064
assert "hello\[world\]"[13] == "]"
1065+
1066+
1067+
def test_strip_whitespace():
    """Check strip/lstrip/rstrip when whitespace is removed (no argument, or None)."""
    padded = ' hello '

    # no-argument form
    assert 'hello' == padded.strip()
    assert 'hello ' == padded.lstrip()
    assert ' hello' == padded.rstrip()
    assert 'hello' == 'hello'.strip()

    # space, tab, newline, carriage return, form feed and vertical tab on both ends
    mixed = ' \t\n\r\f\vabc \t\n\r\f\v'
    assert 'abc' == mixed.strip()
    assert 'abc \t\n\r\f\v' == mixed.lstrip()
    assert ' \t\n\r\f\vabc' == mixed.rstrip()

    # strip/lstrip/rstrip with None arg
    assert 'hello' == padded.strip(None)
    assert 'hello ' == padded.lstrip(None)
    assert ' hello' == padded.rstrip(None)
    assert 'hello' == 'hello'.strip(None)
1083+
1084+
1085+
def test_strip_with_sep():
    """Check strip/lstrip/rstrip when an explicit set of characters is given."""
    # strip/lstrip/rstrip with str arg
    wrapped = 'xyzzyhelloxyzzy'
    assert 'hello' == wrapped.strip('xyz')
    assert 'helloxyzzy' == wrapped.lstrip('xyz')
    assert 'xyzzyhello' == wrapped.rstrip('xyz')
    assert 'hello' == 'hello'.strip('xyz')
    assert '' == 'mississippi'.strip('mississippi')

    # only trim the start and end; does not strip internal characters
    assert 'mississipp' == 'mississippi'.strip('i')

    # NOTE(review): assertRaises is a helper defined elsewhere in this file;
    # presumably it calls the named method on 'hello' with the extra arguments.
    for method_name in ('strip', 'lstrip', 'rstrip'):
        assertRaises(TypeError, 'hello', method_name, 42, 42)

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/StringBuiltins.java

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,6 @@
5757
import java.nio.charset.CodingErrorAction;
5858
import java.util.Arrays;
5959
import java.util.List;
60-
import java.util.regex.Pattern;
6160

6261
import com.oracle.graal.python.builtins.Builtin;
6362
import com.oracle.graal.python.builtins.CoreFunctions;
@@ -79,6 +78,7 @@
7978
import com.oracle.graal.python.builtins.objects.slice.PSlice.SliceInfo;
8079
import com.oracle.graal.python.builtins.objects.str.StringBuiltinsFactory.SpliceNodeGen;
8180
import com.oracle.graal.python.builtins.objects.str.StringBuiltinsFactory.StringLenNodeFactory;
81+
import com.oracle.graal.python.builtins.objects.str.StringUtils.StripKind;
8282
import com.oracle.graal.python.builtins.objects.tuple.PTuple;
8383
import com.oracle.graal.python.nodes.PNodeWithContext;
8484
import com.oracle.graal.python.nodes.SpecialMethodNames;
@@ -1358,13 +1358,12 @@ public String doReplace(String self, String old, String with, int maxsplit) {
13581358
@TypeSystemReference(PythonArithmeticTypes.class)
13591359
public abstract static class StripNode extends PythonBuiltinNode {
13601360
@Specialization
1361-
@TruffleBoundary
13621361
String strip(String self, String chars) {
1363-
return self.replaceAll("^[" + Pattern.quote(chars) + "]+", "").replaceAll("[" + Pattern.quote(chars) + "]+$", "");
1362+
return StringUtils.strip(self, chars, StripKind.BOTH);
13641363
}
13651364

13661365
@SuppressWarnings("unused")
1367-
@Specialization(guards = "isNoValue(chars)")
1366+
@Specialization
13681367
String strip(String self, PNone chars) {
13691368
return self.trim();
13701369
}
@@ -1375,16 +1374,14 @@ String strip(String self, PNone chars) {
13751374
@TypeSystemReference(PythonArithmeticTypes.class)
13761375
public abstract static class RStripNode extends PythonBuiltinNode {
13771376
@Specialization
1378-
@TruffleBoundary
13791377
String rstrip(String self, String chars) {
1380-
return self.replaceAll("[" + Pattern.quote(chars) + "]+$", "");
1378+
return StringUtils.strip(self, chars, StripKind.RIGHT);
13811379
}
13821380

13831381
@SuppressWarnings("unused")
1384-
@Specialization(guards = "isNoValue(chars)")
1385-
@TruffleBoundary
1382+
@Specialization
13861383
String rstrip(String self, PNone chars) {
1387-
return self.replaceAll("\\s+$", "");
1384+
return StringUtils.strip(self, StripKind.RIGHT);
13881385
}
13891386
}
13901387

@@ -1393,16 +1390,14 @@ String rstrip(String self, PNone chars) {
13931390
@TypeSystemReference(PythonArithmeticTypes.class)
13941391
public abstract static class LStripNode extends PythonBuiltinNode {
13951392
@Specialization
1396-
@TruffleBoundary
13971393
String rstrip(String self, String chars) {
1398-
return self.replaceAll("^[" + Pattern.quote(chars) + "]+", "");
1394+
return StringUtils.strip(self, chars, StripKind.LEFT);
13991395
}
14001396

14011397
@SuppressWarnings("unused")
1402-
@Specialization(guards = "isNoValue(chars)")
1403-
@TruffleBoundary
1398+
@Specialization
14041399
String rstrip(String self, PNone chars) {
1405-
return self.replaceAll("^\\s+", "");
1400+
return StringUtils.strip(self, StripKind.LEFT);
14061401
}
14071402
}
14081403

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
/*
2+
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* The Universal Permissive License (UPL), Version 1.0
6+
*
7+
* Subject to the condition set forth below, permission is hereby granted to any
8+
* person obtaining a copy of this software, associated documentation and/or
9+
* data (collectively the "Software"), free of charge and under any and all
10+
* copyright rights in the Software, and any and all patent rights owned or
11+
* freely licensable by each licensor hereunder covering either (i) the
12+
* unmodified Software as contributed to or provided by such licensor, or (ii)
13+
* the Larger Works (as defined below), to deal in both
14+
*
15+
* (a) the Software, and
16+
*
17+
* (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
18+
* one is included with the Software each a "Larger Work" to which the Software
19+
* is contributed by such licensors),
20+
*
21+
* without restriction, including without limitation the rights to copy, create
22+
* derivative works of, display, perform, and distribute the Software and make,
23+
* use, sell, offer for sale, import, export, have made, and have sold the
24+
* Software and the Larger Work(s), and to sublicense the foregoing rights on
25+
* either these or other terms.
26+
*
27+
* This license is subject to the following condition:
28+
*
29+
* The above copyright notice and either this complete permission notice or at a
30+
* minimum a reference to the UPL must be included in all copies or substantial
31+
* portions of the Software.
32+
*
33+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
34+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
35+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
36+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
37+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
38+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
39+
* SOFTWARE.
40+
*/
41+
package com.oracle.graal.python.builtins.objects.str;
42+
43+
/**
 * Static helpers for whitespace classification and for stripping characters from
 * the ends of a string.
 */
public final class StringUtils {
    /** Selects which end(s) of the string a strip operation trims. */
    public enum StripKind {
        LEFT,
        RIGHT,
        BOTH
    }

    /**
     * Lookup table indexed by a code unit in the range 0..127; an entry of 1 marks
     * that code unit as whitespace.
     */
    private static final int[] ASCII_WHITESPACE = {
                    0, 0, 0, 0, 0, 0, 0, 0,
                    /* case 0x0009: * CHARACTER TABULATION */
                    /* case 0x000A: * LINE FEED */
                    /* case 0x000B: * LINE TABULATION */
                    /* case 0x000C: * FORM FEED */
                    /* case 0x000D: * CARRIAGE RETURN */
                    0, 1, 1, 1, 1, 1, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0,
                    /* case 0x001C: * FILE SEPARATOR */
                    /* case 0x001D: * GROUP SEPARATOR */
                    /* case 0x001E: * RECORD SEPARATOR */
                    /* case 0x001F: * UNIT SEPARATOR */
                    0, 0, 0, 0, 1, 1, 1, 1,
                    /* case 0x0020: * SPACE */
                    1, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0,

                    0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0
    };

    private StringUtils() {
        // static utility holder; not meant to be instantiated
    }

    /**
     * Tells whether {@code ch} is one of the whitespace characters enumerated below.
     *
     * @param ch the UTF-16 code unit to classify
     * @return {@code true} if {@code ch} is listed as whitespace
     */
    public static boolean isUnicodeWhitespace(char ch) {
        switch (ch) {
            case 0x0009:
            case 0x000A:
            case 0x000B:
            case 0x000C:
            case 0x000D:
            case 0x001C:
            case 0x001D:
            case 0x001E:
            case 0x001F:
            case 0x0020:
            case 0x0085:
            case 0x00A0:
            case 0x1680:
            case 0x2000:
            case 0x2001:
            case 0x2002:
            case 0x2003:
            case 0x2004:
            case 0x2005:
            case 0x2006:
            case 0x2007:
            case 0x2008:
            case 0x2009:
            case 0x200A:
            case 0x2028:
            case 0x2029:
            case 0x202F:
            case 0x205F:
            case 0x3000:
                return true;
            default:
                return false;
        }
    }

    /**
     * Tells whether {@code ch} is one of the line-break characters enumerated below.
     *
     * @param ch the UTF-16 code unit to classify
     * @return {@code true} if {@code ch} is listed as a line break
     */
    public static boolean isUnicodeLineBreak(char ch) {
        switch (ch) {
            case 0x000A:
            case 0x000B:
            case 0x000C:
            case 0x000D:
            case 0x001C:
            case 0x001D:
            case 0x001E:
            case 0x0085:
            case 0x2028:
            case 0x2029:
                return true;
            default:
                return false;
        }
    }

    /**
     * Tells whether {@code ch} counts as whitespace for stripping. Code units below
     * 128 are answered from the lookup table; everything else is delegated to
     * {@link #isUnicodeWhitespace(char)}.
     *
     * @param ch the UTF-16 code unit to classify
     * @return {@code true} if {@code ch} is whitespace
     */
    public static boolean isSpace(char ch) {
        if (ch < 128) {
            return ASCII_WHITESPACE[ch] == 1;
        }
        return isUnicodeWhitespace(ch);
    }

    /**
     * Removes whitespace (as defined by {@link #isSpace(char)}) from the selected
     * end(s) of {@code str}.
     *
     * @param str the string to trim
     * @param stripKind which end(s) to trim
     * @return the trimmed substring; empty if {@code str} holds only whitespace and
     *         the trimmed side(s) consume it entirely
     */
    public static String strip(String str, StripKind stripKind) {
        int start = 0;
        int end = str.length();

        if (stripKind != StripKind.RIGHT) {
            while (start < end && isSpace(str.charAt(start))) {
                start++;
            }
        }

        if (stripKind != StripKind.LEFT) {
            // never walk back past 'start', so an all-whitespace string yields ""
            while (end > start && isSpace(str.charAt(end - 1))) {
                end--;
            }
        }

        return str.substring(start, end);
    }

    /**
     * Removes from the selected end(s) of {@code str} every leading/trailing code
     * point that occurs in {@code chars}.
     *
     * <p>
     * Matching is done per Unicode code point rather than per UTF-16 code unit, so a
     * supplementary character in {@code str} is never cut in half just because it
     * shares a surrogate with some character in {@code chars}.
     *
     * @param str the string to trim
     * @param chars the set of characters to remove, given as a string
     * @param stripKind which end(s) to trim
     * @return the trimmed substring
     */
    public static String strip(String str, String chars, StripKind stripKind) {
        int start = 0;
        int end = str.length();
        // TODO: cpython uses a bloom filter to skip chars that are not in the sep list,
        // to avoid the linear search in chars
        if (stripKind != StripKind.RIGHT) {
            while (start < end) {
                int codePoint = str.codePointAt(start);
                if (!containsCodePoint(chars, codePoint)) {
                    break;
                }
                start += Character.charCount(codePoint);
            }
        }

        if (stripKind != StripKind.LEFT) {
            // 'start' always sits on a code point boundary, so codePointBefore never
            // pairs a surrogate with one that lies before 'start'
            while (end > start) {
                int codePoint = str.codePointBefore(end);
                if (!containsCodePoint(chars, codePoint)) {
                    break;
                }
                end -= Character.charCount(codePoint);
            }
        }

        return str.substring(start, end);
    }

    /** Linear search for {@code codePoint} among the code points of {@code chars}. */
    private static boolean containsCodePoint(String chars, int codePoint) {
        int index = 0;
        int length = chars.length();
        while (index < length) {
            int candidate = chars.codePointAt(index);
            if (candidate == codePoint) {
                return true;
            }
            index += Character.charCount(candidate);
        }
        return false;
    }
}

0 commit comments

Comments
 (0)