@@ -16,14 +16,14 @@

 >>> from nltk.tokenize import RegexpTokenizer
 >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
->>> tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
+>>> tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
 >>> tokenizer.tokenize(s) # doctest: +NORMALIZE_WHITESPACE
 ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

 A ``RegexpTokenizer`` can use its regexp to match delimiters instead:

->>> tokenizer = RegexpTokenizer('\s+', gaps=True)
+>>> tokenizer = RegexpTokenizer(r'\s+', gaps=True)
 >>> tokenizer.tokenize(s) # doctest: +NORMALIZE_WHITESPACE
 ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
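As an aside, not part of the diff: splitting on runs of whitespace with ``gaps=True`` is, to the best of my knowledge, exactly what ``nltk.tokenize.WhitespaceTokenizer`` does, so the two spellings below should produce the same tokens. A minimal sketch, reusing the doctest string ``s`` from the hunk above:

# Hedged sketch: WhitespaceTokenizer is assumed to behave like a
# RegexpTokenizer that treats runs of whitespace as gaps between tokens.
from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer

s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
assert WhitespaceTokenizer().tokenize(s) == RegexpTokenizer(r'\s+', gaps=True).tokenize(s)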
@@ -34,7 +34,7 @@
 The material between the tokens is discarded. For example,
 the following tokenizer selects just the capitalized words:

->>> capword_tokenizer = RegexpTokenizer('[A-Z]\w+')
+>>> capword_tokenizer = RegexpTokenizer(r'[A-Z]\w+')
 >>> capword_tokenizer.tokenize(s)
 ['Good', 'New', 'York', 'Please', 'Thanks']

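Another aside, not from the commit: ``RegexpTokenizer`` also provides ``span_tokenize()``, which yields ``(start, end)`` character offsets rather than substrings. A quick self-check under that assumption, reusing ``capword_tokenizer`` and ``s`` from the doctests above:

# Hedged sketch: the offsets from span_tokenize() are assumed to slice the
# original string back into exactly the tokens returned by tokenize().
spans = list(capword_tokenizer.span_tokenize(s))
assert [s[start:end] for start, end in spans] == capword_tokenizer.tokenize(s)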
@@ -50,7 +50,7 @@
 All of the regular expression tokenizers are also available as functions:

 >>> from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize
->>> regexp_tokenize(s, pattern='\w+|\$[\d\.]+|\S+') # doctest: +NORMALIZE_WHITESPACE
+>>> regexp_tokenize(s, pattern=r'\w+|\$[\d\.]+|\S+') # doctest: +NORMALIZE_WHITESPACE
 ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
 >>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
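For what it is worth (again, outside the diff), the function form is understood to be a thin convenience wrapper around the class, so the two calls below are expected to agree:

# Hedged sketch: regexp_tokenize(text, pattern) is assumed to build a
# RegexpTokenizer(pattern) and call .tokenize(text) on it.
from nltk.tokenize import RegexpTokenizer, regexp_tokenize

s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
pattern = r'\w+|\$[\d\.]+|\S+'
assert regexp_tokenize(s, pattern=pattern) == RegexpTokenizer(pattern).tokenize(s)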
@@ -77,7 +77,7 @@ class RegexpTokenizer(TokenizerI):
 A tokenizer that splits a string using a regular expression, which
 matches either the tokens or the separators between tokens.

->>> tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
+>>> tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

 :type pattern: str
 :param pattern: The pattern used to build this tokenizer.
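The point of the whole change set, as far as the hunks show, is to turn the regex patterns into raw strings: ``\w``, ``\$``, ``\d`` and ``\S`` are not valid Python escape sequences, so the un-prefixed literals trigger a DeprecationWarning (a SyntaxWarning on newer interpreters) at compile time, while the resulting pattern string, and therefore the tokenizer's behaviour, is unchanged. A small sketch outside the diff demonstrating that:

# Hedged sketch: compile the same assignment with and without the r prefix
# and record which warnings the byte-compiler emits.
import warnings

def escape_warnings(source):
    # Return the warning categories raised while compiling `source`.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        compile(source, "<demo>", "exec")
    return [w.category.__name__ for w in caught]

print(escape_warnings(r"pat = '\w+|\$[\d\.]+|\S+'"))   # DeprecationWarning or SyntaxWarning expected
print(escape_warnings(r"pat = r'\w+|\$[\d\.]+|\S+'"))  # expected to be empty

# The two literals denote the same string either way (note that this line
# itself emits the escape-sequence warning, which is exactly the point).
assert '\w+|\$[\d\.]+|\S+' == r'\w+|\$[\d\.]+|\S+'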