diff --git a/docs/usage.rst b/docs/usage.rst index 55e6a313..398b0fc7 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -359,6 +359,15 @@ Use it to extract just the first matching string:: >>> selector.xpath('//a[contains(@href, "image")]/text()').re_first(r'Name:\s*(.*)') 'My image 1 ' +You can also use compiled regular expressions with both methods:: + + >>> import re + >>> regex = re.compile(r'Name:\s*(.*)') + >>> selector.xpath('//a[contains(@href, "image")]/text()').re_first(regex) + 'My image 1 ' + +Both methods also accept regex flags through the ``flags`` argument. + .. _topics-selectors-relative-xpaths: Working with relative XPaths diff --git a/parsel/selector.py b/parsel/selector.py index 0a6530fc..26e9d9dd 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -119,7 +119,10 @@ def css(self, query: str) -> "SelectorList[_SelectorType]": return self.__class__(flatten([x.css(query) for x in self])) def re( - self, regex: Union[str, Pattern[str]], replace_entities: bool = True + self, + regex: Union[str, Pattern[str]], + replace_entities: bool = True, + flags: int = 0, ) -> List[str]: """ Call the ``.re()`` method for each element in this list and return @@ -129,8 +132,14 @@ def re( corresponding character (except for ``&`` and ``<``. Passing ``replace_entities`` as ``False`` switches off these replacements. + + It is possible to provide regex flags using the ``flags`` argument. They + will be applied only if the provided regex is not a compiled regular + expression. 
""" - return flatten([x.re(regex, replace_entities=replace_entities) for x in self]) + return flatten( + [x.re(regex, replace_entities=replace_entities, flags=flags) for x in self] + ) @typing.overload def re_first( @@ -138,6 +147,7 @@ def re_first( regex: Union[str, Pattern[str]], default: None = None, replace_entities: bool = True, + flags: int = 0, ) -> Optional[str]: pass @@ -147,6 +157,7 @@ def re_first( regex: Union[str, Pattern[str]], default: str, replace_entities: bool = True, + flags: int = 0, ) -> str: pass @@ -155,6 +166,7 @@ def re_first( regex: Union[str, Pattern[str]], default: Optional[str] = None, replace_entities: bool = True, + flags: int = 0, ) -> Optional[str]: """ Call the ``.re()`` method for the first element in this list and @@ -168,7 +180,7 @@ def re_first( replacements. """ for el in iflatten( - x.re(regex, replace_entities=replace_entities) for x in self + x.re(regex, replace_entities=replace_entities, flags=flags) for x in self ): return el return default @@ -358,21 +370,30 @@ def _css2xpath(self, query: str) -> Any: return self._csstranslator.css_to_xpath(query) def re( - self, regex: Union[str, Pattern[str]], replace_entities: bool = True + self, + regex: Union[str, Pattern[str]], + replace_entities: bool = True, + flags: int = 0, ) -> List[str]: """ Apply the given regex and return a list of unicode strings with the matches. ``regex`` can be either a compiled regular expression or a string which - will be compiled to a regular expression using ``re.compile(regex)``. + will be compiled to a regular expression using ``re.compile()``. By default, character entity references are replaced by their corresponding character (except for ``&`` and ``<``). Passing ``replace_entities`` as ``False`` switches off these replacements. + + It is possible to provide regex flags using the ``flags`` argument. They + will be applied only if the provided regex is not a compiled regular + expression. 
""" - return extract_regex(regex, self.get(), replace_entities=replace_entities) + return extract_regex( + regex, self.get(), replace_entities=replace_entities, flags=flags + ) @typing.overload def re_first( @@ -380,6 +401,7 @@ def re_first( regex: Union[str, Pattern[str]], default: None = None, replace_entities: bool = True, + flags: int = 0, ) -> Optional[str]: pass @@ -389,6 +411,7 @@ def re_first( regex: Union[str, Pattern[str]], default: str, replace_entities: bool = True, + flags: int = 0, ) -> str: pass @@ -397,6 +420,7 @@ def re_first( regex: Union[str, Pattern[str]], default: Optional[str] = None, replace_entities: bool = True, + flags: int = 0, ) -> Optional[str]: """ Apply the given regex and return the first unicode string which @@ -407,9 +431,14 @@ def re_first( corresponding character (except for ``&`` and ``<``). Passing ``replace_entities`` as ``False`` switches off these replacements. + + It is possible to provide regex flags using the ``flags`` argument. They + will be applied only if the provided regex is not a compiled regular + expression. 
""" return next( - iflatten(self.re(regex, replace_entities=replace_entities)), default + iflatten(self.re(regex, replace_entities=replace_entities, flags=flags)), + default, ) def get(self) -> str: diff --git a/parsel/utils.py b/parsel/utils.py index 94d27079..d9e4f991 100644 --- a/parsel/utils.py +++ b/parsel/utils.py @@ -57,15 +57,20 @@ def _is_listlike(x: Any) -> bool: def extract_regex( - regex: Union[str, Pattern[str]], text: str, replace_entities: bool = True + regex: Union[str, Pattern[str]], + text: str, + replace_entities: bool = True, + flags: int = 0, ) -> List[str]: """Extract a list of unicode strings from the given text/encoding using the following policies: + * if the regex is a string it will be compiled using the provided flags * if the regex contains a named group called "extract" that will be returned * if the regex contains multiple numbered groups, all those will be returned (flattened) * if the regex doesn't contain any group the entire regex matching is returned """ if isinstance(regex, str): - regex = re.compile(regex, re.UNICODE) + flags |= re.UNICODE + regex = re.compile(regex, flags) if "extract" in regex.groupindex: # named group diff --git a/tests/test_selector.py b/tests/test_selector.py index 75c0a1e0..97ee103b 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -318,7 +318,7 @@ def test_re_first(self) -> None: self.assertEqual(sel.re_first(r"foo"), None) self.assertEqual(sel.re_first(r"foo", default="bar"), "bar") - def test_extract_first_re_default(self) -> None: + def test_re_first_default(self) -> None: """Test if re_first() returns default value when no results found""" body = '