Commit cefa6c7

Merge branch 'master' into documentation-introduction
2 parents b6de168 + ec00965 commit cefa6c7

File tree

10 files changed: +307 -19 lines changed

.bumpversion.cfg

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 1.5.1
+current_version = 1.5.2
 commit = True
 tag = True
 tag_name = v{new_version}

NEWS

Lines changed: 11 additions & 0 deletions

@@ -3,6 +3,16 @@
 History
 -------

+1.5.2 (2019-08-09)
+~~~~~~~~~~~~~~~~~~
+
+* ``Selector.remove_namespaces`` received a significant performance improvement
+* The value of ``data`` within the printable representation of a selector
+  (``repr(selector)``) now ends in ``...`` when truncated, to make the
+  truncation obvious.
+* Minor documentation improvements.
+
+
 1.5.1 (2018-10-25)
 ~~~~~~~~~~~~~~~~~~


@@ -12,6 +22,7 @@ History
 * documentation improvements;
 * Python 3.7 tests are run on CI; other test improvements.

+
 1.5.0 (2018-07-04)
 ~~~~~~~~~~~~~~~~~~

README.rst

Lines changed: 3 additions & 3 deletions

@@ -15,8 +15,8 @@ Parsel
     :alt: Coverage report


-Parsel is a BSD-licensed Python_ library to extract data from HTML_ and XML_
-using XPath_ and CSS_ selectors, optionally combined with
+Parsel is a BSD-licensed Python_ library to extract and remove data from HTML_
+and XML_ using XPath_ and CSS_ selectors, optionally combined with
 `regular expressions`_.

 Find the Parsel online documentation at https://parsel.readthedocs.org.

@@ -30,7 +30,7 @@ Find the Parsel online documentation at https://parsel.readthedocs.org.
         <ul>
             <li><a href="http://example.com">Link 1</a></li>
             <li><a href="http://scrapy.org">Link 2</a></li>
-        </ul
+        </ul>
     </body>
 </html>""")
 >>> selector.css('h1::text').get()

docs/usage.rst

Lines changed: 112 additions & 0 deletions

@@ -385,6 +385,41 @@ XPath specification.
 .. _Location Paths: https://www.w3.org/TR/xpath#location-paths


+Removing elements
+-----------------
+
+If for any reason you need to remove elements based on a Selector or
+a SelectorList, you can do it with the ``remove()`` method, available for both
+classes.
+
+.. warning:: this is a destructive action and cannot be undone. The original
+   content of the selector is removed from the elements tree. This could be useful
+   when trying to reduce the memory footprint of Responses.
+
+Example removing an ad from a blog post:
+
+    >>> from parsel import Selector
+    >>> doc = u"""
+    ... <article>
+    ...     <div class="row">Content paragraph...</div>
+    ...     <div class="row">
+    ...         <div class="ad">
+    ...             Ad content...
+    ...             <a href="http://...">Link</a>
+    ...         </div>
+    ...     </div>
+    ...     <div class="row">More content...</div>
+    ... </article>
+    ... """
+    >>> sel = Selector(text=doc)
+    >>> sel.xpath('//div/text()').getall()
+    ['Content paragraph...', 'Ad content...', 'Link', 'More content...']
+    >>> sel.xpath('//div[@class="ad"]').remove()
+    >>> sel.xpath('//div//text()').getall()
+    ['Content paragraph...', 'More content...']
+    >>>
+
+
 Using EXSLT extensions
 ----------------------


@@ -695,6 +730,66 @@ you can just select by class using CSS and then switch to XPath when needed::
 This is cleaner than using the verbose XPath trick shown above. Just remember
 to use the ``.`` in the XPath expressions that will follow.

+
+Beware of how script and style tags differ from other tags
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+`Following the standard`__, the contents of ``script`` and ``style`` elements
+are parsed as plain text.
+
+__ https://www.w3.org/TR/html401/types.html#type-cdata
+
+This means that XML-like structures found within them, including comments, are
+all treated as part of the element text, and not as separate nodes.
+
+For example::
+
+    >>> from parsel import Selector
+    >>> selector = Selector(text="""
+    ... <script>
+    ...     text
+    ...     <!-- comment -->
+    ...     <br/>
+    ... </script>
+    ... <style>
+    ...     text
+    ...     <!-- comment -->
+    ...     <br/>
+    ... </style>
+    ... <div>
+    ...     text
+    ...     <!-- comment -->
+    ...     <br/>
+    ... </div>""")
+    >>> for tag in selector.xpath('//*[contains(text(), "text")]'):
+    ...     print(tag.xpath('name()').get())
+    ...     print(' Text: ' + (tag.xpath('text()').get() or ''))
+    ...     print(' Comment: ' + (tag.xpath('comment()').get() or ''))
+    ...     print(' Children: ' + ''.join(tag.xpath('*').getall()))
+    ...
+    script
+     Text:
+        text
+        <!-- comment -->
+        <br/>
+
+     Comment:
+     Children:
+    style
+     Text:
+        text
+        <!-- comment -->
+        <br/>
+
+     Comment:
+     Children:
+    div
+     Text:
+        text
+
+     Comment: <!-- comment -->
+     Children: <br>
+
 .. _old-extraction-api:

 extract() and extract_first()

@@ -745,6 +840,23 @@ are more predictable: ``.get()`` always returns a single result,
 ``.getall()`` always returns a list of all extracted results.


+Command-Line Interface Tools
+============================
+
+There are third-party tools that allow using Parsel from the command line:
+
+- `Parsel CLI <https://github.com/rmax/parsel-cli>`_ allows applying
+  Parsel selectors to the standard input. For example, you can apply a Parsel
+  selector to the output of cURL_.
+
+- `parselcli
+  <https://github.com/Granitosaurus/parsel-cli>`_ provides an interactive
+  shell that allows applying Parsel selectors to a remote URL or a local
+  file.
+
+.. _cURL: https://curl.haxx.se/
+
+
 .. _topics-selectors-ref:

 API reference
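
The contrast documented in the new "script and style" section can be checked by querying for the ``<br/>`` elements directly. The following is a minimal sketch, not taken from the commit; it assumes lxml's default HTML serialization, which renders the element as ``<br>``::

    >>> from parsel import Selector
    >>> selector = Selector(text="<script>text <br/></script><div>text <br/></div>")
    >>> selector.xpath('//script/br').getall()  # the <br/> inside <script> is plain text
    []
    >>> selector.xpath('//div/br').getall()     # the <br/> inside <div> is a real child node
    ['<br>']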

parsel/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@

 __author__ = 'Scrapy project'
 __email__ = '[email protected]'
-__version__ = '1.5.1'
+__version__ = '1.5.2'

 from parsel.selector import Selector, SelectorList  # NOQA
 from parsel.csstranslator import css2xpath  # NOQA

parsel/selector.py

Lines changed: 43 additions & 4 deletions

@@ -7,10 +7,18 @@
 import six
 from lxml import etree, html

-from .utils import flatten, iflatten, extract_regex
+from .utils import flatten, iflatten, extract_regex, shorten
 from .csstranslator import HTMLTranslator, GenericTranslator


+class CannotRemoveElementWithoutRoot(Exception):
+    pass
+
+
+class CannotRemoveElementWithoutParent(Exception):
+    pass
+
+
 class SafeXMLParser(etree.XMLParser):
     def __init__(self, *args, **kwargs):
         kwargs.setdefault('resolve_entities', False)

@@ -150,6 +158,13 @@ def attrib(self):
         else:
             return {}

+    def remove(self):
+        """
+        Remove matched nodes from the parent for each element in this list.
+        """
+        for x in self:
+            x.remove()
+

 class Selector(object):
     """

@@ -339,8 +354,32 @@ def remove_namespaces(self):
             for an in el.attrib.keys():
                 if an.startswith('{'):
                     el.attrib[an.split('}', 1)[1]] = el.attrib.pop(an)
-            # remove namespace declarations
-            etree.cleanup_namespaces(self.root)
+        # remove namespace declarations
+        etree.cleanup_namespaces(self.root)
+
+    def remove(self):
+        """
+        Remove matched nodes from the parent element.
+        """
+        try:
+            parent = self.root.getparent()
+        except AttributeError:
+            # 'str' object has no attribute 'getparent'
+            raise CannotRemoveElementWithoutRoot(
+                "The node you're trying to remove has no root, "
+                "are you trying to remove a pseudo-element? "
+                "Try to use 'li' as a selector instead of 'li::text' or "
+                "'//li' instead of '//li/text()', for example."
+            )
+
+        try:
+            parent.remove(self.root)
+        except AttributeError:
+            # 'NoneType' object has no attribute 'remove'
+            raise CannotRemoveElementWithoutParent(
+                "The node you're trying to remove has no parent, "
+                "are you trying to remove a root element?"
+            )

     @property
     def attrib(self):

@@ -358,6 +397,6 @@ def __bool__(self):
     __nonzero__ = __bool__

     def __str__(self):
-        data = repr(self.get()[:40])
+        data = repr(shorten(self.get(), width=40))
         return "<%s xpath=%r data=%s>" % (type(self).__name__, self._expr, data)
     __repr__ = __str__
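
A short sketch of how the new ``remove()`` API behaves (illustrative only; the serialized output assumes lxml's HTML output). ``SelectorList.remove()`` calls ``Selector.remove()`` on every match, so removed nodes no longer show up in later queries on the same document; per the error messages above, selecting a pseudo-element such as ``li::text`` raises ``CannotRemoveElementWithoutRoot``, and removing the root element raises ``CannotRemoveElementWithoutParent``::

    >>> from parsel import Selector
    >>> sel = Selector(text='<ul><li>one</li><li>two</li></ul>')
    >>> sel.css('li').getall()
    ['<li>one</li>', '<li>two</li>']
    >>> sel.css('li').remove()  # drops both <li> elements from the underlying tree
    >>> sel.css('li').getall()
    []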

parsel/utils.py

Lines changed: 12 additions & 1 deletion

@@ -80,4 +80,15 @@ def extract_regex(regex, text, replace_entities=True):
     strings = flatten(strings)
     if not replace_entities:
         return strings
-    return [w3lib_replace_entities(s, keep=['lt', 'amp']) for s in strings]
\ No newline at end of file
+    return [w3lib_replace_entities(s, keep=['lt', 'amp']) for s in strings]
+
+
+def shorten(text, width, suffix='...'):
+    """Truncate the given text to fit in the given width."""
+    if len(text) <= width:
+        return text
+    if width > len(suffix):
+        return text[:width-len(suffix)] + suffix
+    if width >= 0:
+        return suffix[len(suffix)-width:]
+    raise ValueError('width must be equal or greater than 0')
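
The new ``shorten()`` helper is what puts the ``...`` suffix on truncated ``repr`` output (see the NEWS entry and the ``parsel/selector.py`` change above). A quick sketch of its three branches, written as a doctest::

    >>> from parsel.utils import shorten
    >>> shorten('foobar', width=10)  # fits: returned unchanged
    'foobar'
    >>> shorten('foobar', width=5)   # truncated, the suffix marks the cut
    'fo...'
    >>> shorten('foobar', width=2)   # width smaller than the suffix itself
    '..'
    >>> shorten('foobar', width=-1)
    Traceback (most recent call last):
        ...
    ValueError: width must be equal or greater than 0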

setup.py

Lines changed: 3 additions & 2 deletions

@@ -27,7 +27,8 @@ def has_environment_marker_platform_impl_support():

 install_requires = [
     'w3lib>=1.19.0',
-    'lxml>=2.3',
+    'lxml;python_version!="3.4"',
+    'lxml<=4.3.5;python_version=="3.4"',
     'six>=1.5.2',
     'cssselect>=0.9'
 ]

@@ -41,7 +42,7 @@ def has_environment_marker_platform_impl_support():

 setup(
     name='parsel',
-    version='1.5.1',
+    version='1.5.2',
     description="Parsel is a library to extract data from HTML and XML using XPath and CSS selectors",
     long_description=readme + '\n\n' + history,
     author="Scrapy project",
