
Commit 8d6feb4

Merge pull request #167 from Gallaecio/doctest
Run the documentation tests as part of the test suite
2 parents 1e84677 + 1c15254 commit 8d6feb4
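
With this change the code examples in the documentation are collected and run as part of the test suite. A minimal sketch of a local run, assuming pytest plus the packages from tests/requirements.txt are installed and the command is issued from the repository root (tox runs the equivalent py.test command shown further below):

    # Hypothetical local invocation mirroring the updated tox command.
    # "docs" is collected through docs/conftest.py (Sybil), "parsel" picks up
    # module doctests via --doctest-modules from pytest.ini, "tests" is the unit suite.
    import sys

    import pytest

    sys.exit(pytest.main(["docs", "parsel", "tests"]))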

File tree

8 files changed (+3556 / -78 lines)


docs/_static/multiroot.html

Lines changed: 2831 additions & 0 deletions
Large diffs are not rendered by default.

docs/_static/python-insider.xml

Lines changed: 583 additions & 0 deletions
Large diffs are not rendered by default.

docs/_static/selectors-sample1.html

Lines changed: 0 additions & 1 deletion
@@ -13,4 +13,3 @@
   </div>
 </body>
 </html>
-

docs/conftest.py

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+import os
+from doctest import ELLIPSIS, NORMALIZE_WHITESPACE
+from sys import version_info
+
+from sybil import Sybil
+from sybil.parsers.codeblock import CodeBlockParser
+from sybil.parsers.doctest import DocTestParser
+from sybil.parsers.skip import skip
+
+from parsel import Selector
+
+
+def load_selector(filename, **kwargs):
+    input_path = os.path.join(os.path.dirname(__file__), '_static', filename)
+    with open(input_path) as input_file:
+        return Selector(text=input_file.read(), **kwargs)
+
+
+def setup(namespace):
+    namespace['load_selector'] = load_selector
+
+
+if version_info >= (3,):
+    pytest_collect_file = Sybil(
+        parsers=[
+            DocTestParser(optionflags=ELLIPSIS | NORMALIZE_WHITESPACE),
+            CodeBlockParser(future_imports=['print_function']),
+            skip,
+        ],
+        pattern='*.rst',
+        setup=setup,
+    ).pytest()
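
For context, the setup hook above injects load_selector into the namespace of every example collected from docs/*.rst, so the documentation can build selectors from the fixtures bundled under docs/_static instead of downloading pages. A rough standalone approximation of what the helper does (the repository-root-relative path below is an assumption for illustration; the real helper resolves the path next to this conftest):

    # Illustration only, not part of the commit: an approximation of the
    # load_selector helper that Sybil exposes to the documentation examples.
    import os

    from parsel import Selector

    def load_selector(filename, **kwargs):
        path = os.path.join("docs", "_static", filename)  # assumed: run from the repo root
        with open(path) as fixture:
            return Selector(text=fixture.read(), **kwargs)

    selector = load_selector("selectors-sample1.html")
    print(selector.xpath("//title/text()").get())  # query it like the downloaded page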

docs/usage.rst

Lines changed: 106 additions & 74 deletions
@@ -74,13 +74,21 @@ For the sake of completeness, here's its full HTML code:
 
 .. highlight:: python
 
-So, let's download that page and create a selector for it::
+So, let's download that page and create a selector for it:
 
->>> import requests
->>> from parsel import Selector
->>> url = 'http://parsel.readthedocs.org/en/latest/_static/selectors-sample1.html'
->>> text = requests.get(url).text
->>> selector = Selector(text=text)
+.. skip: start
+
+>>> import requests
+>>> from parsel import Selector
+>>> url = 'http://parsel.readthedocs.org/en/latest/_static/selectors-sample1.html'
+>>> text = requests.get(url).text
+>>> selector = Selector(text=text)
+
+.. skip: end
+
+.. invisible-code-block: python
+
+    selector = load_selector('selectors-sample1.html')
 
 Since we're dealing with HTML, the default type for Selector, we don't need
 to specify the `type` argument.
@@ -279,7 +287,6 @@ too. Here's an example::
 >>> for index, link in enumerate(links):
 ...     args = (index, link.xpath('@href').get(), link.xpath('img/@src').get())
 ...     print('Link number %d points to url %r and image %r' % args)
-
 Link number 0 points to url 'image1.html' and image 'image1_thumb.jpg'
 Link number 1 points to url 'image2.html' and image 'image2_thumb.jpg'
 Link number 2 points to url 'image3.html' and image 'image3_thumb.jpg'
@@ -341,18 +348,18 @@ Here's an example used to extract image names from the :ref:`HTML code
 <topics-selectors-htmlcode>` above::
 
 >>> selector.xpath('//a[contains(@href, "image")]/text()').re(r'Name:\s*(.*)')
-['My image 1',
- 'My image 2',
- 'My image 3',
- 'My image 4',
- 'My image 5']
+['My image 1 ',
+ 'My image 2 ',
+ 'My image 3 ',
+ 'My image 4 ',
+ 'My image 5 ']
 
 There's an additional helper reciprocating ``.get()`` (and its
 alias ``.extract_first()``) for ``.re()``, named ``.re_first()``.
 Use it to extract just the first matching string::
 
 >>> selector.xpath('//a[contains(@href, "image")]/text()').re_first(r'Name:\s*(.*)')
-'My image 1'
+'My image 1 '
 
 .. _topics-selectors-relative-xpaths:
 
@@ -419,11 +426,10 @@ Example removing an ad from a blog post:
 ... """
 >>> sel = Selector(text=doc)
 >>> sel.xpath('//div/text()').getall()
-['Content paragraph...', 'Ad content...', 'Link', 'More content...']
+['Content paragraph...', '\n ', '\n Ad content...\n ', '\n ', '\n ', 'More content...']
 >>> sel.xpath('//div[@class="ad"]').remove()
 >>> sel.xpath('//div//text()').getall()
 ['Content paragraph...', 'More content...']
->>>
 
 
 Using EXSLT extensions
@@ -463,7 +469,7 @@ Example selecting links in list item with a "class" attribute ending with a digi
 >>> sel = Selector(text=doc)
 >>> sel.xpath('//li//@href').getall()
 ['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']
->>> sel.xpath('//li[re:test(@class, "item-\d$")]//@href').getall()
+>>> sel.xpath(r'//li[re:test(@class, "item-\d$")]//@href').getall()
 ['link1.html', 'link2.html', 'link4.html', 'link5.html']
 >>>
 
@@ -539,29 +545,27 @@ with groups of itemscopes and corresponding itemprops::
 ...         .//*[@itemscope]/*/@itemprop)''')
 ...     print("    properties: %s" % (props.getall()))
 ...     print("")
-
 current scope: ['http://schema.org/Product']
     properties: ['name', 'aggregateRating', 'offers', 'description', 'review', 'review']
-
+<BLANKLINE>
 current scope: ['http://schema.org/AggregateRating']
     properties: ['ratingValue', 'reviewCount']
-
+<BLANKLINE>
 current scope: ['http://schema.org/Offer']
     properties: ['price', 'availability']
-
+<BLANKLINE>
 current scope: ['http://schema.org/Review']
     properties: ['name', 'author', 'datePublished', 'reviewRating', 'description']
-
+<BLANKLINE>
 current scope: ['http://schema.org/Rating']
     properties: ['worstRating', 'ratingValue', 'bestRating']
-
+<BLANKLINE>
 current scope: ['http://schema.org/Review']
     properties: ['name', 'author', 'datePublished', 'reviewRating', 'description']
-
+<BLANKLINE>
 current scope: ['http://schema.org/Rating']
     properties: ['worstRating', 'ratingValue', 'bestRating']
 
->>>
 
 Here we first iterate over ``itemscope`` elements, and for each one,
 we look for all ``itemprops`` elements and exclude those that are themselves
@@ -680,16 +684,16 @@ Example::
 
 >>> from parsel import Selector
 >>> sel = Selector(text="""
-....: <ul class="list">
-....: <li>1</li>
-....: <li>2</li>
-....: <li>3</li>
-....: </ul>
-....: <ul class="list">
-....: <li>4</li>
-....: <li>5</li>
-....: <li>6</li>
-....: </ul>""")
+... <ul class="list">
+... <li>1</li>
+... <li>2</li>
+... <li>3</li>
+... </ul>
+... <ul class="list">
+... <li>4</li>
+... <li>5</li>
+... <li>6</li>
+... </ul>""")
 >>> xp = lambda x: sel.xpath(x).getall()
 
 This gets all first ``<li>`` elements under whatever it is its parent::
@@ -752,21 +756,21 @@ For example::
 
 >>> from parsel import Selector
 >>> selector = Selector(text="""
-.... <script>
-.... <!-- comment -->
-.... text
-.... <br/>
-.... </script>
-.... <style>
-.... <!-- comment -->
-.... text
-.... <br/>
-.... </style>
-.... <div>
-.... <!-- comment -->
-.... text
-.... <br/>
-.... </div>""")
+... <script>
+... text
+... <!-- comment -->
+... <br/>
+... </script>
+... <style>
+... text
+... <!-- comment -->
+... <br/>
+... </style>
+... <div>
+... text
+... <!-- comment -->
+... <br/>
+... </div>""")
 >>> for tag in selector.xpath('//*[contains(text(), "text")]'):
 ...     print(tag.xpath('name()').get())
 ...     print('    Text: ' + (tag.xpath('text()').get() or ''))
@@ -778,21 +782,21 @@ For example::
 text
 <!-- comment -->
 <br/>
-
+<BLANKLINE>
     Comment:
     Children:
 style
     Text:
 text
 <!-- comment -->
 <br/>
-
+<BLANKLINE>
     Comment:
     Children:
 div
     Text:
 text
-
+<BLANKLINE>
     Comment: <!-- comment -->
     Children: <br>
 
@@ -811,6 +815,10 @@ and readable code.
 
 The following examples show how these methods map to each other.
 
+.. invisible-code-block: python
+
+    selector = load_selector('selectors-sample1.html')
+
 1. ``SelectorList.get()`` is the same as ``SelectorList.extract_first()``::
 
 >>> selector.css('a::attr(href)').get()
@@ -852,10 +860,14 @@ Using CSS selectors in multi-root documents
 Some webpages may have multiple root elements. It can happen, for example, when
 a webpage has broken code, such as missing closing tags.
 
-You can use XPath to determine if a page has multiple root elements::
+.. invisible-code-block: python
 
->>> len(selector.xpath('/*')) > 1
-True
+    selector = load_selector('multiroot.html')
+
+You can use XPath to determine if a page has multiple root elements:
+
+>>> len(selector.xpath('/*')) > 1
+True
 
 CSS selectors only work on the first root element, because the first root
 element is always used as the starting current element, and CSS selectors do
@@ -954,12 +966,20 @@ method for that.
 
 Let's show an example that illustrates this with the Python Insider blog atom feed.
 
-Let's download the atom feed using `requests`_ and create a selector::
+Let's download the atom feed using `requests`_ and create a selector:
 
->>> import requests
->>> from parsel import Selector
->>> text = requests.get('https://feeds.feedburner.com/PythonInsider').text
->>> sel = Selector(text=text, type='xml')
+.. skip: start
+
+>>> import requests
+>>> from parsel import Selector
+>>> text = requests.get('https://feeds.feedburner.com/PythonInsider').text
+>>> sel = Selector(text=text, type='xml')
+
+.. skip: end
+
+.. invisible-code-block: python
+
+    sel = load_selector('python-insider.xml', type='xml')
 
 This is how the file starts:
 
@@ -993,9 +1013,9 @@ directly by their names::
 
 >>> sel.remove_namespaces()
 >>> sel.xpath("//link")
-[<Selector xpath='//link' data='<link rel="alternate" type="text/html" h'>,
- <Selector xpath='//link' data='<link rel="next" type="application/atom+'>,
- ...
+[<Selector xpath='//link' data='<link rel="alternate" type="text/html...'>,
+ <Selector xpath='//link' data='<link rel="next" type="application/at...'>,
+ ...]
 
 If you wonder why the namespace removal procedure isn't called always by default
 instead of having to call it manually, this is because of two reasons, which, in order
@@ -1020,29 +1040,37 @@ Ad-hoc namespaces references
 references along with the query, through a ``namespaces`` argument,
 with the prefixes you declare being used in your XPath or CSS query.
 
-Let's use the same Python Insider Atom feed::
+Let's use the same Python Insider Atom feed:
 
->>> import requests
->>> from parsel import Selector
->>> text = requests.get('https://feeds.feedburner.com/PythonInsider').text
->>> sel = Selector(text=text, type='xml')
+.. skip: start
+
+>>> import requests
+>>> from parsel import Selector
+>>> text = requests.get('https://feeds.feedburner.com/PythonInsider').text
+>>> sel = Selector(text=text, type='xml')
+
+.. skip: end
+
+.. invisible-code-block: python
+
+    sel = load_selector('python-insider.xml', type='xml')
 
 And try to select the links again, now using an "atom:" prefix
 for the "link" node test::
 
 >>> sel.xpath("//atom:link", namespaces={"atom": "http://www.w3.org/2005/Atom"})
-[<Selector xpath='//atom:link' data='<link xmlns="http://www.w3.org/2005/Atom'>,
- <Selector xpath='//atom:link' data='<link xmlns="http://www.w3.org/2005/Atom'>,
- ...
+[<Selector xpath='//atom:link' data='<link xmlns="http://www.w3.org/2005/A...'>,
+ <Selector xpath='//atom:link' data='<link xmlns="http://www.w3.org/2005/A...'>,
+ ...]
 
 You can pass several namespaces (here we're using shorter 1-letter prefixes)::
 
 >>> sel.xpath("//a:entry/a:author/g:image/@src",
 ...           namespaces={"a": "http://www.w3.org/2005/Atom",
 ...                       "g": "http://schemas.google.com/g/2005"}).getall()
-['http://photos1.blogger.com/blogger/4554/1119/400/beethoven_10.jpg',
- '//lh3.googleusercontent.com/-7xisiK0EArc/AAAAAAAAAAI/AAAAAAAAAuM/-r6o6A8RKCM/s512-c/photo.jpg',
- ...
+['https://img1.blogblog.com/img/b16-rounded.gif',
+ 'https://img1.blogblog.com/img/b16-rounded.gif',
+ ...]
 
 .. _topics-xpath-variables:
 
@@ -1055,6 +1083,10 @@ queries or prepared statements in the SQL world where you replace
 some arguments in your queries with placeholders like ``?``,
 which are then substituted with values passed with the query.
 
+.. invisible-code-block: python
+
+    selector = load_selector('selectors-sample1.html')
+
 Here's an example to match an element based on its normalized string-value::
 
 >>> str_to_match = "Name: My image 3"

pytest.ini

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 [pytest]
-addopts = --doctest-modules --assert=plain --ignore=setup.py
+addopts = --assert=plain --doctest-modules --ignore=setup.py
 flake8-ignore =
     parsel/csstranslator.py E501
     parsel/selector.py E501

tests/requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -1,2 +1,3 @@
 pytest
 pytest-cov
+sybil

tox.ini

Lines changed: 2 additions & 2 deletions
@@ -2,10 +2,10 @@
 envlist = py27, py35, py36, py37, py38, pypy, pypy3
 
 [testenv]
+usedevelop = True
 deps =
     -r{toxinidir}/tests/requirements.txt
-
-commands = py.test --cov=parsel --cov-report= {posargs:parsel tests}
+commands = py.test --cov=parsel --cov-report= {posargs:docs parsel tests}
 
 [testenv:security]
 basepython = python3.8
