Skip to content

Commit 4eb4a60

Browse files
committed
bug symfony#10958 [DomCrawler] Fixed filterXPath() chaining loosing the parent DOM nodes (stof, robbertkl)
This PR was merged into the 2.3 branch. Discussion ---------- [DomCrawler] Fixed filterXPath() chaining loosing the parent DOM nodes | Q | A | ------------- | --- | Bug fix? | yes | New feature? | no | BC breaks? | no | Deprecations? | no | Tests pass? | yes | Fixed tickets | symfony#10206 | License | MIT | Doc PR | n/a This is a fixed version of symfony#10207, preserving the BC for XPath queries. It is the rebased version of symfony#10935 targetting 2.3 The example given in symfony#10260 when reporting the regression in the previous attempt is covered by the new tests added in the first commit of the PR. I also added many tests ensuring that the behavior is the same than in the current implementation. Commits ------- 80438c2 Fixed the XPath filtering to have the same behavior than Symfony 2.4 711ac32 [DomCrawler] Fixed filterXPath() chaining 8f706c9 [DomCrawler] Added more tests for the XPath filtering
2 parents 125b9e7 + 80438c2 commit 4eb4a60

File tree

5 files changed

+219
-41
lines changed

5 files changed

+219
-41
lines changed

src/Symfony/Component/DomCrawler/Crawler.php

Lines changed: 103 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ public function addHtmlContent($content, $charset = 'UTF-8')
170170

171171
$this->addDocument($dom);
172172

173-
$base = $this->filterXPath('descendant-or-self::base')->extract(array('href'));
173+
$base = $this->filterRelativeXPath('descendant-or-self::base')->extract(array('href'));
174174

175175
$baseHref = current($base);
176176
if (count($base) && !empty($baseHref)) {
@@ -445,7 +445,7 @@ public function parents()
445445
$nodes = array();
446446

447447
while ($node = $node->parentNode) {
448-
if (1 === $node->nodeType && '_root' !== $node->nodeName) {
448+
if (1 === $node->nodeType) {
449449
$nodes[] = $node;
450450
}
451451
}
@@ -580,6 +580,11 @@ public function extract($attributes)
580580
/**
581581
* Filters the list of nodes with an XPath expression.
582582
*
583+
* The XPath expression is evaluated in the context of the crawler, which
584+
* is considered as a fake parent of the elements inside it.
585+
* This means that a child selector "div" or "./div" will match only
586+
* the div elements of the current crawler, not their children.
587+
*
583588
* @param string $xpath An XPath expression
584589
*
585590
* @return Crawler A new instance of Crawler with the filtered list of nodes
@@ -588,15 +593,14 @@ public function extract($attributes)
588593
*/
589594
public function filterXPath($xpath)
590595
{
591-
$document = new \DOMDocument('1.0', 'UTF-8');
592-
$root = $document->appendChild($document->createElement('_root'));
593-
foreach ($this as $node) {
594-
$root->appendChild($document->importNode($node, true));
595-
}
596+
$xpath = $this->relativize($xpath);
596597

597-
$domxpath = new \DOMXPath($document);
598+
// If we dropped all expressions in the XPath while preparing it, there would be no match
599+
if ('' === $xpath) {
600+
return new static(null, $this->uri);
601+
}
598602

599-
return new static($domxpath->query($xpath), $this->uri);
603+
return $this->filterRelativeXPath($xpath);
600604
}
601605

602606
/**
@@ -620,7 +624,8 @@ public function filter($selector)
620624
// @codeCoverageIgnoreEnd
621625
}
622626

623-
return $this->filterXPath(CssSelector::toXPath($selector));
627+
// The CssSelector already prefixes the selector with descendant-or-self::
628+
return $this->filterRelativeXPath(CssSelector::toXPath($selector));
624629
}
625630

626631
/**
@@ -634,10 +639,10 @@ public function filter($selector)
634639
*/
635640
public function selectLink($value)
636641
{
637-
$xpath = sprintf('//a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s)] ', static::xpathLiteral(' '.$value.' ')).
638-
sprintf('| //a/img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]/ancestor::a', static::xpathLiteral(' '.$value.' '));
642+
$xpath = sprintf('descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) ', static::xpathLiteral(' '.$value.' ')).
643+
sprintf('or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]]', static::xpathLiteral(' '.$value.' '));
639644

640-
return $this->filterXPath($xpath);
645+
return $this->filterRelativeXPath($xpath);
641646
}
642647

643648
/**
@@ -652,11 +657,11 @@ public function selectLink($value)
652657
public function selectButton($value)
653658
{
654659
$translate = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';
655-
$xpath = sprintf('//input[((contains(%s, "submit") or contains(%s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $translate, $translate, static::xpathLiteral(' '.$value.' ')).
660+
$xpath = sprintf('descendant-or-self::input[((contains(%s, "submit") or contains(%s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $translate, $translate, static::xpathLiteral(' '.$value.' ')).
656661
sprintf('or (contains(%s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id="%s" or @name="%s"] ', $translate, static::xpathLiteral(' '.$value.' '), $value, $value).
657-
sprintf('| //button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id="%s" or @name="%s"]', static::xpathLiteral(' '.$value.' '), $value, $value);
662+
sprintf('| descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id="%s" or @name="%s"]', static::xpathLiteral(' '.$value.' '), $value, $value);
658663

659-
return $this->filterXPath($xpath);
664+
return $this->filterRelativeXPath($xpath);
660665
}
661666

662667
/**
@@ -772,6 +777,88 @@ public static function xpathLiteral($s)
772777
return sprintf("concat(%s)", implode($parts, ', '));
773778
}
774779

780+
/**
781+
* Filters the list of nodes with an XPath expression.
782+
*
783+
* The XPath expression should already be processed to apply it in the context of each node.
784+
*
785+
* @param string $xpath
786+
*
787+
* @return Crawler
788+
*/
789+
private function filterRelativeXPath($xpath)
790+
{
791+
$crawler = new static(null, $this->uri);
792+
793+
foreach ($this as $node) {
794+
$domxpath = new \DOMXPath($node->ownerDocument);
795+
$crawler->add($domxpath->query($xpath, $node));
796+
}
797+
798+
return $crawler;
799+
}
800+
801+
/**
802+
* Make the XPath relative to the current context.
803+
*
804+
* The returned XPath will match elements matching the XPath inside the current crawler
805+
* when running in the context of a node of the crawler.
806+
*
807+
* @param string $xpath
808+
*
809+
* @return string
810+
*/
811+
private function relativize($xpath)
812+
{
813+
$expressions = array();
814+
815+
$unionPattern = '/\|(?![^\[]*\])/';
816+
// An expression which will never match to replace expressions which cannot match in the crawler
817+
// We cannot simply drop
818+
$nonMatchingExpression = 'a[name() = "b"]';
819+
820+
// Split any unions into individual expressions.
821+
foreach (preg_split($unionPattern, $xpath) as $expression) {
822+
$expression = trim($expression);
823+
$parenthesis = '';
824+
825+
// If the union is inside some braces, we need to preserve the opening braces and apply
826+
// the change only inside it.
827+
if (preg_match('/^[\(\s*]+/', $expression, $matches)) {
828+
$parenthesis = $matches[0];
829+
$expression = substr($expression, strlen($parenthesis));
830+
}
831+
832+
// BC for Symfony 2.4 and lower were elements were adding in a fake _root parent
833+
if (0 === strpos($expression, '/_root/')) {
834+
$expression = './'.substr($expression, 7);
835+
}
836+
837+
// add prefix before absolute element selector
838+
if (empty($expression)) {
839+
$expression = $nonMatchingExpression;
840+
} elseif (0 === strpos($expression, '//')) {
841+
$expression = 'descendant-or-self::' . substr($expression, 2);
842+
} elseif (0 === strpos($expression, './')) {
843+
$expression = 'self::' . substr($expression, 2);
844+
} elseif ('/' === $expression[0]) {
845+
// the only direct child in Symfony 2.4 and lower is _root, which is already handled previously
846+
// so let's drop the expression entirely
847+
$expression = $nonMatchingExpression;
848+
} elseif ('.' === $expression[0]) {
849+
// '.' is the fake root element in Symfony 2.4 and lower, which is excluded from results
850+
$expression = $nonMatchingExpression;
851+
} elseif (0 === strpos($expression, 'descendant::')) {
852+
$expression = 'descendant-or-self::' . substr($expression, strlen('descendant::'));
853+
} elseif (0 !== strpos($expression, 'descendant-or-self::')) {
854+
$expression = 'self::' .$expression;
855+
}
856+
$expressions[] = $parenthesis.$expression;
857+
}
858+
859+
return implode(' | ', $expressions);
860+
}
861+
775862
/**
776863
* @param int $position
777864
*

src/Symfony/Component/DomCrawler/Field/FormField.php

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -52,13 +52,7 @@ public function __construct(\DOMNode $node)
5252
{
5353
$this->node = $node;
5454
$this->name = $node->getAttribute('name');
55-
56-
$this->document = new \DOMDocument('1.0', 'UTF-8');
57-
$this->node = $this->document->importNode($this->node, true);
58-
59-
$root = $this->document->appendChild($this->document->createElement('_root'));
60-
$root->appendChild($this->node);
61-
$this->xpath = new \DOMXPath($this->document);
55+
$this->xpath = new \DOMXPath($node->ownerDocument);
6256

6357
$this->initialize();
6458
}

src/Symfony/Component/DomCrawler/Form.php

Lines changed: 9 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -388,9 +388,7 @@ private function initialize()
388388
{
389389
$this->fields = new FormFieldRegistry();
390390

391-
$document = new \DOMDocument('1.0', 'UTF-8');
392-
$xpath = new \DOMXPath($document);
393-
$root = $document->appendChild($document->createElement('_root'));
391+
$xpath = new \DOMXPath($this->node->ownerDocument);
394392

395393
// add submitted button if it has a valid name
396394
if ('form' !== $this->button->nodeName && $this->button->hasAttribute('name') && $this->button->getAttribute('name')) {
@@ -400,38 +398,32 @@ private function initialize()
400398

401399
// temporarily change the name of the input node for the x coordinate
402400
$this->button->setAttribute('name', $name.'.x');
403-
$this->set(new Field\InputFormField($document->importNode($this->button, true)));
401+
$this->set(new Field\InputFormField($this->button));
404402

405403
// temporarily change the name of the input node for the y coordinate
406404
$this->button->setAttribute('name', $name.'.y');
407-
$this->set(new Field\InputFormField($document->importNode($this->button, true)));
405+
$this->set(new Field\InputFormField($this->button));
408406

409407
// restore the original name of the input node
410408
$this->button->setAttribute('name', $name);
411409
} else {
412-
$this->set(new Field\InputFormField($document->importNode($this->button, true)));
410+
$this->set(new Field\InputFormField($this->button));
413411
}
414412
}
415413

416414
// find form elements corresponding to the current form
417415
if ($this->node->hasAttribute('id')) {
418-
// traverse through the whole document
419-
$node = $document->importNode($this->node->ownerDocument->documentElement, true);
420-
$root->appendChild($node);
421-
422416
// corresponding elements are either descendants or have a matching HTML5 form attribute
423417
$formId = Crawler::xpathLiteral($this->node->getAttribute('id'));
424-
$fieldNodes = $xpath->query(sprintf('descendant::input[@form=%s] | descendant::button[@form=%s] | descendant::textarea[@form=%s] | descendant::select[@form=%s] | //form[@id=%s]//input[not(@form)] | //form[@id=%s]//button[not(@form)] | //form[@id=%s]//textarea[not(@form)] | //form[@id=%s]//select[not(@form)]', $formId, $formId, $formId, $formId, $formId, $formId, $formId, $formId), $root);
418+
419+
$fieldNodes = $xpath->query(sprintf('descendant::input[@form=%s] | descendant::button[@form=%s] | descendant::textarea[@form=%s] | descendant::select[@form=%s] | //form[@id=%s]//input[not(@form)] | //form[@id=%s]//button[not(@form)] | //form[@id=%s]//textarea[not(@form)] | //form[@id=%s]//select[not(@form)]', $formId, $formId, $formId, $formId, $formId, $formId, $formId, $formId));
425420
foreach ($fieldNodes as $node) {
426421
$this->addField($node);
427422
}
428423
} else {
429-
// parent form has no id, add descendant elements only
430-
$node = $document->importNode($this->node, true);
431-
$root->appendChild($node);
432-
433-
// descendant elements with form attribute are not part of this form
434-
$fieldNodes = $xpath->query('descendant::input[not(@form)] | descendant::button[not(@form)] | descendant::textarea[not(@form)] | descendant::select[not(@form)]', $root);
424+
// do the xpath query with $this->node as the context node, to only find descendant elements
425+
// however, descendant elements with form attribute are not part of this form
426+
$fieldNodes = $xpath->query('descendant::input[not(@form)] | descendant::button[not(@form)] | descendant::textarea[not(@form)] | descendant::select[not(@form)]', $this->node);
435427
foreach ($fieldNodes as $node) {
436428
$this->addField($node);
437429
}

src/Symfony/Component/DomCrawler/Tests/CrawlerTest.php

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,27 @@ public function testExtract()
368368
$this->assertEquals(array(), $this->createTestCrawler()->filterXPath('//ol')->extract('_text'), '->extract() returns an empty array if the node list is empty');
369369
}
370370

371+
public function testFilterXpathComplexQueries()
372+
{
373+
$crawler = $this->createTestCrawler()->filterXPath('//body');
374+
375+
$this->assertCount(0, $crawler->filterXPath('/input'));
376+
$this->assertCount(0, $crawler->filterXPath('/body'));
377+
$this->assertCount(1, $crawler->filterXPath('/_root/body'));
378+
$this->assertCount(1, $crawler->filterXPath('./body'));
379+
$this->assertCount(4, $crawler->filterXPath('//form')->filterXPath('//button | //input'));
380+
$this->assertCount(1, $crawler->filterXPath('body'));
381+
$this->assertCount(6, $crawler->filterXPath('//button | //input'));
382+
$this->assertCount(1, $crawler->filterXPath('//body'));
383+
$this->assertCount(1, $crawler->filterXPath('descendant-or-self::body'));
384+
$this->assertCount(1, $crawler->filterXPath('//div[@id="parent"]')->filterXPath('./div'), 'A child selection finds only the current div');
385+
$this->assertCount(2, $crawler->filterXPath('//div[@id="parent"]')->filterXPath('descendant::div'), 'A descendant selector matches the current div and its child');
386+
$this->assertCount(2, $crawler->filterXPath('//div[@id="parent"]')->filterXPath('//div'), 'A descendant selector matches the current div and its child');
387+
$this->assertCount(5, $crawler->filterXPath('(//a | //div)//img'));
388+
$this->assertCount(7, $crawler->filterXPath('((//a | //div)//img | //ul)'));
389+
$this->assertCount(7, $crawler->filterXPath('( ( //a | //div )//img | //ul )'));
390+
}
391+
371392
/**
372393
* @covers Symfony\Component\DomCrawler\Crawler::filterXPath
373394
*/
@@ -378,8 +399,10 @@ public function testFilterXPath()
378399
$this->assertInstanceOf('Symfony\\Component\\DomCrawler\\Crawler', $crawler, '->filterXPath() returns a new instance of a crawler');
379400

380401
$crawler = $this->createTestCrawler()->filterXPath('//ul');
381-
382402
$this->assertCount(6, $crawler->filterXPath('//li'), '->filterXPath() filters the node list with the XPath expression');
403+
404+
$crawler = $this->createTestCrawler();
405+
$this->assertCount(3, $crawler->filterXPath('//body')->filterXPath('//button')->parents(), '->filterXpath() preserves parents when chained');
383406
}
384407

385408
/**
@@ -455,6 +478,44 @@ public function testLink()
455478
}
456479
}
457480

481+
public function testSelectLinkAndLinkFiltered()
482+
{
483+
$html = <<<HTML
484+
<!DOCTYPE html>
485+
<html lang="en">
486+
<body>
487+
<div id="action">
488+
<a href="/index.php?r=site/login">Login</a>
489+
</div>
490+
<form id="login-form" action="/index.php?r=site/login" method="post">
491+
<button type="submit">Submit</button>
492+
</form>
493+
</body>
494+
</html>
495+
HTML;
496+
497+
$crawler = new Crawler($html);
498+
$filtered = $crawler->filterXPath("descendant-or-self::*[@id = 'login-form']");
499+
500+
$this->assertCount(0, $filtered->selectLink('Login'));
501+
$this->assertCount(1, $filtered->selectButton('Submit'));
502+
503+
$filtered = $crawler->filterXPath("descendant-or-self::*[@id = 'action']");
504+
505+
$this->assertCount(1, $filtered->selectLink('Login'));
506+
$this->assertCount(0, $filtered->selectButton('Submit'));
507+
508+
$this->assertCount(1, $crawler->selectLink('Login')->selectLink('Login'));
509+
$this->assertCount(1, $crawler->selectButton('Submit')->selectButton('Submit'));
510+
}
511+
512+
public function testChaining()
513+
{
514+
$crawler = new Crawler('<div name="a"><div name="b"><div name="c"></div></div></div>');
515+
516+
$this->assertEquals('a', $crawler->filterXPath('//div')->filterXPath('div')->filterXPath('div')->attr('name'));
517+
}
518+
458519
public function testLinks()
459520
{
460521
$crawler = $this->createTestCrawler('http://example.com/bar/')->selectLink('Foo');
@@ -665,6 +726,10 @@ public function createTestCrawler($uri = null)
665726
<li>Two Bis</li>
666727
<li>Three Bis</li>
667728
</ul>
729+
<div id="parent">
730+
<div id="child"></div>
731+
</div>
732+
<div id="sibling"><img /></div>
668733
</body>
669734
</html>
670735
');

0 commit comments

Comments
 (0)