Skip to content

Commit 80438c2

Browse files
committed
Fixed the XPath filtering to have the same behavior as Symfony 2.4
1 parent 711ac32 commit 80438c2

File tree

1 file changed

+101
-13
lines changed

1 file changed

+101
-13
lines changed

src/Symfony/Component/DomCrawler/Crawler.php

Lines changed: 101 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ public function addHtmlContent($content, $charset = 'UTF-8')
170170

171171
$this->addDocument($dom);
172172

173-
$base = $this->filterXPath('descendant-or-self::base')->extract(array('href'));
173+
$base = $this->filterRelativeXPath('descendant-or-self::base')->extract(array('href'));
174174

175175
$baseHref = current($base);
176176
if (count($base) && !empty($baseHref)) {
@@ -580,6 +580,11 @@ public function extract($attributes)
580580
/**
581581
* Filters the list of nodes with an XPath expression.
582582
*
583+
* The XPath expression is evaluated in the context of the crawler, which
584+
* is considered as a fake parent of the elements inside it.
585+
* This means that a child selector "div" or "./div" will match only
586+
* the div elements of the current crawler, not their children.
587+
*
583588
* @param string $xpath An XPath expression
584589
*
585590
* @return Crawler A new instance of Crawler with the filtered list of nodes
@@ -588,14 +593,14 @@ public function extract($attributes)
588593
*/
589594
public function filterXPath($xpath)
{
    // Rewrite the expression so that it can be evaluated with each node of
    // this crawler as the context node.
    $relativeXPath = $this->relativize($xpath);

    // An empty rewritten expression cannot select anything: answer with an
    // empty crawler bound to the same URI instead of running a query.
    return '' === $relativeXPath
        ? new static(null, $this->uri)
        : $this->filterRelativeXPath($relativeXPath);
}
600605

601606
/**
@@ -619,7 +624,8 @@ public function filter($selector)
619624
// @codeCoverageIgnoreEnd
620625
}
621626

622-
return $this->filterXPath(CssSelector::toXPath($selector));
627+
// The CssSelector already prefixes the selector with descendant-or-self::
628+
return $this->filterRelativeXPath(CssSelector::toXPath($selector));
623629
}
624630

625631
/**
@@ -633,10 +639,10 @@ public function filter($selector)
633639
*/
634640
public function selectLink($value)
{
    // The searched text is padded with spaces so that it is compared against
    // the space-padded, whitespace-normalized content built in the XPath.
    $literal = static::xpathLiteral(' '.$value.' ');

    // Matches on the text content of the link itself...
    $textPredicate = sprintf('contains(concat(\' \', normalize-space(string(.)), \' \'), %s)', $literal);
    // ...or on the alt attribute of an image that is a direct child of the link.
    $altPredicate = sprintf('./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]', $literal);

    return $this->filterRelativeXPath('descendant-or-self::a['.$textPredicate.' or '.$altPredicate.']');
}
641647

642648
/**
@@ -651,11 +657,11 @@ public function selectLink($value)
651657
public function selectButton($value)
{
    // Lower-cased @type, so that the "submit"/"button"/"image" checks below
    // are case-insensitive.
    $translate = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';

    // Space-padded literal for the "contains" checks on normalized text.
    $paddedLiteral = static::xpathLiteral(' '.$value.' ');
    // Exact literal for the @id/@name comparisons. The value must be escaped
    // here too: interpolating it raw between double quotes produces a broken
    // (or attacker-controlled) XPath as soon as it contains a '"'.
    $exactLiteral = static::xpathLiteral($value);

    $xpath = sprintf('descendant-or-self::input[((contains(%s, "submit") or contains(%s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $translate, $translate, $paddedLiteral).
             sprintf('or (contains(%s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id=%s or @name=%s] ', $translate, $paddedLiteral, $exactLiteral, $exactLiteral).
             sprintf('| descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id=%s or @name=%s]', $paddedLiteral, $exactLiteral, $exactLiteral);

    return $this->filterRelativeXPath($xpath);
}
660666

661667
/**
@@ -771,6 +777,88 @@ public static function xpathLiteral($s)
771777
return sprintf("concat(%s)", implode($parts, ', '));
772778
}
773779

780+
/**
 * Runs an XPath query from every node of this crawler and collects the results.
 *
 * The expression is used as-is: callers are expected to pass an XPath that is
 * already written to be evaluated with each node as the context node.
 *
 * @param string $xpath An XPath expression relative to each node
 *
 * @return Crawler A new crawler (same URI) holding the matched nodes
 */
private function filterRelativeXPath($xpath)
{
    $matches = new static(null, $this->uri);

    foreach ($this as $contextNode) {
        // The evaluator is bound to the document owning the context node.
        $evaluator = new \DOMXPath($contextNode->ownerDocument);
        $matches->add($evaluator->query($xpath, $contextNode));
    }

    return $matches;
}
800+
801+
/**
 * Makes the XPath relative to the current context.
 *
 * The returned XPath will match elements matching the XPath inside the current crawler
 * when running in the context of a node of the crawler.
 *
 * Every member of a top-level union ("a | b") is rewritten on its own:
 *  - "//x", ".//x" and "descendant::x" become "descendant-or-self::x";
 *  - "./x", "/_root/x" and a bare "x" become "self::x";
 *  - expressions that could only target the fake root of Symfony 2.4 and lower
 *    (other absolute paths, ".", "..", an empty member) are replaced by an
 *    expression which never matches.
 *
 * @param string $xpath
 *
 * @return string
 */
private function relativize($xpath)
{
    $expressions = array();

    // A "|" is treated as a union separator unless it is followed by a "]"
    // with no "[" in between, i.e. unless it appears to sit inside a predicate.
    $unionPattern = '/\|(?![^\[]*\])/';
    // An expression which will never match, used to replace expressions which cannot match in the crawler.
    // Such a member is replaced rather than removed so that the preserved
    // opening parentheses and the union always keep an operand.
    $nonMatchingExpression = 'a[name() = "b"]';

    // Split any unions into individual expressions.
    foreach (preg_split($unionPattern, $xpath) as $expression) {
        $expression = trim($expression);
        $parenthesis = '';

        // If the union is inside some parentheses, we need to preserve the opening parentheses
        // and apply the change only inside them. Only "(" and whitespace are consumed here:
        // a leading "*" is a wildcard name test and must stay part of the expression.
        if (preg_match('/^[\(\s]+/', $expression, $matches)) {
            $parenthesis = $matches[0];
            $expression = substr($expression, strlen($parenthesis));
        }

        // BC for Symfony 2.4 and lower, where elements were added in a fake _root parent
        if (0 === strpos($expression, '/_root/')) {
            $expression = './'.substr($expression, 7);
        }

        // add prefix before absolute element selector
        if (empty($expression)) {
            $expression = $nonMatchingExpression;
        } elseif (0 === strpos($expression, '//')) {
            $expression = 'descendant-or-self::'.substr($expression, 2);
        } elseif (0 === strpos($expression, './/')) {
            // must be tested before "./", which would otherwise produce the invalid "self::/..."
            $expression = 'descendant-or-self::'.substr($expression, 3);
        } elseif (0 === strpos($expression, './')) {
            $expression = 'self::'.substr($expression, 2);
        } elseif ('/' === $expression[0]) {
            // the only direct child in Symfony 2.4 and lower is _root, which is already handled previously
            // so let's replace the expression entirely
            $expression = $nonMatchingExpression;
        } elseif ('.' === $expression[0]) {
            // '.' is the fake root element in Symfony 2.4 and lower, which is excluded from results
            $expression = $nonMatchingExpression;
        } elseif (0 === strpos($expression, 'descendant::')) {
            $expression = 'descendant-or-self::'.substr($expression, strlen('descendant::'));
        } elseif (0 !== strpos($expression, 'descendant-or-self::')) {
            $expression = 'self::'.$expression;
        }

        $expressions[] = $parenthesis.$expression;
    }

    return implode(' | ', $expressions);
}
861+
774862
/**
775863
* @param int $position
776864
*

0 commit comments

Comments
 (0)