Skip to content

Commit 4a329d7

Browse files
author
Andy Rowlands
authored
Merge pull request #137 from salsadigitalauorg/feature/crawler-filter-patterns
Add a new group plugin for crawler.
2 parents 3b22fb4 + 7c16be0 commit 4a329d7

File tree

2 files changed

+106
-0
lines changed

2 files changed

+106
-0
lines changed

src/Command/CrawlCommand.php

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,10 @@ protected function execute(InputInterface $input, OutputInterface $output)
145145
$crawler->setDelayBetweenRequests($delay);
146146
}
147147

148+
if (!empty($this->config['options']['ignore_robotstxt'])) {
149+
$crawler->ignoreRobots();
150+
}
151+
148152
$io->success('Starting crawl!');
149153

150154
$crawler->startCrawling($baseUrl);
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
<?php
2+
3+
namespace Merlin\Crawler\Group;
4+
5+
use Psr\Http\Message\ResponseInterface;
6+
use Symfony\Component\DomCrawler\Crawler;
7+
8+
/**
 * Groups crawled pages by a regex match against an attribute of selected elements.
 *
 * The configured selector is tried as an XPath expression first and, when that
 * yields nothing usable, as a CSS selector. The first selected element whose
 * attribute matches the pattern supplies a suffix that is appended to the
 * group id, which allows results to be extracted to separate files.
 *
 * @example
 *   id: group-by-node-type
 *   type: element_filter
 *   options:
 *     selector: .node # DOM or Xpath
 *     pattern: /node-\w+/
 *     filter_attr: class
 */
class ElementFilter extends GroupBase
{

    /**
     * The filtered type - used to separate output by id.
     *
     * NULL until match() finds an element whose attribute matches the pattern.
     *
     * @var string|null
     */
    protected $filter_type;


    /**
     * {@inheritdoc}
     */
    public function __construct(array $config=[])
    {
        parent::__construct($config);
        $this->filter_type = NULL;

    }//end __construct()


    /**
     * Returns the parent id, suffixed with "-<type>" once a type was matched.
     *
     * {@inheritdoc}
     */
    public function getId() : string
    {
        $id = parent::getId();

        // Compare explicitly instead of relying on truthiness so that a
        // legitimate match of "0" is still appended to the id.
        if (is_string($this->filter_type) && $this->filter_type !== '') {
            $id .= "-{$this->filter_type}";
        }

        return $id;

    }//end getId()


    /**
     * Checks whether the response contains an element for the configured selector.
     *
     * On success the filter type is refreshed from the first selected element
     * whose `filter_attr` attribute (default: "class") matches `pattern`. When
     * no element matches the pattern the type is cleared and the plain group
     * id is used.
     *
     * @param string            $url      The URL the response was fetched from.
     * @param ResponseInterface $response The response to inspect.
     *
     * @return bool
     *   TRUE when the selector finds at least one element, FALSE when the
     *   selector or pattern option is missing, the selector is invalid, or
     *   nothing was selected.
     */
    public function match($url, ResponseInterface $response) : bool
    {
        $selector    = $this->getOption('selector');
        $pattern     = $this->getOption('pattern');
        $filter_attr = $this->getOption('filter_attr') ?: 'class';

        // Validate the configuration before paying for DOM parsing.
        if (empty($selector) || empty($pattern)) {
            return FALSE;
        }

        $dom = new Crawler($response->getBody()->__toString(), $url);

        // Try the selector as XPath first. evaluate() may return a plain
        // array for scalar expressions, which is treated as "no elements".
        try {
            $element = $dom->evaluate($selector);
        } catch (\Exception $error) {
            $element = NULL;
        }

        // Fall back to treating the selector as a CSS selector.
        if (!($element instanceof Crawler) || $element->count() === 0) {
            try {
                $element = $dom->filter($selector);
            } catch (\Exception $error) {
                return FALSE;
            }
        }

        if ($element->count() === 0) {
            return FALSE;
        }

        $types = $element->each(
            function (Crawler $node) use ($filter_attr, $pattern) {
                $value = $node->attr($filter_attr);

                // attr() yields NULL for a missing attribute; passing NULL to
                // preg_match() is deprecated as of PHP 8.1.
                if (!is_string($value) || $value === '') {
                    return NULL;
                }

                if (preg_match($pattern, $value, $matches) !== 1) {
                    return NULL;
                }

                return $matches[0];
            }
        );

        // Use the first element that actually matched, so a non-matching
        // leading element no longer hides a later match.
        $this->filter_type = NULL;
        foreach ($types as $type) {
            if (is_string($type) && $type !== '') {
                $this->filter_type = $type;
                break;
            }
        }

        return TRUE;

    }//end match()


}//end class

0 commit comments

Comments
 (0)