Skip to content

Commit d94f825

Browse files
committed
RemotePageFetcher
1 parent 759d8e0 commit d94f825

File tree

10 files changed

+535
-6
lines changed

10 files changed

+535
-6
lines changed

composer.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,8 @@
7979
"ext-imap": "*",
8080
"tatevikgr/rss-feed": "dev-main",
8181
"ext-pdo": "*",
82-
"ezyang/htmlpurifier": "^4.19"
82+
"ezyang/htmlpurifier": "^4.19",
83+
"ext-libxml": "*"
8384
},
8485
"require-dev": {
8586
"phpunit/phpunit": "^9.5",

src/Domain/Messaging/Service/HtmlToText.php renamed to src/Domain/Common/HtmlToText.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
declare(strict_types=1);
44

5-
namespace PhpList\Core\Domain\Messaging\Service;
5+
namespace PhpList\Core\Domain\Common;
66

77
use PhpList\Core\Domain\Configuration\Model\ConfigOption;
88
use PhpList\Core\Domain\Configuration\Service\Provider\ConfigProvider;
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpList\Core\Domain\Common;
6+
7+
use DOMDocument;
8+
use DOMElement;
9+
use DOMXPath;
10+
11+
/**
 * Rewrites relative URLs inside HTML fragments and CSS so that they become absolute.
 *
 * URLs are resolved against a base URL (normally the URL the document was fetched from).
 * References that are already absolute, fragment-only, or that contain a `[PLACEHOLDER]`
 * token are returned unchanged.
 */
class HtmlUrlRewriter
{
    /** Attributes that carry exactly one URL each. */
    private const URL_ATTRIBUTES = ['src', 'href', 'action', 'background'];

    /**
     * Makes every resource reference in an HTML fragment absolute.
     *
     * Handles the `src`, `href`, `action` and `background` attributes, `srcset` candidate lists,
     * the contents of `<style>` elements and inline `style=""` attributes.
     *
     * @param string $html    HTML document or fragment
     * @param string $baseUrl URL the relative references are resolved against; a trailing slash is
     *                        significant (it marks the base as a directory) and is therefore kept
     *
     * @return string the rewritten markup, or the unmodified input when the wrapper element
     *                cannot be located again after parsing
     */
    public function addAbsoluteResources(string $html, string $baseUrl): string
    {
        // libxml error handling is process-wide state: remember it so it can be restored.
        $previousErrorMode = libxml_use_internal_errors(true);

        try {
            $dom = new DOMDocument();

            // Wrap the input so fragments survive parsing and can be extracted again afterwards.
            $wrapped = '<!doctype html><meta charset="utf-8"><div id="__wrap__">' . $html . '</div>';
            $dom->loadHTML($wrapped, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);

            $xpath = new DOMXPath($dom);

            // 1) Single-URL attributes.
            foreach (self::URL_ATTRIBUTES as $attribute) {
                foreach ($this->findElements($xpath, '//*[@' . $attribute . ']') as $element) {
                    $element->setAttribute(
                        $attribute,
                        $this->absolutizeUrl($element->getAttribute($attribute), $baseUrl)
                    );
                }
            }

            // 2) srcset holds several comma separated candidates.
            foreach ($this->findElements($xpath, '//*[@srcset]') as $element) {
                $element->setAttribute(
                    'srcset',
                    $this->rewriteSrcset($element->getAttribute('srcset'), $baseUrl)
                );
            }

            // 3) CSS inside <style> elements.
            foreach ($this->findElements($xpath, '//style') as $styleElement) {
                $styleElement->nodeValue = $this->rewriteCssUrls((string) $styleElement->nodeValue, $baseUrl);
            }

            // 4) CSS inside style="" attributes.
            foreach ($this->findElements($xpath, '//*[@style]') as $element) {
                $element->setAttribute(
                    'style',
                    $this->rewriteCssUrls($element->getAttribute('style'), $baseUrl)
                );
            }

            $wrapper = $dom->getElementById('__wrap__');
            if ($wrapper === null) {
                // The parser did not keep the wrapper; returning the input is safer than failing.
                return $html;
            }

            $result = '';
            foreach ($wrapper->childNodes as $child) {
                $result .= $dom->saveHTML($child);
            }

            return $result;
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousErrorMode);
        }
    }

    /**
     * Converts $url to an absolute URL based on $baseUrl.
     *
     * Returned unchanged (apart from trimming surrounding whitespace):
     *  - empty strings and fragment-only references (#...)
     *  - anything containing a placeholder such as [SOMETHING]
     *  - anything that already has a scheme (http:, https:, mailto:, tel:, data:, javascript:, ...)
     *  - every URL when $baseUrl has no scheme or no host
     *
     * Protocol-relative references (//example.com/x) only get the scheme of the base prepended.
     * Query-only references (?a=1) are resolved against the base document itself, all other
     * relative references against the directory of the base path.
     *
     * @param string $url     reference to resolve
     * @param string $baseUrl absolute URL to resolve against
     */
    public function absolutizeUrl(string $url, string $baseUrl): string
    {
        $url = trim($url);

        if ($url === '' || $url[0] === '#') {
            return $url;
        }

        if (preg_match('/\[[^\]]+\]/', $url) === 1) {
            return $url;
        }

        if (preg_match('#^[a-z][a-z0-9+.-]*:#i', $url) === 1) {
            return $url;
        }

        $base = parse_url($baseUrl);
        if (!is_array($base) || empty($base['scheme']) || empty($base['host'])) {
            // Without a usable base it is better to leave the URL alone than to corrupt it.
            return $url;
        }

        if (str_starts_with($url, '//')) {
            return $base['scheme'] . ':' . $url;
        }

        $authority = $base['scheme'] . '://' . $base['host'] . (isset($base['port']) ? ':' . $base['port'] : '');
        $basePath = $base['path'] ?? '/';

        if ($url[0] === '/') {
            $path = $url;
        } elseif ($url[0] === '?') {
            // A query-only reference keeps the complete path of the base document.
            $path = $basePath . $url;
        } else {
            // When the base points at a file, relative references start from its directory.
            $directory = str_ends_with($basePath, '/')
                ? $basePath
                : (string) preg_replace('#/[^/]*$#', '/', $basePath);
            $path = $directory . $url;
        }

        return $authority . $this->normalizePath($path);
    }

    /**
     * Resolves `.` and `..` segments and collapses repeated slashes in a URL path.
     *
     * A query string and/or fragment at the end of $path is carried over untouched. The result
     * always starts with a slash; a trailing slash (also one implied by a final `.` or `..`
     * segment) is preserved because it distinguishes a directory from a file.
     *
     * @param string $path URL path, optionally followed by `?query` and/or `#fragment`
     */
    public function normalizePath(string $path): string
    {
        // Split off everything from the first "?" or "#" so it is not treated as path segments.
        $suffix = '';
        $suffixStart = strcspn($path, '?#');
        if ($suffixStart < strlen($path)) {
            $suffix = substr($path, $suffixStart);
            $path = substr($path, 0, $suffixStart);
        }

        $segments = explode('/', $path);
        $resolved = [];
        foreach ($segments as $segment) {
            if ($segment === '' || $segment === '.') {
                continue;
            }
            if ($segment === '..') {
                array_pop($resolved);
                continue;
            }
            $resolved[] = $segment;
        }

        $normalized = '/' . implode('/', $resolved);

        $lastSegment = $segments[count($segments) - 1];
        if ($resolved !== [] && in_array($lastSegment, ['', '.', '..'], true)) {
            $normalized .= '/';
        }

        return $normalized . $suffix;
    }

    /**
     * Makes the URL of every candidate in a `srcset` value absolute.
     *
     * Example: "a.jpg 1x, /b.jpg 2x". Candidates are split on commas; descriptors are kept
     * as they are and empty candidates are dropped.
     *
     * @param string $srcset  raw attribute value
     * @param string $baseUrl absolute URL to resolve against
     */
    public function rewriteSrcset(string $srcset, string $baseUrl): string
    {
        $rewritten = [];

        foreach (explode(',', $srcset) as $candidate) {
            $candidate = trim($candidate);
            if ($candidate === '') {
                continue;
            }

            // Split at the first whitespace: "url descriptor...".
            if (preg_match('/^(\S+)(\s+.*)?$/', $candidate, $parts) === 1) {
                $candidate = $this->absolutizeUrl($parts[1], $baseUrl) . ($parts[2] ?? '');
            }

            $rewritten[] = $candidate;
        }

        return implode(', ', $rewritten);
    }

    /**
     * Makes the URLs in `url(...)` tokens and `@import` rules of a CSS string absolute.
     *
     * Quoting is preserved. If the regular expression engine reports an error the CSS is
     * returned as it was before the failing step.
     *
     * @param string $css     stylesheet text or the value of a style attribute
     * @param string $baseUrl absolute URL to resolve against
     */
    public function rewriteCssUrls(string $css, string $baseUrl): string
    {
        // url(...) with double quotes, single quotes or no quotes at all.
        $withUrls = preg_replace_callback(
            '#url\(\s*(["\']?)(.*?)\1\s*\)#i',
            fn (array $m): string => 'url(' . $m[1] . $this->absolutizeUrl($m[2], $baseUrl) . $m[1] . ')',
            $css
        );
        if ($withUrls === null) {
            return $css;
        }

        // @import "..."; or @import url("..."); only the URL itself is replaced so that the
        // original keyword casing, spacing and url( ) wrapper stay exactly as written.
        $withImports = preg_replace_callback(
            '#(@import\s+(?:url\(\s*)?)(["\']?)([^"\')\s;]+)\2#i',
            fn (array $m): string => $m[1] . $m[2] . $this->absolutizeUrl($m[3], $baseUrl) . $m[2],
            $withUrls
        );

        return $withImports ?? $withUrls;
    }

    /**
     * Runs an XPath query and returns the matching element nodes as a plain array.
     *
     * @return list<DOMElement>
     */
    private function findElements(DOMXPath $xpath, string $expression): array
    {
        $nodes = $xpath->query($expression);
        if ($nodes === false) {
            return [];
        }

        $elements = [];
        foreach ($nodes as $node) {
            if ($node instanceof DOMElement) {
                $elements[] = $node;
            }
        }

        return $elements;
    }
}
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpList\Core\Domain\Common;
6+
7+
use Doctrine\ORM\EntityManagerInterface;
8+
use PhpList\Core\Domain\Configuration\Model\ConfigOption;
9+
use PhpList\Core\Domain\Configuration\Model\UrlCache;
10+
use PhpList\Core\Domain\Configuration\Repository\UrlCacheRepository;
11+
use PhpList\Core\Domain\Configuration\Service\Manager\EventLogManager;
12+
use PhpList\Core\Domain\Configuration\Service\Provider\ConfigProvider;
13+
use Psr\SimpleCache\CacheInterface;
14+
use Symfony\Contracts\HttpClient\HttpClientInterface;
15+
use Throwable;
16+
17+
/**
 * Fetches the content of a remote page, with two cache layers in front of the HTTP request.
 *
 * Lookup order: the PSR-16 cache, then the UrlCache table, then a direct HTTP GET. Content
 * that was obtained successfully has its relative resources made absolute and is written
 * back to both cache layers.
 */
class RemotePageFetcher
{
    public function __construct(
        private readonly HttpClientInterface $httpClient,
        private readonly CacheInterface $cache,
        private readonly ConfigProvider $configProvider,
        private readonly UrlCacheRepository $urlCacheRepository,
        private readonly EventLogManager $eventLogManager,
        private readonly HtmlUrlRewriter $htmlUrlRewriter,
        private readonly EntityManagerInterface $entityManager,
        private readonly int $defaultTtl = 300,
    ) {
    }

    /**
     * Returns the (cached) content behind $url.
     *
     * @param string $url      page URL; may contain `[key]` placeholders
     * @param array  $userData values for the placeholders, keyed by placeholder name; the
     *                         `password` entry and non-scalar values are never substituted
     *
     * @return string the page content with absolute resource URLs, or an empty string when
     *                nothing could be fetched
     */
    public function __invoke(string $url, array $userData): string
    {
        // The editor replaces & with &amp;, undo that.
        $url = str_ireplace('&amp;', '&', $url);

        foreach ($userData as $key => $val) {
            if ($key === 'password' || ($val !== null && !is_scalar($val))) {
                continue;
            }
            // str_ireplace only case-folds ASCII and leaves all other bytes alone, so no
            // charset round trip is needed (the former utf8_decode/utf8_encode pair is
            // deprecated and lossy for characters outside ISO-8859-1).
            $url = str_ireplace('[' . $key . ']', urlencode((string) $val), $url);
        }

        $url = $this->expandUrl($url);
        $cacheKey = md5($url);

        // 1) PSR-16 cache.
        $item = $this->cache->get($cacheKey);
        if (
            is_array($item)
            && isset($item['fetched'], $item['content'])
            && (time() - $item['fetched'] < $this->defaultTtl)
        ) {
            return (string) $item['content'];
        }

        // 2) Database cache that is still fresh.
        $cacheUrl = $this->urlCacheRepository->findByUrlAndLastModified($url);
        if ($cacheUrl !== null) {
            $age = time() - ($cacheUrl->getLastModified() ?? 0);
            $cachedContent = $cacheUrl->getContent();
            if ($age < $this->defaultTtl && $cachedContent !== null) {
                return $cachedContent;
            }
        }

        // Relying on the Last-Modified header doesn't work for many pages, use the current
        // time instead; see http://mantis.phplist.com/view.php?id=7684
        $lastModified = time();
        $cacheUrl = $this->urlCacheRepository->findByUrlAndLastModified($url, $lastModified);
        if ($cacheUrl !== null) {
            // todo: check what the page should be for this log
            $this->eventLogManager->log(page: 'unknown page', entry: $url . ' was cached in database');
            $content = $cacheUrl->getContent() ?? '';
        } else {
            $content = $this->fetchUrlDirect($url);
        }

        if ($content === '') {
            return '';
        }

        $content = $this->htmlUrlRewriter->addAbsoluteResources($content, $url);
        $this->eventLogManager->log(page: 'unknown page', entry: 'Fetching ' . $url . ' success');

        // Replace all stored copies of this URL with the fresh one.
        // NOTE(review): nothing is flushed here - presumably the caller flushes the entity
        // manager; confirm.
        foreach ($this->urlCacheRepository->getByUrl($url) as $staleCache) {
            $this->entityManager->remove($staleCache);
        }
        $urlCache = (new UrlCache())->setUrl($url)->setContent($content)->setLastModified($lastModified);
        $this->entityManager->persist($urlCache);

        $this->cache->set($cacheKey, [
            'fetched' => time(),
            'content' => $content,
        ]);

        return $content;
    }

    /**
     * Performs the HTTP GET request.
     *
     * Only options known to the HTTP client are passed: unknown option names are rejected
     * with an exception, which this method would turn into an empty result. Redirects are
     * followed according to the client's own default.
     *
     * @return string the response body (also for error status codes), or an empty string
     *                when the request failed; failures are written to the event log
     */
    private function fetchUrlDirect(string $url): string
    {
        try {
            $response = $this->httpClient->request('GET', $url, [
                'timeout' => 600,
            ]);

            // false: do not throw on 3xx/4xx/5xx, return whatever body was sent.
            return $response->getContent(false);
        } catch (Throwable $e) {
            $this->eventLogManager->log(
                page: 'unknown page',
                entry: 'Fetching ' . $url . ' failed: ' . $e->getMessage()
            );

            return '';
        }
    }

    /**
     * Appends the configured "remote URL append" value to the URL.
     *
     * The configured value is stripped of tags and of every non-word character first. When the
     * URL has no query string yet the value is added after a `?`.
     * NOTE(review): when the URL already has a query string the value is concatenated
     * without a separator - this mirrors the existing behaviour; confirm it is intended.
     */
    private function expandUrl(string $url): string
    {
        $urlAppend = strip_tags((string) $this->configProvider->getValue(ConfigOption::RemoteUrlAppend));
        $urlAppend = (string) preg_replace('/\W/', '', $urlAppend);

        if ($urlAppend === '' || $urlAppend === '0') {
            return $url;
        }

        // strpos() truthiness in the original treated a leading "?" as "no query string".
        if (strpos($url, '?')) {
            return $url . $urlAppend;
        }

        return $url . '?' . $urlAppend;
    }
}
121+

src/Domain/Messaging/Service/TextParser.php renamed to src/Domain/Common/TextParser.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
declare(strict_types=1);
44

5-
namespace PhpList\Core\Domain\Messaging\Service;
5+
namespace PhpList\Core\Domain\Common;
66

77
class TextParser
88
{

src/Domain/Configuration/Model/ConfigOption.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,5 @@ enum ConfigOption: string
2424
case NotifyStartDefault = 'notifystart_default';
2525
case NotifyEndDefault = 'notifyend_default';
2626
case WordWrap = 'wordwrap';
27+
case RemoteUrlAppend = 'remoteurl_append';
2728
}

src/Domain/Configuration/Repository/UrlCacheRepository.php

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,26 @@
77
use PhpList\Core\Domain\Common\Repository\AbstractRepository;
88
use PhpList\Core\Domain\Common\Repository\CursorPaginationTrait;
99
use PhpList\Core\Domain\Common\Repository\Interfaces\PaginatableRepositoryInterface;
10+
use PhpList\Core\Domain\Configuration\Model\UrlCache;
1011

1112
/**
 * Repository for UrlCache rows (cached copies of remote pages).
 */
class UrlCacheRepository extends AbstractRepository implements PaginatableRepositoryInterface
{
    use CursorPaginationTrait;

    /**
     * Finds the newest cache row for a URL that was modified after the given timestamp.
     *
     * The query is limited to one row (newest first) so that several stored copies of the same
     * URL cannot make getOneOrNullResult() fail with a non-unique result.
     *
     * @param string   $url          exact URL to look up
     * @param int|null $lastModified only rows with a lastModified value greater than this are
     *                               considered; null is treated as 0
     */
    public function findByUrlAndLastModified(string $url, ?int $lastModified = 0): ?UrlCache
    {
        return $this->createQueryBuilder('u')
            ->andWhere('u.url = :url')
            ->setParameter('url', $url)
            ->andWhere('u.lastModified > :lastModified')
            ->setParameter('lastModified', $lastModified ?? 0)
            ->orderBy('u.lastModified', 'DESC')
            ->setMaxResults(1)
            ->getQuery()
            ->getOneOrNullResult();
    }

    /**
     * Returns every cache row stored for the given URL.
     *
     * @return UrlCache[]
     */
    public function getByUrl(string $url): array
    {
        return $this->findBy(['url' => $url]);
    }
}

src/Domain/Messaging/MessageHandler/CampaignProcessorMessageHandler.php

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,11 @@ public function __invoke(CampaignProcessorMessage|SyncCampaignProcessorMessage $
7676
}
7777

7878
$messageContent = $this->precacheService->getOrCacheBaseMessageContent($campaign);
79+
if (!$messageContent) {
80+
$this->updateMessageStatus($campaign, MessageStatus::Suspended);
81+
82+
return;
83+
}
7984

8085
$this->updateMessageStatus($campaign, MessageStatus::Prepared);
8186
$subscribers = $this->subscriberProvider->getSubscribersForMessage($campaign);

0 commit comments

Comments
 (0)