Skip to content

Commit 2712226

Browse files
committed
Merge pull request #61 from cameron1729/more_restrictive_regex
More restrictive regex for tags
2 parents 6ade23e + 0a5ae7e commit 2712226

File tree

2 files changed: 22 additions and 18 deletions.

2 files changed

+22
-18
lines changed

src/Html2Text.php

Lines changed: 18 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -49,25 +49,25 @@ class Html2Text
4949
protected $search = array(
5050
"/\r/", // Non-legal carriage return
5151
"/[\n\t]+/", // Newlines and tabs
52-
'/<head[^>]*>.*?<\/head>/i', // <head>
53-
'/<script[^>]*>.*?<\/script>/i', // <script>s -- which strip_tags supposedly has problems with
54-
'/<style[^>]*>.*?<\/style>/i', // <style>s -- which strip_tags supposedly has problems with
55-
'/<i[^>]*>(.*?)<\/i>/i', // <i>
56-
'/<em[^>]*>(.*?)<\/em>/i', // <em>
57-
'/(<ul[^>]*>|<\/ul>)/i', // <ul> and </ul>
58-
'/(<ol[^>]*>|<\/ol>)/i', // <ol> and </ol>
59-
'/(<dl[^>]*>|<\/dl>)/i', // <dl> and </dl>
60-
'/<li[^>]*>(.*?)<\/li>/i', // <li> and </li>
61-
'/<dd[^>]*>(.*?)<\/dd>/i', // <dd> and </dd>
62-
'/<dt[^>]*>(.*?)<\/dt>/i', // <dt> and </dt>
63-
'/<li[^>]*>/i', // <li>
64-
'/<hr[^>]*>/i', // <hr>
65-
'/<div[^>]*>/i', // <div>
66-
'/(<table[^>]*>|<\/table>)/i', // <table> and </table>
67-
'/(<tr[^>]*>|<\/tr>)/i', // <tr> and </tr>
68-
'/<td[^>]*>(.*?)<\/td>/i', // <td> and </td>
52+
'/<head\b[^>]*>.*?<\/head>/i', // <head>
53+
'/<script\b[^>]*>.*?<\/script>/i', // <script>s -- which strip_tags supposedly has problems with
54+
'/<style\b[^>]*>.*?<\/style>/i', // <style>s -- which strip_tags supposedly has problems with
55+
'/<i\b[^>]*>(.*?)<\/i>/i', // <i>
56+
'/<em\b[^>]*>(.*?)<\/em>/i', // <em>
57+
'/(<ul\b[^>]*>|<\/ul>)/i', // <ul> and </ul>
58+
'/(<ol\b[^>]*>|<\/ol>)/i', // <ol> and </ol>
59+
'/(<dl\b[^>]*>|<\/dl>)/i', // <dl> and </dl>
60+
'/<li\b[^>]*>(.*?)<\/li>/i', // <li> and </li>
61+
'/<dd\b[^>]*>(.*?)<\/dd>/i', // <dd> and </dd>
62+
'/<dt\b[^>]*>(.*?)<\/dt>/i', // <dt> and </dt>
63+
'/<li\b[^>]*>/i', // <li>
64+
'/<hr\b[^>]*>/i', // <hr>
65+
'/<div\b[^>]*>/i', // <div>
66+
'/(<table\b[^>]*>|<\/table>)/i', // <table> and </table>
67+
'/(<tr\b[^>]*>|<\/tr>)/i', // <tr> and </tr>
68+
'/<td\b[^>]*>(.*?)<\/td>/i', // <td> and </td>
6969
'/<span class="_html2text_ignore">.+?<\/span>/i', // <span class="_html2text_ignore">...</span>
70-
'/<(img)[^>]*alt=\"([^>"]+)\"[^>]*>/i', // <img> with alt tag
70+
'/<(img)\b[^>]*alt=\"([^>"]+)\"[^>]*>/i', // <img> with alt tag
7171
);
7272

7373
/**

test/ImageTest.php

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,10 @@ public function testImageDataProvider() {
2626
'html' => 'xx<img src="http://example.com/example.jpg" alt="An example image">xx',
2727
'expected' => 'xx[An example image]xx',
2828
),
29+
'With italics' => array(
30+
'html' => '<img src="shrek.jpg" alt="the ogrelord" /> Blah <i>blah</i> blah',
31+
'expected' => '[the ogrelord] Blah _blah_ blah'
32+
)
2933
);
3034
}
3135

0 commit comments

Comments
 (0)