Skip to content

Commit dcc4a2a

Browse files
committed
More accurate way to tell a single-white-color image
1 parent ae5951f commit dcc4a2a

File tree

3 files changed

+32
-8
lines changed

3 files changed

+32
-8
lines changed

page_content_extractor/webimage.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -63,14 +63,8 @@ def is_candidate(self):
6363
logger.info('Failed on image bytesize check, size is %s, %s', len(self.raw_data),
6464
self.url)
6565
return False
66-
try:
67-
img = Image.open(io.BytesIO(self.raw_data))
68-
colors = img.getcolors(maxcolors=2)
69-
if colors is not None and len(colors) == 1:
70-
logger.info('Maybe a solid color image(%s), colors=%s', self.url, len(colors))
71-
return False
72-
except Exception as e:
73-
logger.warning('Failed on image colors check, %s, url=%s', e, self.url)
66+
if self.is_predominantly_white_color():
67+
return False
7468
self._is_candidate = True
7569
self.width, self.height = width, height
7670
return True
@@ -123,6 +117,24 @@ def raw_data(self):
123117
def raw_data(self, value):
124118
self._raw_data = value
125119

120+
def is_predominantly_white_color(self, predominance=.99, white_distance=10):
    """Report whether a single near-white colour dominates the image.

    The bytes in ``self.raw_data`` are opened as an image, converted to
    RGB, and the per-colour pixel counts are scanned.  The method returns
    True for the first colour whose share of all pixels is strictly
    greater than ``predominance`` and whose every channel lies between
    ``255 - white_distance`` and 255 inclusive.

    Args:
        predominance: Fraction of the pixels a colour must exceed.
        white_distance: Largest per-channel distance from 255 that still
            counts as white.

    Returns:
        True when such a colour is found; False otherwise, including when
        any exception is raised while opening or inspecting the image
        (the exception is logged as a warning).
    """
    try:
        with Image.open(io.BytesIO(self.raw_data)) as source:
            rgb = source.convert('RGB')
            # Size the colour limit to the pixel count; keep 1024 when
            # either dimension is reported as zero.
            colour_limit = rgb.width * rgb.height if rgb.width and rgb.height else 1024
            histogram = rgb.getcolors(maxcolors=colour_limit)
            pixel_total = sum(entry[0] for entry in histogram)
            for occurrences, rgb_value in histogram:
                if not occurrences / pixel_total > predominance:
                    continue
                if not all(255 - white_distance <= channel <= 255 for channel in rgb_value):
                    continue
                logger.info('Maybe a solid color image(%s), dominant_pct=%f, RGB=%s',
                            self.url, occurrences / pixel_total, rgb_value)
                return True
    except Exception as e:
        # Best-effort check: a failure here must not reject the image.
        logger.warning('Failed on image colors check, %s, url=%s', e, self.url)
    return False
137+
126138
# 'image/svg+xml;charset=utf-8' -> svg
127139
def guess_suffix(self):
128140
if not self.content_type:

test/fixtures/reddit.png

4.74 KB
Loading

test/test_image.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,3 +62,15 @@ def test_webp_compression(self):
6262
img.try_compress()
6363
self.assertEqual('.webp', img.suffix)
6464
self.assertEqual('c1a11593331f7678c7addb3c0001f57f.webp', img.uniq_name())
65+
66+
def test_predominantly_white_color(self):
    """Check ``is_predominantly_white_color`` against the bundled fixtures.

    Each fixture under ``fixtures/`` is loaded into a ``WebImage`` and the
    result is compared with the expected flag for that file.
    """
    # Local import so this method does not depend on the module's import list.
    import json

    fixtures_dir = os.path.join(os.path.dirname(__file__), 'fixtures')
    expectations = (
        ('home.png', False),
        ('reddit.png', True),
        ('medium_Comment_f406ff2a89.png', False),
    )
    for fname, is_white_color in expectations:
        fpath = os.path.join(fixtures_dir, fname)
        # Serialise with json.dumps rather than string formatting so that
        # backslashes or quotes in the path (e.g. on Windows) are escaped
        # and the payload stays valid JSON.
        img = WebImage.from_json_str(json.dumps({'url': fpath}))
        with open(fpath, 'rb') as stream:
            img.raw_data = stream.read()
        # Pass the fixture name as the message so a failure names the file.
        self.assertEqual(is_white_color, img.is_predominantly_white_color(), msg=fname)

0 commit comments

Comments
 (0)