Skip to content

Commit 6c7ca7c

Browse files
committed
- Add modern Google Images parsing (udm=2) and use view_image to render extracted image results, with Chrome UA and forced image endpoint for tbm=isch/udm=2.
- Normalize layouts (image grid width) and inject styling tweaks; remove broken image pagination/next link with TODO left for proper paging.
1 parent ff3a44b commit 6c7ca7c

File tree

4 files changed

+262
-38
lines changed

4 files changed

+262
-38
lines changed

app/filter.py

Lines changed: 175 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
from flask import render_template
66
import html
77
import urllib.parse as urlparse
8-
from urllib.parse import parse_qs
8+
import os
9+
from urllib.parse import parse_qs, urlencode, urlunparse
910
import re
1011

1112
from app.models.g_classes import GClasses
@@ -208,6 +209,9 @@ def clean(self, soup) -> BeautifulSoup:
208209
header = self.soup.find('header')
209210
if header:
210211
header.decompose()
212+
# Remove broken "Dark theme" toggle snippets that occasionally slip
213+
# into the footer.
214+
self.remove_dark_theme_toggle(self.soup)
211215
self.remove_site_blocks(self.soup)
212216
return self.soup
213217

@@ -292,6 +296,22 @@ def add_favicon(self, link) -> None:
292296
if GClasses.result_class_a in p_cls:
293297
break
294298

299+
def remove_dark_theme_toggle(self, soup: BeautifulSoup) -> None:
300+
"""Removes stray Dark theme toggle/link fragments that can appear
301+
in the footer."""
302+
for node in soup.find_all(string=re.compile(r'Dark theme', re.I)):
303+
try:
304+
parent = node.find_parent(
305+
lambda tag: tag.name in ['div', 'span', 'p', 'a', 'li',
306+
'section'])
307+
target = parent or node.parent
308+
if target:
309+
target.decompose()
310+
else:
311+
node.extract()
312+
except Exception:
313+
continue
314+
295315
def remove_site_blocks(self, soup) -> None:
296316
if not self.config.block or not soup.body:
297317
return
@@ -531,10 +551,32 @@ def update_styling(self) -> None:
531551
)
532552
css = f"{css_html_tag}{css}"
533553
css = re.sub('body{(.*?)}',
534-
'body{padding:0 8px;margin:0 auto;max-width:736px;}',
554+
'body{padding:0 12px;margin:0 auto;max-width:1200px;}',
535555
css)
536556
style.string = css
537557

558+
# Normalize the max width between result types so the page doesn't
559+
# jump in size when switching tabs.
560+
if not self.mobile:
561+
max_width_css = (
562+
'body, #cnt, #center_col, .main, .e9EfHf, #searchform, '
563+
'.GyAeWb, .s6JM6d {'
564+
'max-width:1200px;'
565+
'margin:0 auto;'
566+
'padding-left:12px;'
567+
'padding-right:12px;'
568+
'}'
569+
)
570+
# Build the style tag using a fresh soup to avoid cases where the
571+
# current soup lacks the helper methods (e.g., non-root elements).
572+
factory_soup = BeautifulSoup('', 'html.parser')
573+
extra_style = factory_soup.new_tag('style')
574+
extra_style.string = max_width_css
575+
if self.soup.head:
576+
self.soup.head.append(extra_style)
577+
else:
578+
self.soup.insert(0, extra_style)
579+
538580
def update_link(self, link: Tag) -> None:
539581
"""Update internal link paths with encrypted path, otherwise remove
540582
unnecessary redirects and/or marketing params from the url
@@ -738,16 +780,113 @@ def site_alt_swap(self) -> None:
738780
desc_node.replace_with(new_desc)
739781

740782
def view_image(self, soup) -> BeautifulSoup:
741-
"""Replaces the soup with a new one that handles mobile results and
742-
adds the link of the image full res to the results.
783+
"""Parses image results from Google Images and rewrites them into the
784+
lightweight Whoogle image results template.
743785
744-
Args:
745-
soup: A BeautifulSoup object containing the image mobile results.
746-
747-
Returns:
748-
BeautifulSoup: The new BeautifulSoup object
786+
Google now serves image results via the modern udm=2 endpoint, where
787+
the raw HTML contains only placeholder thumbnails. The actual image
788+
URLs live inside serialized data blobs in script tags. We extract that
789+
data and pair it with the visible result cards.
749790
"""
750791

792+
def _decode_url(url: str) -> str:
793+
if not url:
794+
return ''
795+
# Decode common escaped characters found in the script blobs
796+
return html.unescape(
797+
url.replace('\\u003d', '=').replace('\\u0026', '&')
798+
)
799+
800+
def _extract_image_data(modern_soup: BeautifulSoup) -> dict:
801+
"""Extracts docid -> {img_url, img_tbn} from serialized scripts."""
802+
scripts_text = ' '.join(
803+
script.string for script in modern_soup.find_all('script')
804+
if script.string
805+
)
806+
pattern = re.compile(
807+
r'\[0,"(?P<docid>[^"]+)",\["(?P<thumb>https://encrypted-tbn[^"]+)"'
808+
r'(?:,\d+,\d+)?\],\["(?P<full>https?://[^"]+?)"'
809+
r'(?:,\d+,\d+)?\]',
810+
re.DOTALL
811+
)
812+
results_map = {}
813+
for match in pattern.finditer(scripts_text):
814+
docid = match.group('docid')
815+
thumb = _decode_url(match.group('thumb'))
816+
full = _decode_url(match.group('full'))
817+
results_map[docid] = {
818+
'img_tbn': thumb,
819+
'img_url': full
820+
}
821+
return results_map
822+
823+
def _parse_modern_results(modern_soup: BeautifulSoup) -> list:
824+
cards = modern_soup.find_all(
825+
'div',
826+
attrs={
827+
'data-attrid': 'images universal',
828+
'data-docid': True
829+
}
830+
)
831+
if not cards:
832+
return []
833+
834+
meta_map = _extract_image_data(modern_soup)
835+
parsed = []
836+
seen = set()
837+
838+
for card in cards:
839+
docid = card.get('data-docid')
840+
meta = meta_map.get(docid, {})
841+
img_url = meta.get('img_url')
842+
img_tbn = meta.get('img_tbn')
843+
844+
# Fall back to the inline src if we failed to map the docid
845+
if not img_tbn:
846+
img_tag = card.find('img')
847+
if img_tag:
848+
candidate_src = img_tag.get('src')
849+
if candidate_src and candidate_src.startswith('http'):
850+
img_tbn = candidate_src
851+
852+
web_page = card.get('data-lpage') or ''
853+
if not web_page:
854+
link = card.find('a', href=True)
855+
if link:
856+
web_page = link['href']
857+
858+
key = (img_url, img_tbn, web_page)
859+
if not any(key) or key in seen:
860+
continue
861+
seen.add(key)
862+
863+
parsed.append({
864+
'domain': urlparse.urlparse(web_page).netloc
865+
if web_page else '',
866+
'img_url': img_url or img_tbn or '',
867+
'web_page': web_page,
868+
'img_tbn': img_tbn or img_url or ''
869+
})
870+
return parsed
871+
872+
# Try parsing the modern (udm=2) layout first
873+
modern_results = _parse_modern_results(soup)
874+
if modern_results:
875+
# TODO: Implement proper image pagination. Google images uses
876+
# infinite scroll with `ijn` offsets; we need a clean,
877+
# de-duplicated pagination strategy before exposing a Next link.
878+
next_link = None
879+
return BeautifulSoup(
880+
render_template(
881+
'imageresults.html',
882+
length=len(modern_results),
883+
results=modern_results,
884+
view_label="View Image",
885+
next_link=next_link
886+
),
887+
features='html.parser'
888+
)
889+
751890
# get some tags that are unchanged between mobile and pc versions
752891
cor_suggested = soup.find_all('table', attrs={'class': "By0U9"})
753892
next_pages = soup.find('table', attrs={'class': "uZgmoc"})
@@ -761,7 +900,11 @@ def view_image(self, soup) -> BeautifulSoup:
761900
results_all = results_div.find_all('div', attrs={'class': "lIMUZd"})
762901

763902
for item in results_all:
764-
urls = item.find('a')['href'].split('&imgrefurl=')
903+
link = item.find('a', href=True)
904+
if not link:
905+
continue
906+
907+
urls = link['href'].split('&imgrefurl=')
765908

766909
# Skip urls that are not two-element lists
767910
if len(urls) != 2:
@@ -776,7 +919,16 @@ def view_image(self, soup) -> BeautifulSoup:
776919
except IndexError:
777920
web_page = urlparse.unquote(urls[1])
778921

779-
img_tbn = urlparse.unquote(item.find('a').find('img')['src'])
922+
img_tag = link.find('img')
923+
if not img_tag:
924+
continue
925+
926+
img_tbn = urlparse.unquote(
927+
img_tag.get('src') or img_tag.get('data-src', '')
928+
)
929+
930+
if not img_tbn:
931+
continue
780932

781933
results.append({
782934
'domain': urlparse.urlparse(web_page).netloc,
@@ -793,11 +945,18 @@ def view_image(self, soup) -> BeautifulSoup:
793945

794946
# replace correction suggested by google object if exists
795947
if len(cor_suggested):
796-
soup.find_all(
948+
suggested_tables = soup.find_all(
797949
'table',
798950
attrs={'class': "By0U9"}
799-
)[0].replaceWith(cor_suggested[0])
800-
# replace next page object at the bottom of the page
801-
soup.find_all('table',
802-
attrs={'class': "uZgmoc"})[0].replaceWith(next_pages)
951+
)
952+
if suggested_tables:
953+
suggested_tables[0].replaceWith(cor_suggested[0])
954+
955+
# replace next page object at the bottom of the page, when present
956+
next_page_tables = soup.find_all('table', attrs={'class': "uZgmoc"})
957+
if next_pages and next_page_tables:
958+
next_page_tables[0].replaceWith(next_pages)
959+
960+
# TODO: Reintroduce pagination for legacy image layout if needed.
961+
803962
return soup

app/request.py

Lines changed: 52 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,10 @@ def gen_query(query, args, config) -> str:
147147
# Pass along type of results (news, images, books, etc)
148148
if 'tbm' in args:
149149
param_dict['tbm'] = '&tbm=' + args.get('tbm')
150+
# Google Images now expects the modern udm=2 layout; force it when
151+
# requesting images to avoid redirects to the new AI/text layout.
152+
if args.get('tbm') == 'isch' and 'udm' not in args:
153+
param_dict['udm'] = '&udm=2'
150154

151155
# Get results page start value (10 per page, ie page 2 start val = 20)
152156
if 'start' in args:
@@ -212,8 +216,18 @@ class Request:
212216
"""
213217

214218
def __init__(self, normal_ua, root_path, config: Config, http_client=None):
215-
self.search_url = 'https://www.google.com/search?gbv=1&num=' + str(
216-
os.getenv('WHOOGLE_RESULTS_PER_PAGE', 10)) + '&q='
219+
results_per_page = str(os.getenv('WHOOGLE_RESULTS_PER_PAGE', 10))
220+
self.search_url = (
221+
'https://www.google.com/search?gbv=1&num='
222+
f'{results_per_page}&q='
223+
)
224+
# Google Images rejects the lightweight gbv=1 interface. Use the
225+
# modern udm=2 entrypoint specifically for image searches to avoid the
226+
# "update your browser" interstitial.
227+
self.image_search_url = (
228+
'https://www.google.com/search?udm=2&num='
229+
f'{results_per_page}&q='
230+
)
217231
# Optionally send heartbeat to Tor to determine availability
218232
# Only when Tor is enabled in config to avoid unnecessary socket usage
219233
if config.tor:
@@ -235,6 +249,13 @@ def __init__(self, normal_ua, root_path, config: Config, http_client=None):
235249
if not self.mobile:
236250
self.modified_user_agent_mobile = gen_user_agent(config, True)
237251

252+
# Dedicated modern UA to use when Google rejects legacy ones (e.g. Images)
253+
self.image_user_agent = (
254+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
255+
'AppleWebKit/537.36 (KHTML, like Gecko) '
256+
'Chrome/127.0.0.0 Safari/537.36'
257+
)
258+
238259
# Set up proxy configuration
239260
proxy_path = os.environ.get('WHOOGLE_PROXY_LOC', '')
240261
if proxy_path:
@@ -332,6 +353,13 @@ def send(self, base_url='', query='', attempt=0,
332353
else:
333354
modified_user_agent = self.modified_user_agent
334355

356+
# Some Google endpoints (notably Images) now refuse legacy user agents.
357+
# If an image search is detected and the generated UA isn't Chromium-
358+
# like, retry with a modern Chrome string to avoid the "update your
359+
# browser" interstitial.
360+
if (('tbm=isch' in query) or ('udm=2' in query)) and 'Chrome' not in modified_user_agent:
361+
modified_user_agent = self.image_user_agent
362+
335363
headers = {
336364
'User-Agent': modified_user_agent,
337365
'Accept': ('text/html,application/xhtml+xml,application/xml;'
@@ -345,16 +373,23 @@ def send(self, base_url='', query='', attempt=0,
345373
'Sec-Fetch-Site': 'none',
346374
'Sec-Fetch-Mode': 'navigate',
347375
'Sec-Fetch-User': '?1',
348-
'Sec-Fetch-Dest': 'document',
349-
'Sec-CH-UA': (
350-
'"Not/A)Brand";v="8", '
351-
'"Chromium";v="127", '
352-
'"Google Chrome";v="127"'
353-
),
354-
'Sec-CH-UA-Mobile': '?0',
355-
'Sec-CH-UA-Platform': '"macOS"'
376+
'Sec-Fetch-Dest': 'document'
356377
}
357-
378+
# Only attach client hints when using a Chromium-like user agent to
379+
# avoid sending conflicting information that can trigger unsupported
380+
# browser pages.
381+
if 'Chrome' in headers['User-Agent']:
382+
headers.update({
383+
'Sec-CH-UA': (
384+
'"Not/A)Brand";v="8", '
385+
'"Chromium";v="127", '
386+
'"Google Chrome";v="127"'
387+
),
388+
'Sec-CH-UA-Mobile': '?0',
389+
'Sec-CH-UA-Platform': '"Windows"'
390+
})
391+
392+
358393
# Add Accept-Language header tied to the current config if requested
359394
if self.lang_interface:
360395
headers['Accept-Language'] = (
@@ -393,9 +428,13 @@ def send(self, base_url='', query='', attempt=0,
393428
"Error raised during Tor connection validation",
394429
disable=True)
395430

431+
search_base = base_url or self.search_url
432+
if not base_url and ('tbm=isch' in query or 'udm=2' in query):
433+
search_base = self.image_search_url
434+
396435
try:
397436
response = self.http_client.get(
398-
(base_url or self.search_url) + query,
437+
search_base + query,
399438
headers=headers,
400439
cookies=consent_cookies)
401440
except httpx.HTTPError as e:
@@ -406,6 +445,6 @@ def send(self, base_url='', query='', attempt=0,
406445
attempt += 1
407446
if attempt > 10:
408447
raise TorError("Tor query failed -- max attempts exceeded 10")
409-
return self.send((base_url or self.search_url), query, attempt)
448+
return self.send(search_base, query, attempt)
410449

411450
return response

0 commit comments

Comments (0)