Skip to content

Commit 6c7ca7c

Browse files
committed
- Add modern Google Images parsing (udm=2) and use view_image to render extracted image results, with Chrome UA and forced image endpoint for tbm=isch/udm=2.
- Normalize layouts (image grid width) and inject styling tweaks; remove broken image pagination/next link with TODO left for proper paging.
1 parent ff3a44b commit 6c7ca7c

File tree

4 files changed

+262
-38
lines changed

4 files changed

+262
-38
lines changed

app/filter.py

Lines changed: 175 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
from flask import render_template
66
import html
77
import urllib.parse as urlparse
8-
from urllib.parse import parse_qs
8+
import os
9+
from urllib.parse import parse_qs, urlencode, urlunparse
910
import re
1011

1112
from app.models.g_classes import GClasses
@@ -208,6 +209,9 @@ def clean(self, soup) -> BeautifulSoup:
208209
header = self.soup.find('header')
209210
if header:
210211
header.decompose()
212+
# Remove broken "Dark theme" toggle snippets that occasionally slip
213+
# into the footer.
214+
self.remove_dark_theme_toggle(self.soup)
211215
self.remove_site_blocks(self.soup)
212216
return self.soup
213217

@@ -292,6 +296,22 @@ def add_favicon(self, link) -> None:
292296
if GClasses.result_class_a in p_cls:
293297
break
294298

299+
def remove_dark_theme_toggle(self, soup: BeautifulSoup) -> None:
300+
"""Removes stray Dark theme toggle/link fragments that can appear
301+
in the footer."""
302+
for node in soup.find_all(string=re.compile(r'Dark theme', re.I)):
303+
try:
304+
parent = node.find_parent(
305+
lambda tag: tag.name in ['div', 'span', 'p', 'a', 'li',
306+
'section'])
307+
target = parent or node.parent
308+
if target:
309+
target.decompose()
310+
else:
311+
node.extract()
312+
except Exception:
313+
continue
314+
295315
def remove_site_blocks(self, soup) -> None:
296316
if not self.config.block or not soup.body:
297317
return
@@ -531,10 +551,32 @@ def update_styling(self) -> None:
531551
)
532552
css = f"{css_html_tag}{css}"
533553
css = re.sub('body{(.*?)}',
534-
'body{padding:0 8px;margin:0 auto;max-width:736px;}',
554+
'body{padding:0 12px;margin:0 auto;max-width:1200px;}',
535555
css)
536556
style.string = css
537557

558+
# Normalize the max width between result types so the page doesn't
559+
# jump in size when switching tabs.
560+
if not self.mobile:
561+
max_width_css = (
562+
'body, #cnt, #center_col, .main, .e9EfHf, #searchform, '
563+
'.GyAeWb, .s6JM6d {'
564+
'max-width:1200px;'
565+
'margin:0 auto;'
566+
'padding-left:12px;'
567+
'padding-right:12px;'
568+
'}'
569+
)
570+
# Build the style tag using a fresh soup to avoid cases where the
571+
# current soup lacks the helper methods (e.g., non-root elements).
572+
factory_soup = BeautifulSoup('', 'html.parser')
573+
extra_style = factory_soup.new_tag('style')
574+
extra_style.string = max_width_css
575+
if self.soup.head:
576+
self.soup.head.append(extra_style)
577+
else:
578+
self.soup.insert(0, extra_style)
579+
538580
def update_link(self, link: Tag) -> None:
539581
"""Update internal link paths with encrypted path, otherwise remove
540582
unnecessary redirects and/or marketing params from the url
@@ -738,16 +780,113 @@ def site_alt_swap(self) -> None:
738780
desc_node.replace_with(new_desc)
739781

740782
def view_image(self, soup) -> BeautifulSoup:
741-
"""Replaces the soup with a new one that handles mobile results and
742-
adds the link of the image full res to the results.
783+
"""Parses image results from Google Images and rewrites them into the
784+
lightweight Whoogle image results template.
743785
744-
Args:
745-
soup: A BeautifulSoup object containing the image mobile results.
746-
747-
Returns:
748-
BeautifulSoup: The new BeautifulSoup object
786+
Google now serves image results via the modern udm=2 endpoint, where
787+
the raw HTML contains only placeholder thumbnails. The actual image
788+
URLs live inside serialized data blobs in script tags. We extract that
789+
data and pair it with the visible result cards.
749790
"""
750791

792+
def _decode_url(url: str) -> str:
793+
if not url:
794+
return ''
795+
# Decode common escaped characters found in the script blobs
796+
return html.unescape(
797+
url.replace('\\u003d', '=').replace('\\u0026', '&')
798+
)
799+
800+
def _extract_image_data(modern_soup: BeautifulSoup) -> dict:
801+
"""Extracts docid -> {img_url, img_tbn} from serialized scripts."""
802+
scripts_text = ' '.join(
803+
script.string for script in modern_soup.find_all('script')
804+
if script.string
805+
)
806+
pattern = re.compile(
807+
r'\[0,"(?P<docid>[^"]+)",\["(?P<thumb>https://encrypted-tbn[^"]+)"'
808+
r'(?:,\d+,\d+)?\],\["(?P<full>https?://[^"]+?)"'
809+
r'(?:,\d+,\d+)?\]',
810+
re.DOTALL
811+
)
812+
results_map = {}
813+
for match in pattern.finditer(scripts_text):
814+
docid = match.group('docid')
815+
thumb = _decode_url(match.group('thumb'))
816+
full = _decode_url(match.group('full'))
817+
results_map[docid] = {
818+
'img_tbn': thumb,
819+
'img_url': full
820+
}
821+
return results_map
822+
823+
def _parse_modern_results(modern_soup: BeautifulSoup) -> list:
824+
cards = modern_soup.find_all(
825+
'div',
826+
attrs={
827+
'data-attrid': 'images universal',
828+
'data-docid': True
829+
}
830+
)
831+
if not cards:
832+
return []
833+
834+
meta_map = _extract_image_data(modern_soup)
835+
parsed = []
836+
seen = set()
837+
838+
for card in cards:
839+
docid = card.get('data-docid')
840+
meta = meta_map.get(docid, {})
841+
img_url = meta.get('img_url')
842+
img_tbn = meta.get('img_tbn')
843+
844+
# Fall back to the inline src if we failed to map the docid
845+
if not img_tbn:
846+
img_tag = card.find('img')
847+
if img_tag:
848+
candidate_src = img_tag.get('src')
849+
if candidate_src and candidate_src.startswith('http'):
850+
img_tbn = candidate_src
851+
852+
web_page = card.get('data-lpage') or ''
853+
if not web_page:
854+
link = card.find('a', href=True)
855+
if link:
856+
web_page = link['href']
857+
858+
key = (img_url, img_tbn, web_page)
859+
if not any(key) or key in seen:
860+
continue
861+
seen.add(key)
862+
863+
parsed.append({
864+
'domain': urlparse.urlparse(web_page).netloc
865+
if web_page else '',
866+
'img_url': img_url or img_tbn or '',
867+
'web_page': web_page,
868+
'img_tbn': img_tbn or img_url or ''
869+
})
870+
return parsed
871+
872+
# Try parsing the modern (udm=2) layout first
873+
modern_results = _parse_modern_results(soup)
874+
if modern_results:
875+
# TODO: Implement proper image pagination. Google images uses
876+
# infinite scroll with `ijn` offsets; we need a clean,
877+
# de-duplicated pagination strategy before exposing a Next link.
878+
next_link = None
879+
return BeautifulSoup(
880+
render_template(
881+
'imageresults.html',
882+
length=len(modern_results),
883+
results=modern_results,
884+
view_label="View Image",
885+
next_link=next_link
886+
),
887+
features='html.parser'
888+
)
889+
751890
# get some tags that are unchanged between mobile and pc versions
752891
cor_suggested = soup.find_all('table', attrs={'class': "By0U9"})
753892
next_pages = soup.find('table', attrs={'class': "uZgmoc"})
@@ -761,7 +900,11 @@ def view_image(self, soup) -> BeautifulSoup:
761900
results_all = results_div.find_all('div', attrs={'class': "lIMUZd"})
762901

763902
for item in results_all:
764-
urls = item.find('a')['href'].split('&imgrefurl=')
903+
link = item.find('a', href=True)
904+
if not link:
905+
continue
906+
907+
urls = link['href'].split('&imgrefurl=')
765908

766909
# Skip urls that are not two-element lists
767910
if len(urls) != 2:
@@ -776,7 +919,16 @@ def view_image(self, soup) -> BeautifulSoup:
776919
except IndexError:
777920
web_page = urlparse.unquote(urls[1])
778921

779-
img_tbn = urlparse.unquote(item.find('a').find('img')['src'])
922+
img_tag = link.find('img')
923+
if not img_tag:
924+
continue
925+
926+
img_tbn = urlparse.unquote(
927+
img_tag.get('src') or img_tag.get('data-src', '')
928+
)
929+
930+
if not img_tbn:
931+
continue
780932

781933
results.append({
782934
'domain': urlparse.urlparse(web_page).netloc,
@@ -793,11 +945,18 @@ def view_image(self, soup) -> BeautifulSoup:
793945

794946
# replace correction suggested by google object if exists
795947
if len(cor_suggested):
796-
soup.find_all(
948+
suggested_tables = soup.find_all(
797949
'table',
798950
attrs={'class': "By0U9"}
799-
)[0].replaceWith(cor_suggested[0])
800-
# replace next page object at the bottom of the page
801-
soup.find_all('table',
802-
attrs={'class': "uZgmoc"})[0].replaceWith(next_pages)
951+
)
952+
if suggested_tables:
953+
suggested_tables[0].replaceWith(cor_suggested[0])
954+
955+
# replace next page object at the bottom of the page, when present
956+
next_page_tables = soup.find_all('table', attrs={'class': "uZgmoc"})
957+
if next_pages and next_page_tables:
958+
next_page_tables[0].replaceWith(next_pages)
959+
960+
# TODO: Reintroduce pagination for legacy image layout if needed.
961+
803962
return soup

app/request.py

Lines changed: 52 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,10 @@ def gen_query(query, args, config) -> str:
147147
# Pass along type of results (news, images, books, etc)
148148
if 'tbm' in args:
149149
param_dict['tbm'] = '&tbm=' + args.get('tbm')
150+
# Google Images now expects the modern udm=2 layout; force it when
151+
# requesting images to avoid redirects to the new AI/text layout.
152+
if args.get('tbm') == 'isch' and 'udm' not in args:
153+
param_dict['udm'] = '&udm=2'
150154

151155
# Get results page start value (10 per page, ie page 2 start val = 20)
152156
if 'start' in args:
@@ -212,8 +216,18 @@ class Request:
212216
"""
213217

214218
def __init__(self, normal_ua, root_path, config: Config, http_client=None):
215-
self.search_url = 'https://www.google.com/search?gbv=1&num=' + str(
216-
os.getenv('WHOOGLE_RESULTS_PER_PAGE', 10)) + '&q='
219+
results_per_page = str(os.getenv('WHOOGLE_RESULTS_PER_PAGE', 10))
220+
self.search_url = (
221+
'https://www.google.com/search?gbv=1&num='
222+
f'{results_per_page}&q='
223+
)
224+
# Google Images rejects the lightweight gbv=1 interface. Use the
225+
# modern udm=2 entrypoint specifically for image searches to avoid the
226+
# "update your browser" interstitial.
227+
self.image_search_url = (
228+
'https://www.google.com/search?udm=2&num='
229+
f'{results_per_page}&q='
230+
)
217231
# Optionally send heartbeat to Tor to determine availability
218232
# Only when Tor is enabled in config to avoid unnecessary socket usage
219233
if config.tor:
@@ -235,6 +249,13 @@ def __init__(self, normal_ua, root_path, config: Config, http_client=None):
235249
if not self.mobile:
236250
self.modified_user_agent_mobile = gen_user_agent(config, True)
237251

252+
# Dedicated modern UA to use when Google rejects legacy ones (e.g. Images)
253+
self.image_user_agent = (
254+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
255+
'AppleWebKit/537.36 (KHTML, like Gecko) '
256+
'Chrome/127.0.0.0 Safari/537.36'
257+
)
258+
238259
# Set up proxy configuration
239260
proxy_path = os.environ.get('WHOOGLE_PROXY_LOC', '')
240261
if proxy_path:
@@ -332,6 +353,13 @@ def send(self, base_url='', query='', attempt=0,
332353
else:
333354
modified_user_agent = self.modified_user_agent
334355

356+
# Some Google endpoints (notably Images) now refuse legacy user agents.
357+
# If an image search is detected and the generated UA isn't Chromium-
358+
# like, retry with a modern Chrome string to avoid the "update your
359+
# browser" interstitial.
360+
if (('tbm=isch' in query) or ('udm=2' in query)) and 'Chrome' not in modified_user_agent:
361+
modified_user_agent = self.image_user_agent
362+
335363
headers = {
336364
'User-Agent': modified_user_agent,
337365
'Accept': ('text/html,application/xhtml+xml,application/xml;'
@@ -345,16 +373,23 @@ def send(self, base_url='', query='', attempt=0,
345373
'Sec-Fetch-Site': 'none',
346374
'Sec-Fetch-Mode': 'navigate',
347375
'Sec-Fetch-User': '?1',
348-
'Sec-Fetch-Dest': 'document',
349-
'Sec-CH-UA': (
350-
'"Not/A)Brand";v="8", '
351-
'"Chromium";v="127", '
352-
'"Google Chrome";v="127"'
353-
),
354-
'Sec-CH-UA-Mobile': '?0',
355-
'Sec-CH-UA-Platform': '"macOS"'
376+
'Sec-Fetch-Dest': 'document'
356377
}
357-
378+
# Only attach client hints when using a Chromium-like user agent to
379+
# avoid sending conflicting information that can trigger unsupported
380+
# browser pages.
381+
if 'Chrome' in headers['User-Agent']:
382+
headers.update({
383+
'Sec-CH-UA': (
384+
'"Not/A)Brand";v="8", '
385+
'"Chromium";v="127", '
386+
'"Google Chrome";v="127"'
387+
),
388+
'Sec-CH-UA-Mobile': '?0',
389+
'Sec-CH-UA-Platform': '"Windows"'
390+
})
391+
392+
358393
# Add Accept-Language header tied to the current config if requested
359394
if self.lang_interface:
360395
headers['Accept-Language'] = (
@@ -393,9 +428,13 @@ def send(self, base_url='', query='', attempt=0,
393428
"Error raised during Tor connection validation",
394429
disable=True)
395430

431+
search_base = base_url or self.search_url
432+
if not base_url and ('tbm=isch' in query or 'udm=2' in query):
433+
search_base = self.image_search_url
434+
396435
try:
397436
response = self.http_client.get(
398-
(base_url or self.search_url) + query,
437+
search_base + query,
399438
headers=headers,
400439
cookies=consent_cookies)
401440
except httpx.HTTPError as e:
@@ -406,6 +445,6 @@ def send(self, base_url='', query='', attempt=0,
406445
attempt += 1
407446
if attempt > 10:
408447
raise TorError("Tor query failed -- max attempts exceeded 10")
409-
return self.send((base_url or self.search_url), query, attempt)
448+
return self.send(search_base, query, attempt)
410449

411450
return response

0 commit comments

Comments (0)