55from flask import render_template
66import html
77import urllib .parse as urlparse
8- from urllib .parse import parse_qs
8+ import os
9+ from urllib .parse import parse_qs , urlencode , urlunparse
910import re
1011
1112from app .models .g_classes import GClasses
@@ -208,6 +209,9 @@ def clean(self, soup) -> BeautifulSoup:
208209 header = self .soup .find ('header' )
209210 if header :
210211 header .decompose ()
212+ # Remove broken "Dark theme" toggle snippets that occasionally slip
213+ # into the footer.
214+ self .remove_dark_theme_toggle (self .soup )
211215 self .remove_site_blocks (self .soup )
212216 return self .soup
213217
@@ -292,6 +296,22 @@ def add_favicon(self, link) -> None:
292296 if GClasses .result_class_a in p_cls :
293297 break
294298
def remove_dark_theme_toggle(self, soup: BeautifulSoup) -> None:
    """Remove stray "Dark theme" toggle/link fragments that can appear
    in the footer.

    Args:
        soup: The page soup, scrubbed in place.
    """
    # Hoisted out of the loop; a frozenset also makes membership O(1).
    container_tags = frozenset(('div', 'span', 'p', 'a', 'li', 'section'))
    for node in soup.find_all(string=re.compile(r'Dark theme', re.I)):
        # A previous iteration may have decomposed an ancestor of this
        # node; decomposed nodes lose their parent, so skip them rather
        # than walking a destroyed tree (and double-decomposing).
        if getattr(node, 'parent', None) is None:
            continue
        try:
            parent = node.find_parent(
                lambda tag: tag.name in container_tags)
            target = parent or node.parent
            if target:
                target.decompose()
            else:
                node.extract()
        except Exception:
            # Deliberate best-effort cleanup: a malformed fragment must
            # never break result filtering, so swallow and move on.
            continue
295315 def remove_site_blocks (self , soup ) -> None :
296316 if not self .config .block or not soup .body :
297317 return
@@ -531,10 +551,32 @@ def update_styling(self) -> None:
531551 )
532552 css = f"{ css_html_tag } { css } "
533553 css = re .sub ('body{(.*?)}' ,
534- 'body{padding:0 8px ;margin:0 auto;max-width:736px ;}' ,
554+ 'body{padding:0 12px ;margin:0 auto;max-width:1200px ;}' ,
535555 css )
536556 style .string = css
537557
558+ # Normalize the max width between result types so the page doesn't
559+ # jump in size when switching tabs.
560+ if not self .mobile :
561+ max_width_css = (
562+ 'body, #cnt, #center_col, .main, .e9EfHf, #searchform, '
563+ '.GyAeWb, .s6JM6d {'
564+ 'max-width:1200px;'
565+ 'margin:0 auto;'
566+ 'padding-left:12px;'
567+ 'padding-right:12px;'
568+ '}'
569+ )
570+ # Build the style tag using a fresh soup to avoid cases where the
571+ # current soup lacks the helper methods (e.g., non-root elements).
572+ factory_soup = BeautifulSoup ('' , 'html.parser' )
573+ extra_style = factory_soup .new_tag ('style' )
574+ extra_style .string = max_width_css
575+ if self .soup .head :
576+ self .soup .head .append (extra_style )
577+ else :
578+ self .soup .insert (0 , extra_style )
579+
538580 def update_link (self , link : Tag ) -> None :
539581 """Update internal link paths with encrypted path, otherwise remove
540582 unnecessary redirects and/or marketing params from the url
@@ -738,16 +780,113 @@ def site_alt_swap(self) -> None:
738780 desc_node .replace_with (new_desc )
739781
740782 def view_image (self , soup ) -> BeautifulSoup :
741- """Replaces the soup with a new one that handles mobile results and
742- adds the link of the image full res to the results.
783+ """Parses image results from Google Images and rewrites them into the
784+ lightweight Whoogle image results template .
743785
744- Args:
745- soup: A BeautifulSoup object containing the image mobile results.
746-
747- Returns:
748- BeautifulSoup: The new BeautifulSoup object
786+ Google now serves image results via the modern udm=2 endpoint, where
787+ the raw HTML contains only placeholder thumbnails. The actual image
788+ URLs live inside serialized data blobs in script tags. We extract that
789+ data and pair it with the visible result cards.
749790 """
750791
792+ def _decode_url (url : str ) -> str :
793+ if not url :
794+ return ''
795+ # Decode common escaped characters found in the script blobs
796+ return html .unescape (
797+ url .replace ('\\ u003d' , '=' ).replace ('\\ u0026' , '&' )
798+ )
799+
def _extract_image_data(modern_soup: BeautifulSoup) -> dict:
    """Build a docid -> {img_tbn, img_url} mapping from the serialized
    image metadata embedded in the page's script tags."""
    blob = ' '.join(
        s.string for s in modern_soup.find_all('script')
        if s.string
    )
    entry_re = re.compile(
        r'\[0,"(?P<docid>[^"]+)",\["(?P<thumb>https://encrypted-tbn[^"]+)"'
        r'(?:,\d+,\d+)?\],\["(?P<full>https?://[^"]+?)"'
        r'(?:,\d+,\d+)?\]',
        re.DOTALL
    )
    # Later matches for the same docid overwrite earlier ones, matching
    # the behavior of building the dict entry-by-entry.
    return {
        m.group('docid'): {
            'img_tbn': _decode_url(m.group('thumb')),
            'img_url': _decode_url(m.group('full'))
        }
        for m in entry_re.finditer(blob)
    }
822+
def _parse_modern_results(modern_soup: BeautifulSoup) -> list:
    """Pair each visible result card with the image metadata pulled from
    the serialized script blobs, de-duplicating along the way."""
    result_cards = modern_soup.find_all(
        'div',
        attrs={
            'data-attrid': 'images universal',
            'data-docid': True
        }
    )
    if not result_cards:
        return []

    metadata = _extract_image_data(modern_soup)
    collected = []
    already_seen = set()

    for result_card in result_cards:
        info = metadata.get(result_card.get('data-docid'), {})
        full_url = info.get('img_url')
        thumb_url = info.get('img_tbn')

        # No mapped thumbnail: fall back to the card's inline <img> src,
        # but only when it is a real http(s) URL (not a placeholder).
        if not thumb_url:
            inline_img = result_card.find('img')
            if inline_img:
                inline_src = inline_img.get('src')
                if inline_src and inline_src.startswith('http'):
                    thumb_url = inline_src

        page_url = result_card.get('data-lpage') or ''
        if not page_url:
            anchor = result_card.find('a', href=True)
            if anchor:
                page_url = anchor['href']

        # Skip fully-empty results and exact duplicates.
        dedupe_key = (full_url, thumb_url, page_url)
        if not any(dedupe_key) or dedupe_key in already_seen:
            continue
        already_seen.add(dedupe_key)

        collected.append({
            'domain': urlparse.urlparse(page_url).netloc
            if page_url else '',
            'img_url': full_url or thumb_url or '',
            'web_page': page_url,
            'img_tbn': thumb_url or full_url or ''
        })
    return collected
871+
872+ # Try parsing the modern (udm=2) layout first
873+ modern_results = _parse_modern_results (soup )
874+ if modern_results :
875+ # TODO: Implement proper image pagination. Google images uses
876+ # infinite scroll with `ijn` offsets; we need a clean,
877+ # de-duplicated pagination strategy before exposing a Next link.
878+ next_link = None
879+ return BeautifulSoup (
880+ render_template (
881+ 'imageresults.html' ,
882+ length = len (modern_results ),
883+ results = modern_results ,
884+ view_label = "View Image" ,
885+ next_link = next_link
886+ ),
887+ features = 'html.parser'
888+ )
889+
751890 # get some tags that are unchanged between mobile and pc versions
752891 cor_suggested = soup .find_all ('table' , attrs = {'class' : "By0U9" })
753892 next_pages = soup .find ('table' , attrs = {'class' : "uZgmoc" })
@@ -761,7 +900,11 @@ def view_image(self, soup) -> BeautifulSoup:
761900 results_all = results_div .find_all ('div' , attrs = {'class' : "lIMUZd" })
762901
763902 for item in results_all :
764- urls = item .find ('a' )['href' ].split ('&imgrefurl=' )
903+ link = item .find ('a' , href = True )
904+ if not link :
905+ continue
906+
907+ urls = link ['href' ].split ('&imgrefurl=' )
765908
766909 # Skip urls that are not two-element lists
767910 if len (urls ) != 2 :
@@ -776,7 +919,16 @@ def view_image(self, soup) -> BeautifulSoup:
776919 except IndexError :
777920 web_page = urlparse .unquote (urls [1 ])
778921
779- img_tbn = urlparse .unquote (item .find ('a' ).find ('img' )['src' ])
922+ img_tag = link .find ('img' )
923+ if not img_tag :
924+ continue
925+
926+ img_tbn = urlparse .unquote (
927+ img_tag .get ('src' ) or img_tag .get ('data-src' , '' )
928+ )
929+
930+ if not img_tbn :
931+ continue
780932
781933 results .append ({
782934 'domain' : urlparse .urlparse (web_page ).netloc ,
@@ -793,11 +945,18 @@ def view_image(self, soup) -> BeautifulSoup:
793945
794946 # replace correction suggested by google object if exists
795947 if len (cor_suggested ):
796- soup .find_all (
948+ suggested_tables = soup .find_all (
797949 'table' ,
798950 attrs = {'class' : "By0U9" }
799- )[0 ].replaceWith (cor_suggested [0 ])
800- # replace next page object at the bottom of the page
801- soup .find_all ('table' ,
802- attrs = {'class' : "uZgmoc" })[0 ].replaceWith (next_pages )
951+ )
952+ if suggested_tables :
953+ suggested_tables [0 ].replaceWith (cor_suggested [0 ])
954+
955+ # replace next page object at the bottom of the page, when present
956+ next_page_tables = soup .find_all ('table' , attrs = {'class' : "uZgmoc" })
957+ if next_pages and next_page_tables :
958+ next_page_tables [0 ].replaceWith (next_pages )
959+
960+ # TODO: Reintroduce pagination for legacy image layout if needed.
961+
803962 return soup
0 commit comments