44import os
55import json
66from urllib .parse import urlparse
7+ import string
8+ import random
79import requests
810import shutil
911from datetime import datetime
10- from scripts .resize_image import resize_image
12+ from scripts .resize_image import resize_image , convert_to_webp , is_valid_image , is_valid_svg
1113from dateutil .parser import parse as date_parse
14+ from bs4 import BeautifulSoup
1215
1316# Path to the subscribers.json file
1417SUBSCRIBERS_JSON_PATH = os .path .join (os .path .dirname (__file__ ), 'data' , 'subscribers.json' )
@@ -59,15 +62,75 @@ def fetch_and_create_post(self):
5962 except Exception as e :
6063 print (f"Failed to process feed for { self .subscriber_name } : { e } " )
6164
def fetch_all_images(self, content, subscriber_shortname, post_name):
    """Download every image referenced in a post's HTML and point the HTML at the local copies.

    Each ``<img>`` with a ``src`` is handed to ``download_and_process_image``;
    on success its ``src`` is replaced by the returned value, on failure the
    ``src`` is blanked and the error is printed. Each ``<video>`` for which a
    URL can be found is replaced by a plain "Watch Video" link.

    Args:
        content: HTML fragment of the post.
        subscriber_shortname: Folder name of the subscriber under
            ``img/subscribers``.
        post_name: Folder name of the post under the subscriber folder.

    Returns:
        The rewritten HTML as a string.
    """
    img_folder = os.path.join("img", "subscribers", subscriber_shortname, post_name)
    unknown_img_folder = os.path.join("static", img_folder, "unknown")
    soup = BeautifulSoup(content, 'html.parser')

    # Always start from an empty "unknown" folder before downloading into it.
    if os.path.exists(unknown_img_folder):
        shutil.rmtree(unknown_img_folder)
    os.makedirs(unknown_img_folder, exist_ok=True)

    for img in soup.find_all('img'):
        img_url = img.get('src')
        if not img_url:
            # An <img> without a src has nothing to download; leaving it
            # untouched avoids a KeyError that would abort the whole post.
            continue
        file_name = self.get_image_name(img_url.split('?')[0])
        try:
            img['src'] = self.download_and_process_image(
                img_url, file_name, img_folder, unknown_img_folder
            )
        except Exception as e:
            # Best effort: one broken image must not stop the others.
            img['src'] = ""
            print(f"Failed to process image: {e}")

    for video in soup.find_all('video'):
        # The URL may sit on a nested <source> or directly on the <video> tag.
        source = video.find('source')
        video_url = (source.get('src') if source is not None else None) or video.get('src')
        if not video_url:
            continue
        link = soup.new_tag('a', href=video_url, target="_blank")
        # Assign the text explicitly; older BeautifulSoup releases appear to
        # treat a ``string=`` keyword on new_tag() as an HTML attribute.
        link.string = "Watch Video"
        video.replace_with(link)

    return str(soup)
89+
def download_and_process_image(self, img_url, file_name, img_folder, unknown_img_folder):
    """Download one image, validate it and return the site-relative URL of the stored file.

    The URL (query string removed) decides the handling:

    * raster extensions (png, jpg, jpeg, gif, bmp, tiff, webp): downloaded
      into ``static/<img_folder>``, checked with ``is_valid_image``, passed
      to ``resize_image`` with ``max_height=600`` and to ``convert_to_webp``;
    * ``.svg``: downloaded and checked with ``is_valid_svg`` only;
    * anything else: delegated to ``handle_unknown_image_format``.

    Args:
        img_url: Image URL as found in the post.
        file_name: Local file name to store the image under.
        img_folder: Image folder of the post, relative to ``static``.
        unknown_img_folder: Destination folder for images of unknown format.

    Returns:
        A path starting with ``/`` and using forward slashes, suitable for
        an HTML ``src`` attribute.

    Raises:
        ValueError: The downloaded file failed validation; the file is
            removed before raising.
    """
    # Function-scope import so the block does not depend on a file-level change.
    import posixpath

    def site_url(*parts):
        # The result is used as a URL, so it must use "/" on every platform,
        # even though img_folder was assembled with os.path.join.
        return posixpath.join("/", img_folder.replace(os.sep, "/"), *parts)

    no_param_url = img_url.split('?')[0]  # Remove query parameters
    lowered_url = no_param_url.lower()
    is_raster = lowered_url.endswith(
        ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp')
    )
    is_svg = lowered_url.endswith('.svg')

    if not (is_raster or is_svg):
        # No recognisable extension: the helper receives the full URL.
        downloaded_img = self.handle_unknown_image_format(img_url, unknown_img_folder)
        return site_url("unknown", os.path.basename(downloaded_img))

    downloaded_img = self.download_image(
        no_param_url, file_name, os.path.join("static", img_folder)
    )
    validator = is_valid_image if is_raster else is_valid_svg
    if not validator(downloaded_img):
        os.remove(downloaded_img)
        raise ValueError(f"Invalid image: {downloaded_img}")

    if is_svg:
        # SVG files are kept as downloaded: no resize, no conversion.
        return site_url(file_name)

    resize_image(downloaded_img, max_height=600)
    webp_img_path = convert_to_webp(downloaded_img, replace=True)
    return site_url(os.path.basename(webp_img_path))
109+
def handle_unknown_image_format(self, img_url, dest_folder):
    """
    Download an image whose URL has no recognised extension and convert it to webp.

    The file is saved in ``dest_folder`` as ``image_<8 random characters>.png``,
    checked with ``is_valid_image``, passed to ``resize_image`` with
    ``max_height=600`` and finally to ``convert_to_webp``, whose return value
    is returned. A file that fails validation is deleted and an ``Exception``
    is raised.
    """
    alphabet = string.ascii_letters + string.digits
    token = ''.join(random.choices(alphabet, k=8))

    local_path = self.download_image(
        img_url, f"image_{token}.png", dest_folder, is_unknown=True
    )

    if is_valid_image(local_path):
        resize_image(local_path, max_height=600)
        return convert_to_webp(local_path, replace=True)

    # Validation failed: do not leave the unusable download on disk.
    os.remove(local_path)
    raise Exception(f"Invalid image: {local_path}")
128+
129+
62130 def process_entry (self , entry ):
63131 try :
64132 dest_folder = self .get_dest_folder ()
65133 title = entry .title
66- # I don't think we need to download images because the images are already in the feed
67- # image_url = next((link.href for link in entry.links if 'image' in link.type), entry.links[-1].href)
68- # if image_url.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp')):
69- # file_name = self.get_image_name(image_url)
70- # self.download_image(image_url, file_name, dest_folder)
71134
72135 post_url = entry .link
73136
@@ -81,6 +144,7 @@ def process_entry(self, entry):
81144
82145 are_tags_present = any (str (category ).lower () in tags for category in self .filter_categories )
83146 if are_tags_present :
147+ content = self .fetch_all_images (content , self .shortname , file_name )
84148 content = self .generate_markdown_content (title , entry_date , post_url , content , tags )
85149
86150 # Copy the markdown file to the posts folder
@@ -169,12 +233,20 @@ def write_to_file(self, filename, content):
169233 with open (filename , "w" , encoding = "utf=8" ) as f :
170234 f .write (content )
171235
def download_image(self, image_url, image_name, dest_folder, is_unknown=False):
    """Download ``image_url`` into ``dest_folder`` and return the local file path.

    Args:
        image_url: URL to fetch.
        image_name: File name to write inside ``dest_folder``.
        dest_folder: Destination directory; created if it does not exist.
        is_unknown: Accepted for backward compatibility. Both kinds of
            download now share the same streaming code path.

    Returns:
        ``os.path.join(dest_folder, image_name)``.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-success HTTP status.
    """
    os.makedirs(dest_folder, exist_ok=True)
    image_filename = os.path.join(dest_folder, image_name)

    # The timeout stops one unresponsive host from stalling the whole run;
    # the context manager releases the streamed connection.
    with requests.get(image_url, stream=True, timeout=30) as response:
        # Fail before creating a file, so an error page is never saved as an image.
        response.raise_for_status()
        with open(image_filename, "wb") as out_file:
            # iter_content decodes gzip/deflate transfer encodings, which
            # copying response.raw directly does not.
            for chunk in response.iter_content(chunk_size=8192):
                out_file.write(chunk)

    return image_filename
178250
179251
180252class FunderProcessor :
@@ -265,10 +337,11 @@ def process_funder(item):
265337 print (f"Failed to delete { file_path } . Reason: { e } " )
266338
267339 # Iterate over the subscribers and fetch posts for active ones
340+ i = 1
268341 for subscriber in subscribers :
269342 if not subscriber .get ('is_active' ):
270343 continue
271-
344+ print ( f" { i } / { len ( subscribers ) } : Processing feed for { subscriber [ 'name' ] } " )
272345 languages = subscriber .get ('languages' , {})
273346 available_lang = languages .get ('available' , DEFAULT_AVAILABLE_LANG )
274347 main_lang = languages .get ('main' , DEFAULT_MAIN_LANG )
@@ -283,5 +356,6 @@ def process_funder(item):
283356 filter_categories
284357 )
285358 processor .fetch_and_create_post ()
359+ i += 1
286360
287361 # FunderProcessor.fetch_funders()
0 commit comments