Merge pull request #47 from Xpirix/fetch_all_resources

Xpirix · web-flow · commit cf690f1c729d · 2025-02-10T11:50:30.000+03:00
Fetch all resources
diff --git a/REQUIREMENTS.txt b/REQUIREMENTS.txt
@@ -1,4 +1,5 @@
 feedparser==6.0.11
 requests==2.32.3
 pillow==11.0.0
-python-dateutil==2.9.0.post0
+python-dateutil==2.9.0.post0
+beautifulsoup4==4.13.3
diff --git a/fetch_feeds.py b/fetch_feeds.py
@@ -4,11 +4,14 @@
 import os
 import json
 from urllib.parse import urlparse
+import string
+import random
 import requests
 import shutil
 from datetime import datetime
-from scripts.resize_image import resize_image
+from scripts.resize_image import resize_image, convert_to_webp, is_valid_image, is_valid_svg
 from dateutil.parser import parse as date_parse
+from bs4 import BeautifulSoup
 
 # Path to the subscribers.json file
 SUBSCRIBERS_JSON_PATH = os.path.join(os.path.dirname(__file__), 'data', 'subscribers.json')
@@ -59,15 +62,75 @@ def fetch_and_create_post(self):
         except Exception as e:
             print(f"Failed to process feed for {self.subscriber_name}: {e}")
 
+    def fetch_all_images(self, content, subscriber_shortname, post_name):
+        """
+        Download the images referenced in a post and point the HTML at the local copies.
+
+        Every <img> that has a src is passed to download_and_process_image() and
+        its src is replaced with the returned path, or with "" when processing fails.
+        Every <video> with a resolvable URL is replaced by a "Watch Video" link.
+        The static/img/subscribers/<shortname>/<post>/unknown folder is wiped
+        and recreated on each call.
+
+        param content: The HTML content of the post
+        param subscriber_shortname: Folder name used for the subscriber
+        param post_name: Folder name used for the post
+        return: The rewritten HTML as a string
+        """
+        img_folder = os.path.join("img", "subscribers", subscriber_shortname, post_name)
+        soup = BeautifulSoup(content, 'html.parser')
+        unknown_img_folder = os.path.join("static", img_folder, "unknown")
+
+        # Start from an empty "unknown" folder: its files get random names,
+        # so leftovers from a previous run would otherwise pile up.
+        if os.path.exists(unknown_img_folder):
+            shutil.rmtree(unknown_img_folder)
+        os.makedirs(unknown_img_folder, exist_ok=True)
+
+        for img in soup.find_all('img'):
+            img_url = img.get('src')
+            if not img_url:
+                # An <img> without src (e.g. lazy-loaded through data-src) has
+                # nothing to download; leave the tag alone instead of raising
+                # KeyError and losing the whole post.
+                continue
+            try:
+                file_name = self.get_image_name(img_url.split('?')[0])
+                img['src'] = self.download_and_process_image(img_url, file_name, img_folder, unknown_img_folder)
+            except Exception as e:
+                img['src'] = ""
+                print(f"Failed to process image: {e}")
+
+        for video in soup.find_all('video'):
+            # The URL normally lives on a nested <source>; fall back to the
+            # src attribute of the <video> itself.
+            source = video.find('source')
+            video_url = (source.get('src') if source is not None else None) or video.get('src')
+            if not video_url:
+                continue
+            video.replace_with(soup.new_tag('a', href=video_url, target="_blank", string="Watch Video"))
+
+        return str(soup)
+
+    def download_and_process_image(self, img_url, file_name, img_folder, unknown_img_folder):
+        """
+        Download one image and return the site-absolute path to use as its src.
+
+        The extension of the URL (query string ignored) selects the handling:
+        raster images are validated, resized to a maximum height of 600 and
+        passed to convert_to_webp(); .svg files are validated and kept as is;
+        anything else goes through handle_unknown_image_format().
+
+        param img_url: The image URL as found in the post
+        param file_name: The local file name for raster and svg images
+        param img_folder: The image folder, relative to "static"
+        param unknown_img_folder: The folder for images without a known extension
+        return: The path of the stored image, starting with "/"
+        raises Exception: If the downloaded file is not a valid image
+        """
+        raster_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp')
+        bare_url = img_url.split('?')[0]  # Remove query parameters
+        lowered_url = bare_url.lower()
+        static_folder = os.path.join("static", img_folder)
+
+        if lowered_url.endswith(raster_extensions):
+            local_path = self.download_image(bare_url, file_name, static_folder)
+            if is_valid_image(local_path):
+                resize_image(local_path, max_height=600)
+                converted_path = convert_to_webp(local_path, replace=True)
+                return os.path.join("/", img_folder, os.path.basename(converted_path))
+            os.remove(local_path)
+            raise Exception(f"Invalid image: {local_path}")
+
+        if lowered_url.endswith('.svg'):
+            local_path = self.download_image(bare_url, file_name, static_folder)
+            if is_valid_svg(local_path):
+                return os.path.join("/", img_folder, file_name)
+            os.remove(local_path)
+            raise Exception(f"Invalid image: {local_path}")
+
+        # No recognised extension: the full URL (query string included) is used.
+        converted_path = self.handle_unknown_image_format(img_url, unknown_img_folder)
+        return os.path.join("/", img_folder, "unknown", os.path.basename(converted_path))
+
+    def handle_unknown_image_format(self, img_url, dest_folder):
+        """
+        Download an image whose URL has no known extension and convert it.
+
+        The file is stored as image_<8 random characters>.png, checked with
+        is_valid_image(), resized to a maximum height of 600 and handed to
+        convert_to_webp().
+
+        param img_url: The image URL, query string included
+        param dest_folder: The folder the image is downloaded into
+        return: The path returned by convert_to_webp()
+        raises Exception: If the downloaded file is not a valid image
+        """
+        alphabet = string.ascii_letters + string.digits
+        token = ''.join(random.choices(alphabet, k=8))
+        local_path = self.download_image(img_url, f"image_{token}.png", dest_folder, is_unknown=True)
+
+        if is_valid_image(local_path):
+            resize_image(local_path, max_height=600)
+            return convert_to_webp(local_path, replace=True)
+
+        os.remove(local_path)
+        raise Exception(f"Invalid image: {local_path}")
+
+
     def process_entry(self, entry):
         try:
             dest_folder = self.get_dest_folder()
             title = entry.title
-            # I don't think we need to download images because the images are already in the feed
-            # image_url = next((link.href for link in entry.links if 'image' in link.type), entry.links[-1].href)
-            # if image_url.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp')):
-            #     file_name = self.get_image_name(image_url)
-            #     self.download_image(image_url, file_name, dest_folder)
 
             post_url = entry.link
 
@@ -81,6 +144,7 @@ def process_entry(self, entry):
 
             are_tags_present = any(str(category).lower() in tags for category in self.filter_categories)
             if are_tags_present:
+                content = self.fetch_all_images(content, self.shortname, file_name)
                 content = self.generate_markdown_content(title, entry_date, post_url, content, tags)
                 
                 # Copy the markdown file to the posts folder
@@ -169,12 +233,20 @@ def write_to_file(self, filename, content):
         with open(filename, "w", encoding="utf=8") as f:
             f.write(content)
 
-    def download_image(self, image_url, image_name, dest_folder):
-        response = requests.get(image_url, stream=True)
+    def download_image(self, image_url, image_name, dest_folder, is_unknown=False):
+        """
+        Download image_url into dest_folder/image_name.
+
+        param image_url: The URL to download
+        param image_name: The file name to write
+        param dest_folder: The destination folder, created when missing
+        param is_unknown: Kept for backward compatibility; every download
+            now takes the same code path
+        return: The path of the written file
+        raises requests.RequestException: On connection errors, timeouts or
+            an HTTP error status
+        """
+        os.makedirs(dest_folder, exist_ok=True)
         image_filename = os.path.join(dest_folder, image_name)
-        with open(image_filename, 'wb') as out_file:
-            shutil.copyfileobj(response.raw, out_file)
-            print(f"Writing: {image_filename}")
+        # iter_content() is used instead of response.raw because raw skips
+        # content decoding, which stores gzip/deflate responses still
+        # compressed. The timeout keeps one unresponsive host from stalling
+        # the whole run.
+        with requests.get(image_url, stream=True, timeout=30) as response:
+            # Fail before creating the file so an error page is never saved as an image.
+            response.raise_for_status()
+            with open(image_filename, "wb") as out_file:
+                for chunk in response.iter_content(chunk_size=8192):
+                    out_file.write(chunk)
+        return image_filename
 
 
 class FunderProcessor:
@@ -265,10 +337,11 @@ def process_funder(item):
             print(f"Failed to delete {file_path}. Reason: {e}")
 
     # Iterate over the subscribers and fetch posts for active ones
+    i = 1
     for subscriber in subscribers:
         if not subscriber.get('is_active'):
             continue
-        
+        print(f"{i}/{len(subscribers)}: Processing feed for {subscriber['name']}")
         languages = subscriber.get('languages', {})
         available_lang = languages.get('available', DEFAULT_AVAILABLE_LANG)
         main_lang = languages.get('main', DEFAULT_MAIN_LANG)
@@ -283,5 +356,6 @@ def process_funder(item):
             filter_categories
         )
         processor.fetch_and_create_post()
+        i += 1
     
     # FunderProcessor.fetch_funders()
diff --git a/scripts/resize_image.py b/scripts/resize_image.py
@@ -1,5 +1,6 @@
 from PIL import Image
 import os
+import xml.etree.ElementTree as ET
 
 
 def resize_image(image_filename, max_height=120):
@@ -8,38 +9,87 @@ def resize_image(image_filename, max_height=120):
     The image is resized in place.
     param image_filename: The image file to resize
     param max_height: The maximum height in pixels
+    TODO: Add support for other image formats
     """
-    if (
-        image_filename.lower().endswith('.png') or
-        image_filename.lower().endswith('.jpg')
-    ):
-        if os.path.exists(image_filename):
-            print(f'Processing: {image_filename}')
-            with Image.open(image_filename) as img:
-                width, height = img.size
-                if height > max_height:
-                    new_height = max_height
-                    new_width = int((new_height / height) * width)
-
-                    img_resized = img.resize(
-                        (new_width, new_height), Image.LANCZOS
-                    )
-
-                    # Determine the file format
-                    file_format = (
-                        'PNG' if image_filename.lower().endswith('.png')
-                        else 'JPEG'
-                    )
-
-                    # Save the resized image with optimization
-                    img_resized.save(
-                        image_filename,
-                        format=file_format,
-                        optimize=True,
-                        quality=85
-                    )
-                    print(f'Resized and optimized: {image_filename}')
-                else:
-                    print(f'No resizing needed for: {image_filename}')
-        else:
-            print(f'File not found: {image_filename}')
+    if os.path.exists(image_filename):
+        with Image.open(image_filename) as img:
+            width, height = img.size
+            if height > max_height:
+                new_height = max_height
+                new_width = int((new_height / height) * width)
+
+                img_resized = img.resize(
+                    (new_width, new_height), Image.LANCZOS
+                )
+
+                # Determine the file format
+                file_format = image_filename.split('.')[-1].upper()
+                if file_format == 'JPG':
+                    file_format = 'JPEG'
+
+                # Save the resized image with optimization
+                img_resized.save(
+                    image_filename,
+                    format=file_format,
+                    optimize=True,
+                    quality=85
+                )
+    else:
+        print(f'File not found: {image_filename}')
+
+# Transform an image into webp format
+def convert_to_webp(image_filename, replace=False):
+    """
+    Convert an image to webp format.
+    The webp file is written next to the original, with the same base name.
+    param image_filename: The image file to convert
+    param replace: If True, delete the original file after the conversion
+    return: The path of the webp file, or image_filename unchanged when its
+        extension is not one of .png, .jpg, .jpeg, .tiff
+    raises FileNotFoundError: If image_filename does not exist
+    """
+    supported_formats = ('.png', '.jpg', '.jpeg', '.tiff')
+    base_name, image_ext = os.path.splitext(image_filename)
+    if image_ext.lower() not in supported_formats:
+        return image_filename
+    if not os.path.exists(image_filename):
+        print(f'File not found: {image_filename}')
+        raise FileNotFoundError(image_filename)
+
+    # Swap only the extension. A plain str.replace() on the whole path also
+    # rewrote matching text in folder names, and left upper-case extensions
+    # (e.g. ".PNG") untouched, so replace=True deleted the converted file.
+    webp_filename = base_name + '.webp'
+    with Image.open(image_filename) as img:
+        # Save the image in webp format with optimization
+        img.save(
+            webp_filename,
+            format='WEBP',
+            optimize=True,
+            quality=85
+        )
+    # Delete the source only once its handle is closed; removing an open
+    # file fails on some platforms.
+    if replace:
+        os.remove(image_filename)
+    return webp_filename
+    
+# Check if the image is valid
+def is_valid_image(image_filename):
+    """
+    Check if the image file is valid.
+    param image_filename: The image file to check
+    return: True if the image is valid, False otherwise
+    """
+    try:
+        # The context manager releases the file handle opened by Image.open()
+        with Image.open(image_filename) as img:
+            img.verify()
+        return True
+    except Exception:
+        print(f'Invalid image: {image_filename}')
+        # Return an explicit False, as documented, instead of falling through to None
+        return False
+
+def is_valid_svg(svg_filename):
+    """
+    Check if the svg file is valid.
+    The file must be well-formed XML whose root element is <svg>.
+    param svg_filename: The svg file to check
+    return: True if the svg is valid, False otherwise
+    """
+    # NOTE(review): the file comes from a remote feed; xml.etree is not
+    # hardened against maliciously constructed XML - confirm this is acceptable.
+    try:
+        root = ET.parse(svg_filename).getroot()  # Try to parse the XML
+    except ET.ParseError:
+        return False  # If parsing fails, it's invalid
+    # Well-formed XML is not enough: an XML error document would pass too.
+    # ElementTree writes namespaced tags as "{namespace}svg", so compare the
+    # local name only.
+    return root.tag.rsplit('}', 1)[-1] == 'svg'
diff --git a/themes/hugo-bulma-blocks-theme/assets/sass/bulma.sass b/themes/hugo-bulma-blocks-theme/assets/sass/bulma.sass
@@ -32,6 +32,7 @@
 {{ $truenoSBd := resources.Get "webfonts/TruenoSBd.otf" }}
 {{ $truenoBd := resources.Get "webfonts/TruenoBd.otf" }}
 {{ $truenoUltBlk := resources.Get "webfonts/TruenoUltBlk.otf" }}
+{{ $countryFlagsEmoji := resources.Get "webfonts/TwemojiCountryFlags.woff2" }}
  
 @font-face
     font-family: 'Montserrat'
@@ -62,6 +63,9 @@
   src: url("{{ $truenoUltBlk.RelPermalink }}") format("opentype")
   font-weight: 700
   
+// Country flag emoji font; the asset is a WOFF2 file, so the format hint must say woff2
+@font-face
+  font-family: "Twemoji Country Flags"
+  src: url("{{ $countryFlagsEmoji.RelPermalink }}") format("woff2")
 
 {{ $worksans := resources.Get "webfonts/worksans.woff2" }}
 
diff --git a/themes/hugo-bulma-blocks-theme/assets/webfonts/TwemojiCountryFlags.woff2 b/themes/hugo-bulma-blocks-theme/assets/webfonts/TwemojiCountryFlags.woff2
diff --git a/themes/hugo-bulma-blocks-theme/layouts/partials/header.html b/themes/hugo-bulma-blocks-theme/layouts/partials/header.html
@@ -175,13 +175,6 @@
             src="{{ .Site.Params.uniNavHeaderUrl }}"
         ></script>
 
-        <!-- Countries Flag for windows -->
-        <!-- Added by Lova -->
-        <!-- See https://github.com/talkjs/country-flag-emoji-polyfill -->
-        <script type="module" defer>
-            import { polyfillCountryFlagEmojis } from "https://cdn.skypack.dev/country-flag-emoji-polyfill";
-            polyfillCountryFlagEmojis();
-        </script>
     </head>
 
     <body></body>