1+ from PIL import Image
2+ from bs4 import BeautifulSoup
3+ from urllib .parse import urljoin , urlparse
4+ from pydantic import Field , ConfigDict
5+ from typing import List , Optional
6+ from io import BytesIO
7+
8+ from loguru import logger
9+
10+ import requests
11+
12+ from guidellm .config import settings
13+ from guidellm .core .serializable import Serializable
14+
15+ __all__ = ["load_images" , "ImageDescriptor" ]
16+
class ImageDescriptor(Serializable):
    """
    A serializable descriptor pairing a decoded image with where it came from.

    ``arbitrary_types_allowed`` is enabled in the model config so that a
    ``PIL.Image.Image`` instance can be stored directly on the ``image`` field.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # No default is declared, so ``url`` must always be passed explicitly
    # (passing ``None`` is allowed by the annotation).
    url: Optional[str] = Field(description="url address for image.")
    # ``exclude=True`` keeps the in-memory PIL object out of serialized output.
    image: Image.Image = Field(description="PIL image", exclude=True)
    # A filename is text; the previous ``Optional[int]`` annotation made
    # validation reject ordinary names such as "photo.png".
    filename: Optional[str] = Field(
        default=None,
        description="Image filename.",
    )
29+
30+
def load_images(data: str) -> List["ImageDescriptor"]:
    """
    Download every image referenced by the HTML page at a URL.

    The page at ``data`` is fetched and parsed, and each ``<img>`` tag that has
    a ``src`` attribute is downloaded and opened with Pillow. Relative ``src``
    values are resolved against ``data``.

    Only string inputs starting with ``"http"`` are processed; any other value
    (empty string, ``None``, a local path, ...) yields an empty list.

    :param data: the URL of the HTML page to scan for images
    :type data: str
    :return: one descriptor per downloaded image, holding the resolved image
        url and the data in PIL.Image.Image format; empty if nothing was loaded
    :rtype: List[ImageDescriptor]
    :raises requests.HTTPError: if the page request or any image request
        returns an error status (via ``raise_for_status``)
    """
    # Always hand back a list (never None) so callers can iterate the result
    # directly, as the return annotation promises.
    if not data or not isinstance(data, str) or not data.startswith("http"):
        return []

    timeout = settings.request_timeout
    response = requests.get(data, timeout=timeout)
    response.raise_for_status()

    images = []
    soup = BeautifulSoup(response.text, "html.parser")
    for img_tag in soup.find_all("img"):
        img_src = img_tag.get("src")
        if not img_src:
            # <img> without a usable src: nothing to download.
            continue

        # Handle relative URLs by resolving them against the page URL.
        img_url = urljoin(data, img_src)

        logger.debug("Loading image: {}", img_url)
        # Use the same timeout as the page request; without one a single
        # unresponsive image host would block this call indefinitely.
        img_response = requests.get(img_url, timeout=timeout)
        img_response.raise_for_status()

        # Wrap the downloaded bytes in a file-like object for Pillow.
        images.append(
            ImageDescriptor(
                url=img_url,
                image=Image.open(BytesIO(img_response.content)),
            )
        )

    return images
0 commit comments