|
12 | 12 | from ktoolbox._enum import PostFileTypeEnum, DataStorageNameEnum |
13 | 13 | from ktoolbox.action import ActionRet, fetch_creator_posts, FetchInterruptError |
14 | 14 | from ktoolbox.action.utils import generate_post_path_name, filter_posts_by_date, generate_filename, \ |
15 | | - filter_posts_by_keywords, filter_posts_by_keywords_exclude, generate_grouped_post_path |
| 15 | + filter_posts_by_keywords, filter_posts_by_keywords_exclude, generate_grouped_post_path, extract_content_images |
16 | 16 | from ktoolbox.api.model import Post, Attachment, Revision |
17 | 17 | from ktoolbox.api.posts import get_post_revisions as get_post_revisions_api, get_post as get_post_api |
18 | 18 | from ktoolbox.configuration import config |
@@ -125,7 +125,9 @@ async def create_job_from_post( |
125 | 125 | ) |
126 | 126 | ) |
127 | 127 | # ``post.substring`` is used to determine if the post has content, but it's only partial |
128 | | - if (post.content or post.substring) and post_dir and (config.job.extract_content or config.job.extract_external_links): |
| 128 | + if (post.content or post.substring) and post_dir and ( |
| 129 | + config.job.extract_content or config.job.extract_external_links or config.job.extract_content_images |
| 130 | + ): |
129 | 131 | # If post has no content, fetch it from get_post API |
130 | 132 | if not post.content: |
131 | 133 | get_post_ret = await get_post_api( |
@@ -164,6 +166,67 @@ async def create_job_from_post( |
164 | 166 | for link in sorted(external_links): |
165 | 167 | await f.write(f"{link}\n") |
166 | 168 |
|
| 169 | + # Extract content images |
| 170 | + if config.job.extract_content_images: |
| 171 | + content_image_sources = extract_content_images(post.content) |
| 172 | + for image_src in content_image_sources: |
| 173 | + if not image_src or not image_src.strip(): |
| 174 | + continue |
| 175 | + |
| 176 | + # Normalize the image source to a server path (root-relative sources are used as-is, absolute URLs are reduced to their path) |
| 177 | + # noinspection HttpUrlsUsage |
| 178 | + if image_src.startswith('/') and not image_src.startswith('//'): |
| 179 | + # Root-relative path - use it unchanged as the server path |
| 180 | + image_path = image_src |
| 181 | + elif image_src.startswith('http://') or image_src.startswith('https://'): |
| 182 | + # Absolute URL - extract path |
| 183 | + image_path = urlparse(image_src).path |
| 184 | + else: |
| 185 | + # Skip data URLs, protocol-relative URLs, or other non-path sources |
| 186 | + continue |
| 187 | + |
| 188 | + if not image_path or not image_path.strip(): |
| 189 | + continue |
| 190 | + |
| 191 | + # Generate filename from the image path |
| 192 | + image_file_path = Path(image_path) |
| 193 | + |
| 194 | + # Apply "allow/block list" filtering first (before incrementing counter) |
| 195 | + if config.job.sequential_filename: |
| 196 | + basic_filename = f"{sequential_counter + 1}{image_file_path.suffix}" |
| 197 | + else: |
| 198 | + basic_filename = image_file_path.name |
| 199 | + |
| 200 | + alt_filename = generate_filename(post, basic_filename, config.job.filename_format) |
| 201 | + |
| 202 | + if (not config.job.allow_list or any( |
| 203 | + map( |
| 204 | + lambda x: fnmatch(alt_filename, x), |
| 205 | + config.job.allow_list |
| 206 | + ) |
| 207 | + )) and not any( |
| 208 | + map( |
| 209 | + lambda x: fnmatch(alt_filename, x), |
| 210 | + config.job.block_list |
| 211 | + ) |
| 212 | + ): |
| 213 | + # Regenerate filename with correct counter |
| 214 | + should_use_sequential = (config.job.sequential_filename and |
| 215 | + image_file_path.suffix.lower() not in config.job.sequential_filename_excludes) |
| 216 | + if should_use_sequential: |
| 217 | + basic_filename = f"{sequential_counter}{image_file_path.suffix}" |
| 218 | + alt_filename = generate_filename(post, basic_filename, config.job.filename_format) |
| 219 | + sequential_counter += 1 |
| 220 | + |
| 221 | + jobs.append( |
| 222 | + Job( |
| 223 | + path=attachments_path, |
| 224 | + alt_filename=alt_filename, |
| 225 | + server_path=image_path, |
| 226 | + type=PostFileTypeEnum.Attachment |
| 227 | + ) |
| 228 | + ) |
| 229 | + |
167 | 230 | return jobs |
168 | 231 |
|
169 | 232 |
|
@@ -263,10 +326,11 @@ async def create_job_from_creator( |
263 | 326 |
|
264 | 327 | if config.job.include_revisions: |
265 | 328 | logger.warning("`job.include_revisions` is enabled and will fetch post revisions, " |
266 | | - "which may take time. Disable if not needed.") |
267 | | - if config.job.extract_content or config.job.extract_external_links: |
268 | | - logger.warning("`job.extract_content` or `job.extract_external_links` is enabled and will fetch post content one by one, " |
269 | | - "which may take time. Disable if not needed.") |
| 329 | + "which may take time. Disable if not needed.") |
| 330 | + if config.job.extract_content or config.job.extract_external_links or config.job.extract_content_images: |
| 331 | + logger.warning( |
| 332 | + "`job.extract_content` or `job.extract_external_links` or `job.extract_content_images` is enabled " |
| 333 | + "and will fetch post content one by one, which may take time. Disable if not needed.") |
270 | 334 |
|
271 | 335 | job_list: List[Job] = [] |
272 | 336 | for post in post_list: |
|
0 commit comments