Commit 0e35831

add bulkImageScraper Plugin (#547)
1 parent eeb2d3f commit 0e35831

File tree

3 files changed, +312 -0 lines changed

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
name: Bulk Image Scrape
description: Apply an image scraper to all images
version: 0.1
url: https://github.com/stashapp/CommunityScripts/
exec:
  - python
  - "{pluginDir}/bulkImageScrape.py"
interface: raw

settings:
  ScraperID:
    displayName: The Scraper ID of the image scraper to use (for example SHALookup)
    type: STRING
  SkipTags:
    displayName: List of tags to skip (comma separated)
    type: STRING
  CreateMissingPerformers:
    displayName: Create missing performers from scrape result
    type: BOOLEAN
  CreateMissingStudios:
    displayName: Create missing studios from scrape result
    type: BOOLEAN
  CreateMissingMovies:
    displayName: Create missing movies/groups from scrape result
    type: BOOLEAN
  MergeExistingTags:
    displayName: Merge existing tags with the scraped tags (default is to overwrite)
    type: BOOLEAN
  ExcludeOrganized:
    displayName: Exclude images that are set as organized (default is to include)
    type: BOOLEAN

tasks:
  - name: "Bulk Image Scrape"
    description: Apply an image scraper to all images
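
Because the plugin declares interface: raw, Stash runs the exec command above and pipes a JSON fragment to the script's stdin, and bulkImageScrape.py reads the server connection out of that fragment. As a rough sketch (the exact fragment shape and every field value below are assumptions about Stash's plugin runner, not taken from this commit), the script could be exercised by hand like this:

# Hypothetical local harness: feed the plugin script the kind of
# server_connection fragment Stash's raw plugin interface would pipe in.
# All field names and values here are illustrative assumptions.
import json
import subprocess

fragment = {
    "server_connection": {
        "Scheme": "http",
        "Host": "localhost",
        "Port": 9999,
    }
}
subprocess.run(
    ["python", "bulkImageScrape.py"],
    input=json.dumps(fragment),
    text=True,
    check=True,
)
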
Lines changed: 274 additions & 0 deletions
@@ -0,0 +1,274 @@
import sys
import json
import time
from typing import Any

import stashapi.log as log
from stashapi.stash_types import StashItem
from stashapi.stashapp import StashInterface
from stashapi.scrape_parser import ScrapeParser

# Quick check to make sure we have the correct version of stashapi
# (hasattr avoids an AttributeError on versions that predate StashItem.IMAGE)
if not hasattr(StashItem, "IMAGE") or not hasattr(StashInterface, "scrape_image"):
    log.error(
        "It seems you are using an older version of stashapi\n"
        "without support for image scraping.\n"
        "Please use the requirements.txt file to install the most recent version"
    )
    sys.exit(1)


#
# Helper functions
#


def get_tag_id(client: StashInterface, tag_name: str) -> str | None:
    """
    Get the id of a tag by name or return None if the tag is not found
    """
    if not tag_name:
        # skip instead of raising, so an empty entry (e.g. a trailing
        # comma in SkipTags) does not abort the whole run
        log.error("Invalid tag name provided - skipping ...")
        return None

    tags: list[dict] = client.find_tags(
        f={"name": {"value": tag_name, "modifier": "EQUALS"}}
    )
    if len(tags) == 0:
        log.error(f"No id found for tag {tag_name} - skipping ...")
        return None
    return tags[0]["id"]


def get_scraper_id(client: StashInterface, scraper_name: str) -> str | None:
    """
    Get the id of a scraper by name or return None if the scraper is not found
    """
    scrapers: list[dict] = client.list_scrapers([StashItem.IMAGE])
    for scraper in scrapers:
        if scraper["name"] == scraper_name:
            return scraper["id"]
    return None


def parse_skip_tags(client: StashInterface, skip_tags: str) -> list[str]:
    """
    Parse the comma-separated skip tags into a list of tag ids
    """
    tag_names: list[str] = skip_tags.split(",")
    tag_ids: list[str] = []
    for tag in tag_names:
        tag_id: str | None = get_tag_id(client, tag.strip())
        if tag_id is not None:
            tag_ids.append(tag_id)
    return tag_ids


def get_all_images(
    client: StashInterface, skip_tags: list[str], exclude_organized: bool
) -> list[dict]:
    """
    Get all images from the stash that match the configured filters
    """
    image_filter: dict = {}
    all_results: dict = {
        "page": 1,
        "per_page": -1,
        "sort": "created_at",
        "direction": "ASC",
    }

    if exclude_organized:
        image_filter["organized"] = False

    if skip_tags is not None and len(skip_tags) > 0:
        image_filter["tags"] = {
            "value": [],
            "excludes": skip_tags,
            "modifier": "INCLUDES_ALL",
            "depth": -1,
        }

    result: list[dict] = client.find_images(f=image_filter, filter=all_results)

    return result


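# For illustration (ids made up): with ExcludeOrganized enabled and a single
# skip tag resolved to id "123", get_all_images builds this image filter:
#
#   {
#       "organized": False,
#       "tags": {"value": [], "excludes": ["123"],
#                "modifier": "INCLUDES_ALL", "depth": -1},
#   }

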
def scrape_image(
    client: StashInterface, image_id: str, scraper_id: str
) -> dict | list[dict] | None:
    """
    Scrape an image with the given scraper
    """
    try:
        scrape_result: dict = client.scrape_image(scraper_id, image_id)
    except Exception as e:
        # Catch any exceptions not under our control
        # so we can continue with the next image in case of errors
        log.error(f"Error scraping image {image_id} with scraper {scraper_id}: {e}")
        return None

    return scrape_result


def scrape_is_valid(scrape_input: dict | list[dict] | None) -> bool:
    """
    Check if the scrape is formatted as expected and has any data we can process
    """
    if scrape_input is None:
        # most likely the scraper failed or raised an exception
        # that we caught and turned into None
        return False
    elif isinstance(scrape_input, list):
        # single image scrape results are lists with a single dict inside
        return len(scrape_input) == 1 and scrape_is_valid(scrape_input[0])
    elif isinstance(scrape_input, dict):
        # check if there are any values that are
        # not None, empty lists, empty dicts, or empty strings
        return any(
            value
            for value in scrape_input.values()
            if value is not None and value != [] and value != {} and value != ""
        )
    else:
        # something went strangely wrong
        return False


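# A few illustrative cases (inputs are made up):
#   scrape_is_valid(None)                    -> False  (scraper errored)
#   scrape_is_valid([])                      -> False  (no result)
#   scrape_is_valid([{"title": ""}])         -> False  (only empty fields)
#   scrape_is_valid([{"title": "Example"}])  -> True

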
def process_image_scrape(
    parser: ScrapeParser,
    image_input: dict,
    scrape_input: dict | list[dict],
    merge_existing_tags: bool,
) -> dict | None:
    """
    Process the scrape input and return an update dictionary
    """
    if isinstance(scrape_input, list) and len(scrape_input) == 1:
        # single image scrape results are lists with a single dict inside
        scrape_input = scrape_input[0]
    elif isinstance(scrape_input, dict):
        # nothing to unwrap if it is already a dict
        pass
    else:
        log.error(f"Unknown scrape input type for image {image_input['id']}")
        return None

    update_dict: dict = parser.image_from_scrape(scrape_input)
    update_dict["id"] = image_input["id"]
    if merge_existing_tags:
        existing_tags: list = [tag["id"] for tag in image_input["tags"]]
        # .get() guards against scrapes that produced no tags at all
        merged_tags: list = list(set(existing_tags + update_dict.get("tag_ids", [])))
        update_dict["tag_ids"] = merged_tags

    return update_dict


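# MergeExistingTags illustration (ids made up): existing ["12", "34"] merged
# with scraped ["34", "56"] yields the union {"12", "34", "56"}; without the
# setting, the scraped tag ids simply replace the existing ones.

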
def update_image(client: StashInterface, update: dict) -> dict | None:
    """
    Update the image with the given update dictionary
    """
    try:
        return client.update_image(update)
    except Exception as e:
        # Catch any exceptions not under our control
        # so we can continue with the next image in case of errors
        log.error(f"Error updating image {update['id']}: {e}")
        return None


#
# SETUP
#


json_input: dict = json.loads(sys.stdin.read())
FRAGMENT_SERVER: dict = json_input["server_connection"]
stash: StashInterface = StashInterface(FRAGMENT_SERVER)

config: dict = stash.get_configuration()["plugins"]
settings: dict[str, Any] = {
    "ScraperID": "",
    "SkipTags": "",
    "CreateMissingPerformers": False,
    "CreateMissingStudios": False,
    "CreateMissingTags": False,
    "MergeExistingTags": False,
    "ExcludeOrganized": False,
}

if "BulkImageScrape" in config:
    settings.update(config["BulkImageScrape"])

scrape_parser = ScrapeParser(
    stash,
    log,
    settings["CreateMissingTags"],
    settings["CreateMissingStudios"],
    settings["CreateMissingPerformers"],
)

#
# Validate input settings
#


# Exit if no ScraperID is set or we cannot resolve it
if settings["ScraperID"] == "":
    log.error("No ScraperID set")
    sys.exit(1)

scraper_id: str | None = get_scraper_id(stash, settings["ScraperID"])
if scraper_id is None:
    log.error(f"ScraperID {settings['ScraperID']} not found - cannot continue")
    log.error("Please check the ScraperID is correct and try again")
    sys.exit(1)

# parse the skip tags to a list of tag ids if we have any
parsed_skip_tags: list[str] = []
if settings["SkipTags"] != "":
    parsed_skip_tags = parse_skip_tags(stash, settings["SkipTags"])

#
# MAIN
#

log.info("Starting Bulk Image Scrape Plugin")
log.info(f"settings: {settings}")
log.info("Querying images from stash")

images: list[dict] = get_all_images(
    stash, parsed_skip_tags, settings["ExcludeOrganized"]
)

total_images: int = len(images)
if total_images == 0:
    log.info("No images found with the given filters")
    sys.exit(0)
else:
    log.info(f"Found {total_images} images")


for i, image in enumerate(images, start=1):
    # small delay between scrapes to avoid hammering the scraper source
    time.sleep(0.5)
    log.progress(i / total_images)
    log.debug(f"Scraping image {image['id']} with scraper {scraper_id}")

    scrape: dict | list[dict] | None = scrape_image(stash, image["id"], scraper_id)
    if not scrape_is_valid(scrape):
        log.error(
            f"Scraper returned invalid result for image {image['id']} with scraper {scraper_id}"
        )
        continue

    update_input: dict | None = process_image_scrape(
        scrape_parser, image, scrape, settings["MergeExistingTags"]
    )
    if update_input is not None:
        if update_image(stash, update_input) is not None:
            log.info(f"Updated image {image['id']} with scraper {scraper_id}")
    else:
        log.error(
            f"Failed to build update for image {image['id']} from scraper {scraper_id}"
        )
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
requests
# stashapi has to be installed from source until stashapp-tools is updated to include the latest version
stashapi @ git+https://github.com/stg-annon/stashapi.git
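
These dependencies need to be installed into the same Python environment that Stash uses to run plugins. A minimal sketch, assuming requirements.txt sits in the plugin directory and the current interpreter is the one Stash invokes:

# Hypothetical install helper: run pip against the plugin's requirements.txt
# with the same interpreter that will execute bulkImageScrape.py.
import subprocess
import sys

subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]
)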
