92 changes: 92 additions & 0 deletions .github/workflows/scp-items.yml
@@ -0,0 +1,92 @@
name: Crawl SCP Wiki

on:
# workflow_dispatch:
# schedule:
# - cron: "0 0 * * *"
push:
branches:
- main
paths:
- .github/workflows/scp-items.yml

permissions:
contents: write

jobs:
update-main-scp:
runs-on: ubuntu-latest
steps:
- name: "Clone this Repository"
uses: actions/checkout@v6
with:
path: scp-api

- name: "Clone the Crawler"
uses: actions/checkout@v6
with:
repository: heroheman/scp_crawler
ref: "main"
path: scp-crawler

- name: "Setup Python"
uses: actions/setup-python@v6
with:
python-version: '3.13'

- name: "Install Crawler"
working-directory: ./scp-crawler
run: make install

- name: "Crawl Titles"
working-directory: ./scp-crawler
run: make data/scp_titles.json

- name: "Crawl Hubs"
working-directory: ./scp-crawler
run: make data/scp_hubs.json

- name: "Crawl Items"
working-directory: ./scp-crawler
run: make data/scp_items.json

- name: "Process Items"
working-directory: ./scp-crawler
run: make data/processed/items

- name: "Crawl Tales"
working-directory: ./scp-crawler
run: make data/scp_tales.json

- name: "Process Tales"
working-directory: ./scp-crawler
run: make data/processed/tales

- name: "Crawl GOI"
working-directory: ./scp-crawler
run: make data/goi.json

- name: "Process GOI"
working-directory: ./scp-crawler
run: make data/processed/goi

- name: "Crawl Supplements"
working-directory: ./scp-crawler
run: make data/scp_supplement.json

- name: "Process Supplements"
working-directory: ./scp-crawler
run: make data/processed/supplement

- name: "Move Files into API"
run: cp -Rf ./scp-crawler/data/processed/* ./scp-api/docs/data/scp/

- name: "Push"
shell: bash
run: >
cd scp-api;
./bin/push.sh;

env:
GIT_USER: "SCP Bot"
GIT_EMAIL: "[email protected]"
1 change: 1 addition & 0 deletions .gitignore
@@ -105,3 +105,4 @@ venv.bak/


*.json
.DS_Store
13 changes: 11 additions & 2 deletions README.md
@@ -32,9 +32,15 @@ To crawl the International Hub for SCP Items and save to a custom location:
scrapy crawl scp_int -o scp_international_items.json
```

To crawl pages tagged as `supplement` and save to a custom location:

```bash
scrapy crawl scp_supplement -o scp_supplement.json
```

## Raw Content Structure

There are two types of content downloaded- SCP Items and SCP Tales.
There are multiple types of content downloaded (Items, Tales, GOI formats, and Supplements).

All content (both SCP Items and Tales) contains the following:

@@ -66,6 +72,7 @@ The crawler generates a series of json files containing an array of objects repr
| scp_titles.json | Main | Title | scp |
| scp_hubs.json | Main | Hub | scp |
| scp_tales.json | Main | Tale | scp |
| scp_supplement.json | Main | Supplement | scp |
| scp_int.json | International | Item | scp_int |
| scp_int_titles.json | International | Title | scp_int |
| scp_int_tales.json | International | Tale | scp_int |
@@ -76,7 +83,9 @@ To regenerate all files run `make fresh`.

## Post Processed Data

The postproc system takes the Titles, Hubs, Items, and Tales and uses them to generate a comprehensive set of objects. It combines and cross references data and expands on the data already there.
The postproc system takes Titles, Hubs, Items, Tales, GOI, and Supplements and uses them to generate a comprehensive set of objects. It combines and cross-references the data, expanding on what is already there.

Supplements are written to `data/processed/supplement/` and include additional fields like `parent_scp` and `parent_tale`.
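
As a rough sketch of how the processed supplement data might be consumed (paths and field names follow the postprocessing code added in this PR; `SCP-173` is just an illustrative filter value):

```python
import json
from pathlib import Path

# Load the index written by run-postproc-supplement and list every supplement
# whose link points back to a given SCP article.
index_path = Path("data/processed/supplement/index.json")
with index_path.open() as handle:
    supplements = json.load(handle)

for link, entry in supplements.items():
    if entry.get("parent_scp") == "SCP-173":
        print(f'{entry.get("title")} -> {link}')
```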


## Content Licensing
16 changes: 14 additions & 2 deletions makefile
@@ -18,7 +18,7 @@ crawl: scp scp_int goi

scp: scp_crawl scp_postprocess

scp_crawl: data/scp_titles.json data/scp_hubs.json data/scp_items.json data/scp_tales.json data/goi.json
scp_crawl: data/scp_titles.json data/scp_hubs.json data/scp_items.json data/scp_tales.json data/goi.json data/scp_supplement.json

data/scp_titles.json: .venv
$(PYTHON_VENV) python -m scrapy crawl scp_titles -o data/scp_titles.json
@@ -37,7 +37,19 @@ goi: data/goi.json
data/goi.json: .venv
$(PYTHON_VENV) python -m scrapy crawl goi -o data/goi.json

scp_postprocess: scp_crawl data/processed/goi data/processed/items data/processed/tales
supplement: supplement_crawl supplement_postprocess

supplement_crawl: data/scp_supplement.json

data/scp_supplement.json: .venv
$(PYTHON_VENV) python -m scrapy crawl scp_supplement -o data/scp_supplement.json

supplement_postprocess: supplement_crawl data/processed/supplement

data/processed/supplement: .venv
$(PYTHON_VENV) python -m scp_crawler.postprocessing run-postproc-supplement
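# `make supplement` (defined above) runs both the crawl and postprocessing steps in one go.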

scp_postprocess: scp_crawl data/processed/goi data/processed/items data/processed/tales data/processed/supplement

data/processed/goi: .venv
$(PYTHON_VENV) python -m scp_crawler.postprocessing run-postproc-goi
4 changes: 4 additions & 0 deletions scp_crawler/items.py
@@ -33,6 +33,10 @@ class ScpGoi(WikiPage):
pass


class ScpSupplement(WikiPage):
pass


class ScpTitle(scrapy.Item):
title = scrapy.Field()
scp = scrapy.Field()
48 changes: 48 additions & 0 deletions scp_crawler/postprocessing.py
@@ -273,5 +273,53 @@ def run_postproc_goi():
to_file(tales, processed_path / "index.json")


@cli.command()
def run_postproc_supplement():
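"""Post-process crawled supplement pages into a content file plus an index."""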

processed_path = Path(cwd + "/data/processed/supplement")
os.makedirs(processed_path, exist_ok=True)

print("Processing Supplement list.")

supplement_list = from_file(cwd + "/data/scp_supplement.json")
supplements = {}
for supplement in tqdm(supplement_list, smoothing=0):

supplement["images"] = get_images(supplement["raw_content"])
supplement["hubs"] = get_hubs(supplement["link"])
supplement["raw_source"] = get_wiki_source(supplement["page_id"], supplement["domain"])

# Convert history dict to list and sort by date.
supplement["history"] = process_history(supplement["history"])

if len(supplement["history"]) > 0:
supplement["created_at"] = supplement["history"][0]["date"]
supplement["creator"] = supplement["history"][0]["author"]
else:
supplement["created_at"] = "unknown"
supplement["creator"] = "unknown"

supplement["link"] = supplement["url"].replace("https://scp-wiki.wikidot.com/", "")

# Extract parent SCP from title or link
scp_match = re.search(r"scp-\d+", supplement["link"], re.IGNORECASE)
supplement["parent_scp"] = scp_match.group(0).upper() if scp_match else None

# Extract parent tale series from link
tale_match = re.match(r"([a-z\-]+)-\d+$", supplement["link"])
supplement["parent_tale"] = tale_match.group(1) if tale_match else None

supplements[supplement["link"]] = supplement

to_file(supplements, processed_path / "content_supplement.json")

for supplement_id in supplements:
del supplements[supplement_id]["raw_content"]
del supplements[supplement_id]["raw_source"]
supplements[supplement_id]["content_file"] = f"content_supplement.json"

to_file(supplements, processed_path / "index.json")


if __name__ == "__main__":
cli()
51 changes: 49 additions & 2 deletions scp_crawler/spiders/scp.py
@@ -8,7 +8,7 @@
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ..items import ScpGoi, ScpHub, ScpItem, ScpTale, ScpTitle
from ..items import ScpGoi, ScpHub, ScpItem, ScpSupplement, ScpTale, ScpTitle

DOMAIN = "scp-wiki.wikidot.com"
INT_DOMAIN = "scp-int.wikidot.com"
@@ -343,7 +343,7 @@ class ScpTaleSpider(CrawlSpider, WikiMixin):

rules = (
Rule(LinkExtractor(allow=[re.escape("tales-by-title"), re.escape("system:page-tags/tag/tale")])),
Rule(LinkExtractor(allow=[r".*"]), callback="parse_tale"),
Rule(LinkExtractor(deny=[r"system:.*", r".*:.*", re.escape("tag-search")]), callback="parse_tale"),
)

def parse_tale(self, response, original_link=None):
@@ -531,6 +531,53 @@ def parse_tale(self, response, original_link=None):
return self.get_history_request(item["page_id"], 1, item)


class ScpSupplementSpider(CrawlSpider, WikiMixin):
name = "scp_supplement"

start_urls = [
f"http://{DOMAIN}/system:page-tags/tag/supplement",
]

allowed_domains = [DOMAIN]

domain = DOMAIN

rules = (
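# The first rule follows the supplement tag-listing pages; the second visits every
# linked page and relies on parse_supplement to keep only pages tagged "supplement".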
Rule(LinkExtractor(allow=[re.escape("system:page-tags/tag/supplement")])),
Rule(LinkExtractor(allow=[r".*"]), callback="parse_supplement"),
)

def parse_supplement(self, response, original_link=None):
self.logger.debug("Reviewing Potential SCP Supplement page: %s", response.url)
content = self.get_content(response)
tags = self.get_tags(response)

if not content or not tags:
return None

redirect = self.follow_splash_redirects(response, tags, self.parse_supplement)
if redirect:
return redirect

if "supplement" not in tags:
return None

self.logger.info("Processing SCP Supplement page: %s", response.url)
content_soup = BeautifulSoup(content, "lxml")

item = ScpSupplement()
item["title"] = self.get_title(response)
item["url"] = response.url
item["domain"] = self.domain
item["link"] = original_link if original_link else self.get_simple_link(response.url)
item["tags"] = tags
item["page_id"] = self.get_page_id(response)
item["rating"] = get_rating(response)
item["raw_content"] = str(clean_content_soup(content_soup))
item["references"] = self.get_content_references(response)
return self.get_history_request(item["page_id"], 1, item)


def get_rating(response):
try:
return int(response.css(".rate-points .number::text").get())