-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmtg.py
More file actions
90 lines (76 loc) · 3.29 KB
/
mtg.py
File metadata and controls
90 lines (76 loc) · 3.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import requests
from bs4 import BeautifulSoup
from discordFunc import process_sets
import os
try:
import cloudscraper
USE_CLOUDSCRAPER = True
except ImportError:
USE_CLOUDSCRAPER = False
# Discover newly spoiled MTG sets and hand them to the Discord announcer.
# Flow: load persisted set history -> scrape magicspoiler.com for upcoming
# sets -> diff against what was already announced -> persist -> announce.

newSets = []     # Set IDs discovered this run that have not been announced yet
oldSets = set()  # Set IDs already announced (loaded from old_sets.txt); set for O(1) lookups
allSets = set()  # Every unique set ID ever observed (loaded from all_sets.txt)

# Resolve data files relative to this script so it works from any CWD.
script_dir = os.path.dirname(os.path.abspath(__file__))

# Load the full history of sets we have ever seen.
with open(os.path.join(script_dir, "all_sets.txt"), "r", encoding="utf-8") as f:
    allSets = {line.strip() for line in f}

# Load sets that were already announced in a previous run.
with open(os.path.join(script_dir, "old_sets.txt"), "r", encoding="utf-8") as f:
    oldSets = {line.strip() for line in f}

# Scrape the spoiler page to find upcoming sets.
url = "https://www.magicspoiler.com/mtg-spoilers/"

if USE_CLOUDSCRAPER:
    # cloudscraper transparently solves Cloudflare's anti-bot challenge.
    scraper = cloudscraper.create_scraper(
        browser={
            'browser': 'firefox',
            'platform': 'windows',
            'desktop': True
        }
    )
    response = scraper.get(url)
else:
    # Fallback: plain requests with browser-like headers to reduce the
    # chance of being blocked. (May still fail behind Cloudflare.)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:144.0) Gecko/20100101 Firefox/144.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'Sec-GPC': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Priority': 'u=0, i',
        'TE': 'trailers',
    }
    response = requests.get(url, headers=headers)

response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# An anchor wrapping a div.upcoming-set links to a set page; the LAST
# path segment of its URL is the set ID (e.g. ".../some-set/" -> "some-set").
for a_tag in soup.find_all('a', href=True):
    if a_tag.find('div', class_='upcoming-set'):
        set_id = a_tag['href'].strip('/').split('/')[-1]
        # Record only sets that have never been announced.
        if set_id not in oldSets:
            allSets.add(set_id)

# Anything in the history that was never announced is considered new.
newSets = [set_id for set_id in allSets if set_id not in oldSets]

# Persist the updated history so future runs can diff against it.
with open(os.path.join(script_dir, "all_sets.txt"), "w", encoding="utf-8") as f:
    f.writelines(set_id + '\n' for set_id in allSets)

# Announce the newly found sets on Discord (process_sets is a coroutine).
import asyncio
asyncio.run(process_sets(newSets))