
Commit aceb131

Merge branch 'main' into add_magnetdl_scraper
2 parents bbb57ea + 0eacb2c

3 files changed: +178 −1 lines changed

scraper/services/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,7 @@
 from base import *
 #import child modules
 from scraper.services import rarbg
+from scraper.services import rarbgv2
 from scraper.services import x1337
 from scraper.services import jackett
 from scraper.services import prowlarr
@@ -19,7 +20,7 @@

 #define subclass method
 def __subclasses__():
-    return [rarbg,x1337,jackett,prowlarr,orionoid,nyaa,torrentio,zilean,torbox,mediafusion,comet,eztv,thepiratebay,torrentgalaxy,yts,magnetdl]
+    return [rarbg,rarbgv2,x1337,jackett,prowlarr,orionoid,nyaa,torrentio,zilean,torbox,mediafusion,comet,eztv,thepiratebay,torrentgalaxy,yts,limetorrents,magnetdl]

 active = ['torrentio']
 overwrite = []
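For context, the __subclasses__() list acts as the scraper registry: callers iterate it and invoke scrape() on each module whose name appears in active. A minimal sketch of that dispatch, assuming each module follows the name/scrape(query, altquery) convention used by the files in this commit (the scrape_all aggregator itself is hypothetical, not part of the repository):

import scraper.services as services

def scrape_all(query, altquery):
    # hypothetical aggregator: fan the query out to every active scraper module
    results = []
    for module in services.__subclasses__():
        if module.name in services.active:
            results += module.scrape(query, altquery)
    return results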

scraper/services/limetorrents.py

Lines changed: 86 additions & 0 deletions
import urllib.error
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

# ui.ui_print's wildcard import also provides ui_print, ui_settings and the regex module
from ui.ui_print import *
import releases

name = "limetorrents"
session = urllib.request.build_opener()


def setup(cls, new=False):
    from scraper.services import setup
    setup(cls, new)


def scrape(query, altquery):
    from scraper.services import active
    scraped_releases = []
    if 'limetorrents' in active:
        ui_print("[limetorrents] using extended query: " + query, ui_settings.debug)

        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'}
        url = 'https://www.limetorrents.lol/search/all/' + urllib.parse.quote(query, safe=':/') + '/'
        response = None
        try:
            ui_print("[limetorrents] Sending GET request to URL: " + url, ui_settings.debug)
            request = urllib.request.Request(url, headers=headers)
            response = session.open(request)
            status_code = response.getcode()

            if status_code == 200:
                content = response.read().decode('utf-8', errors='ignore')
                soup = BeautifulSoup(content, 'html.parser')
                # skip the first four rows (header/filler rows), keep the result rows
                torrentList = soup.select('tr:has(td.tdleft)')[4:]
                if torrentList:
                    ui_print(f"[limetorrents] Found {len(torrentList)} torrent(s)", ui_settings.debug)
                    for torrent in torrentList:
                        title_element = torrent.select_one('div.tt-name a:nth-of-type(2)')
                        title = title_element.get_text() if title_element else 'Unknown Title'
                        # normalize the title to dotted scene style; a better home for this
                        # would be classes.py, next to the regex bits that check for matches
                        title = regex.sub(r'[^\w\s\.\-]', '', title)
                        title = title.replace(" ", '.')
                        title = regex.sub(r'\.+', ".", title)
                        if regex.match(r'(' + altquery.replace('.', r'\.').replace(r"\.*", ".*") + ')', title, regex.I):
                            # prefer a direct magnet link; otherwise derive one from the
                            # itorrents.org .torrent link, whose filename is the info hash
                            download_element = torrent.select_one('td.item-icons a[href^="magnet"]')
                            if download_element:
                                download = download_element['href']
                            else:
                                link_element = torrent.select_one('div.tt-name a:nth-of-type(1)')
                                link = link_element['href'] if link_element else '#'
                                download = link.replace("http://itorrents.org/torrent/", "magnet:?xt=urn:btih:").replace(".torrent?title=", "&dn=")

                            size_element = torrent.select_one('td.tdnormal:nth-of-type(3)')
                            size = size_element.get_text().strip() if size_element else '0 GB'
                            size_match = regex.search(r'([0-9]*\.?[0-9]+)\s*(KB|MB|GB)', size, regex.I)

                            seeders_element = torrent.select_one('td.tdseed')
                            seeders = int(seeders_element.get_text().strip()) if seeders_element else 0

                            # convert the reported size to GB
                            if size_match:
                                size_value = float(size_match.group(1))
                                size_unit = size_match.group(2).upper()
                                if size_unit == 'KB':
                                    size = size_value / (1024 * 1024)
                                elif size_unit == 'MB':
                                    size = size_value / 1024
                                elif size_unit == 'GB':
                                    size = size_value
                            else:
                                size = 0.0  # size string was unparseable

                            scraped_releases += [releases.release('[limetorrents]', 'torrent', title, [], size, [download], seeders=seeders)]
                            ui_print(f"[limetorrents] Scraped release: title={title}, size={size:.2f} GB, seeders={seeders}", ui_settings.debug)
                else:
                    ui_print("[limetorrents] No torrents found", ui_settings.debug)
            else:
                ui_print("[limetorrents] Failed to retrieve the page. Status code: " + str(status_code), ui_settings.debug)

        except Exception as e:
            # non-2xx responses surface as urllib.error.HTTPError; urllib response
            # objects have no requests-style .status_code attribute
            if isinstance(e, urllib.error.HTTPError):
                ui_print('[limetorrents] error ' + str(e.code) + ': limetorrents is temporarily not reachable')
            else:
                ui_print('[limetorrents] error: unknown error. turn on debug printing for more information.')
            response = None
            ui_print('[limetorrents] error: exception: ' + str(e), ui_settings.debug)
    return scraped_releases
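Both this scraper and rarbgv2 below repeat the same size-string-to-GB conversion block. If that logic were ever factored out, a small shared helper would do; a sketch under that assumption (size_to_gb is a hypothetical name, not part of this commit, and the standard-library re is used here for self-containment):

import re

def size_to_gb(size_str):
    # parse strings like "1.4 GB", "700 MB" or "512 KB" into a float in GB;
    # return 0.0 when the string is unparseable
    match = re.search(r'([0-9]*\.?[0-9]+)\s*(KB|MB|GB)', size_str, re.I)
    if not match:
        return 0.0
    value = float(match.group(1))
    divisor = {'KB': 1024 * 1024, 'MB': 1024, 'GB': 1}[match.group(2).upper()]
    return value / divisor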

scraper/services/rarbgv2.py

Lines changed: 90 additions & 0 deletions
import urllib.error
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

# ui.ui_print's wildcard import also provides ui_print, ui_settings and the regex module
from ui.ui_print import *
import releases

name = "rarbgv2"
session = urllib.request.build_opener()


def setup(cls, new=False):
    from scraper.services import setup
    setup(cls, new)


def scrape(query, altquery):
    from scraper.services import active
    scraped_releases = []
    if 'rarbgv2' in active:
        ui_print("[rarbg] using extended query: " + query, ui_settings.debug)
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'}
        url = 'http://therarbg.com/get-posts/keywords:' + urllib.parse.quote(query.replace('.', ' ').replace('?', ''), safe=':/') + '/'
        response = None
        try:
            ui_print("[rarbg] Sending GET request to URL: " + url, ui_settings.debug)
            request = urllib.request.Request(url, headers=headers)
            response = session.open(request)
            status_code = response.getcode()

            if status_code == 200:
                content = response.read().decode('utf-8', errors='ignore')
                soup = BeautifulSoup(content, 'html.parser')
                # the results table has no stable classes, so match on inline styles
                torrentList = soup.select('a[href*="/post-detail/"]')
                sizeList = soup.select('td[style*="left"]')
                seederList = soup.select('td[style*="color: green"]')
                if torrentList:
                    ui_print(f"[rarbg] Found {len(torrentList)} torrent(s)", ui_settings.debug)
                    for count, torrent in enumerate(torrentList):
                        title = torrent.getText().strip()
                        # normalize the title to dotted scene style
                        title = regex.sub(r'[^\w\s\.\-]', '', title)
                        title = title.replace(" ", '.')
                        title = regex.sub(r'\.+', ".", title)
                        ui_print("[rarbg] Processing torrent: " + title, ui_settings.debug)
                        if regex.match(r'(' + altquery.replace('.', r'\.').replace(r"\.*", ".*") + ')', title, regex.I):
                            # fetch the detail page to find the magnet link
                            link = torrent['href']
                            request = urllib.request.Request(escape_url('http://therarbg.com' + link), headers=headers)
                            response = session.open(request)
                            content = response.read().decode('utf-8', errors='ignore')
                            soup = BeautifulSoup(content, 'html.parser')
                            download = soup.select('a[href^="magnet"]')[0]['href']
                            seeders = seederList[count].contents[0]
                            # the size cell pads its value with non-breaking spaces
                            size = sizeList[count].contents[0].replace('\xa0', ' ')
                            size_match = regex.search(r'([0-9]*\.?[0-9]+)\s*(KB|MB|GB)', size, regex.I)

                            if size_match:
                                size_value = float(size_match.group(1))
                                size_unit = size_match.group(2).upper()
                                if size_unit == 'KB':
                                    size = size_value / (1024 * 1024)  # Convert KB to GB
                                elif size_unit == 'MB':
                                    size = size_value / 1024  # Convert MB to GB
                                elif size_unit == 'GB':
                                    size = size_value
                            else:
                                size = 0.0  # size string was unparseable

                            scraped_releases += [releases.release('[rarbg]', 'torrent', title, [], size, [download], seeders=int(seeders))]
                            ui_print(f"[rarbg] Scraped release: title={title}, size={size} GB, seeders={seeders}", ui_settings.debug)
                else:
                    ui_print("[rarbg] No torrents found", ui_settings.debug)
            else:
                ui_print("[rarbg] Failed to retrieve the page. Status code: " + str(status_code), ui_settings.debug)

        except Exception as e:
            # non-2xx responses surface as urllib.error.HTTPError; urllib response
            # objects have no requests-style .status_code attribute
            if isinstance(e, urllib.error.HTTPError):
                ui_print('[rarbg] error ' + str(e.code) + ': rarbg is temporarily not reachable')
            else:
                ui_print('[rarbg] error: unknown error. turn on debug printing for more information.')
            response = None
            ui_print('[rarbg] error: exception: ' + str(e), ui_settings.debug)
    return scraped_releases


# properly escapes any non-ASCII characters in a URL
def escape_url(url):
    parts = urllib.parse.urlsplit(url)
    path = urllib.parse.quote(parts.path)
    query = urllib.parse.quote(parts.query, safe="=&?")  # adjust safe characters as needed
    return urllib.parse.urlunsplit((parts.scheme, parts.netloc, path, query, parts.fragment))
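As a quick illustration of what escape_url does to a detail-page path containing non-ASCII characters (the URL below is made up for the example):

>>> escape_url('http://therarbg.com/post-detail/abc123/Amélie-2001/')
'http://therarbg.com/post-detail/abc123/Am%C3%A9lie-2001/'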
