whatscraper.py (173 changes: 124 additions & 49 deletions)
@@ -4,6 +4,7 @@
import urllib.request
import urllib.parse
import threading
import signal
from datetime import datetime
import argparse
import json
@@ -17,6 +18,18 @@
print("\n python3 -m pip install google")
exit()

# Global stop event for threads
stop_event = threading.Event()

# Signal handler for Ctrl+C
def signal_handler(sig, frame):
print("\n[!] Received Ctrl+C, stopping threads and exiting...")
stop_event.set() # Signal threads to stop
sys.exit(0)

# Register signal handler
signal.signal(signal.SIGINT, signal_handler)
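# Note: CPython delivers SIGINT to the main thread only, so this handler
# cannot interrupt worker threads directly; the workers cooperate by
# polling stop_event (and, being daemons, die with the interpreter anyway).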

GROUP_NAME_REGEX = re.compile(r'(og:title\" content=\")(.*?)(\")')
GROUP_IMAGE_REGEX = re.compile(r'(og:image\" content=\")(.*?)(\")')
lock = threading.Lock()
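# lock serializes the read-modify-write of the shared output file (and the
# optional image download) so concurrent workers cannot interleave partial
# JSON writes.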
@@ -62,15 +75,24 @@


def linkcheck(url):
if stop_event.is_set():
print("[DEBUG] linkcheck: Stop event set, exiting")
return {"name": None, "url": url, "image": None}
print("\nTrying URL:", url, end='\r')
group_info = {"name": None, "url": url, "image": None}
try:
hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
hdr = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Referer': 'https://www.google.com/'
}
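        # Browser-like headers make the request look less like a script;
        # some hosts refuse urllib's default User-Agent outright, though no
        # header set guarantees access.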
req = urllib.request.Request(url, headers=hdr)
resp = urllib.request.urlopen(req)
except Exception:
resp = urllib.request.urlopen(req, timeout=10)
except Exception as e:
print(f"[DEBUG] linkcheck: Failed to fetch {url}: {e}")
return group_info
if (resp.getcode() != 404):
if resp.getcode() != 404:
resp = resp.read().decode("utf-8")
group_info["name"] = unescape(GROUP_NAME_REGEX.search(resp).group(2))
group_info["image"] = unescape(GROUP_IMAGE_REGEX.search(resp).group(2))
@@ -85,6 +107,9 @@ def pad(s):


def scrape(txt, download_image=False):
if stop_event.is_set():
print("[DEBUG] scrape: Stop event set, exiting")
return
if isinstance(txt, bytes):
txt = txt.decode("utf-8")
match = []
@@ -93,58 +118,88 @@ def scrape(txt, download_image=False):
match = [item[0] for item in match2]
match = list(set(match))
for lmt in match:
if stop_event.is_set():
print("[DEBUG] scrape: Stop event set in loop, breaking")
break
lmt = pad(lmt)
info = linkcheck(lmt)
if info['name']:
print("[i] Group Name: ", info['name'])
print("[i] Group Link: ", info['url'])
print("[i] Group Image: ", info['image'])
lock.acquire()
if SAVE.endswith(".json"):
with open(SAVE, "r+", encoding='utf-8') as jsonFile:
data = json.load(jsonFile)
data.append(info)
jsonFile.seek(0)
json.dump(data, jsonFile)
jsonFile.truncate()
else:
with open(SAVE, "a", encoding='utf-8') as f:
write_data = " | ".join(info.values())+"\n"
f.write(write_data)
if download_image:
image_path = urllib.parse.urlparse(info['image'])
path, _ = urllib.request.urlretrieve(
info["image"], os.path.basename(image_path.path))
print("[i] Image Path: ", path)
lock.release()
try:
if SAVE.endswith(".json"):
with open(SAVE, "r+", encoding='utf-8') as jsonFile:
data = json.load(jsonFile)
data.append(info)
jsonFile.seek(0)
json.dump(data, jsonFile)
jsonFile.truncate()
else:
with open(SAVE, "a", encoding='utf-8') as f:
write_data = " | ".join(info.values()) + "\n"
f.write(write_data)
                if download_image and info["image"]:
                    image_path = urllib.parse.urlparse(info['image'])
                    # urllib.request.urlretrieve() accepts no timeout keyword;
                    # it relies on the global socket default timeout.
                    path, _ = urllib.request.urlretrieve(
                        info["image"], os.path.basename(image_path.path))
                    print("[i] Image Path: ", path)
finally:
lock.release()
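            # try/finally guarantees the lock is released even if the file
            # write or download raises, so other workers cannot deadlock;
            # `with lock:` would express the same idiom more compactly.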


def scrap_from_google(index):
print("[*] Initializing...")
if index >= len(availabledom):
if index >= len(availabledom) or stop_event.is_set():
print("[DEBUG] scrap_from_google: Stop event set or invalid index, exiting")
return
query = "intext:chat.whatsapp.com inurl:" + availabledom[index]
print("[*] Querying Google By Dorks ...")
for url in search(query, tld="com", num=10, stop=None, pause=2):
hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
req = urllib.request.Request(url, headers=hdr)
txt = urllib.request.urlopen(req).read().decode("utf8")
scrape(txt)
try:
for url in search(query, tld="com", num=5, stop=5, pause=1):
if stop_event.is_set():
print("[DEBUG] scrap_from_google: Stop event set in loop, breaking")
break
hdr = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Referer': 'https://www.google.com/'
}
req = urllib.request.Request(url, headers=hdr)
txt = urllib.request.urlopen(req, timeout=10).read().decode("utf8")
scrape(txt)
except Exception as e:
print(f"[DEBUG] scrap_from_google: Error during search: {e}")


def scrap_from_link(index):
print("[*] Initializing...")
if index >= len(site_urls):
if index >= len(site_urls) or stop_event.is_set():
print("[DEBUG] scrap_from_link: Stop event set or invalid index, exiting")
return
r = urllib.request.urlopen(site_urls[index]).read().decode()
scrape(r)
print(f"[DEBUG] scrap_from_link: Fetching {site_urls[index]}")
try:
hdr = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Referer': 'https://www.google.com/'
}
req = urllib.request.Request(site_urls[index], headers=hdr)
r = urllib.request.urlopen(req, timeout=10).read().decode()
scrape(r)
except Exception as e:
print(f"[DEBUG] scrap_from_link: Error fetching {site_urls[index]}: {e}")
        return  # nothing more to do for this URL; the thread ends when the function returns


def update_tool():
print("[*] Updating Please Wait...")
try:
txt = urllib.request.urlopen(
"https://github.com/TheSpeedX/WhatScraper/raw/master/whatscraper.py").read()
"https://github.com/TheSpeedX/WhatScraper/raw/master/whatscraper.py", timeout=10).read()
with open(sys.argv[0], "wb") as f:
f.write(txt)
print("[$] Update Successful")
@@ -158,13 +213,18 @@ def initialize_google_scrapper():
threads = []
size = len(availabledom)
prompt = "[#] Enter the number of threads(1-" + str(size) + "):- "
thread_count = min(size, int(input(prompt)))
    thread_count = max(1, min(size, int(input(prompt))))  # clamp to the 1..size range the prompt advertises
for i in range(thread_count):
thread = threading.Thread(target=scrap_from_google, args=(i,))
thread.daemon = True
thread.start()
threads.append(thread)
for i in threads:
i.join()
while i.is_alive() and not stop_event.is_set():
i.join(timeout=1.0) # Check stop_event periodically
if stop_event.is_set():
print("[DEBUG] initialize_google_scrapper: Stop event set, exiting join")
break
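    # Joining with a 1-second timeout keeps the main thread awake to notice
    # stop_event; with daemon threads, the process can still exit while a
    # worker is blocked inside a network call.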


def initialize_site_scrapper():
@@ -174,11 +234,15 @@ def initialize_site_scrapper():
thread_count = min(size, int(input(prompt)))
for i in range(thread_count):
thread = threading.Thread(target=scrap_from_link, args=(i,))
thread.daemon = True
thread.start()
threads.append(thread)

for i in threads:
i.join()
while i.is_alive() and not stop_event.is_set():
i.join(timeout=1.0) # Check stop_event periodically
if stop_event.is_set():
print("[DEBUG] initialize_site_scrapper: Stop event set, exiting join")
break


def initialize_file_scrapper():
@@ -195,13 +259,19 @@ def initialize_file_scrapper():
head = [next(strm) for x in range(op)]
thread = threading.Thread(
target=scrape, args=(b'\n'.join(head),))
thread.daemon = True
thread.start()
threads.append(thread)
thread = threading.Thread(target=scrape, args=(strm.read(),))
thread.daemon = True
thread.start()
threads.append(thread)
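    # The input file is consumed in fixed-size chunks of `op` lines, one
    # scrape() thread per chunk, and whatever remains is handed to a final
    # thread via strm.read().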
for i in threads:
i.join()
while i.is_alive() and not stop_event.is_set():
i.join(timeout=1.0) # Check stop_event periodically
if stop_event.is_set():
print("[DEBUG] initialize_file_scrapper: Stop event set, exiting join")
break


def main():
@@ -221,7 +291,7 @@ def main():
scrape(args.link, download_image=True)
return
if args.json:
SAVE = SAVE.split(".")[0]+".json"
SAVE = SAVE.split(".")[0] + ".json"
with open(SAVE, "w", encoding='utf-8') as jsonFile:
json.dump([], jsonFile)
print("""
@@ -233,21 +303,26 @@ def main():

try:
inp = int(input("[#] Enter Choice: "))
except Exception:
except ValueError:
print("\t[!] Invalid Choice..")
exit()

if inp == 1:
initialize_google_scrapper()
elif inp == 2:
initialize_site_scrapper()
elif inp == 3:
initialize_file_scrapper()
elif inp == 4:
update_tool()
else:
print("[!] Invalid Choice..")
try:
if inp == 1:
initialize_google_scrapper()
elif inp == 2:
initialize_site_scrapper()
elif inp == 3:
initialize_file_scrapper()
elif inp == 4:
update_tool()
else:
print("[!] Invalid Choice..")
except KeyboardInterrupt:
print("\n[!] Received Ctrl+C in main, stopping threads and exiting...")
stop_event.set()
sys.exit(0)


if __name__ == "__main__":
main()
main()