Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
2073bd0
experimenting qss
Sakth1 Oct 26, 2025
dfefbaf
cleaned up gitignore
Sakth1 Oct 26, 2025
c0fd176
cleaned up gitignore
Sakth1 Oct 26, 2025
2aa8baf
improved ui
Sakth1 Oct 27, 2025
e30e3a8
added appstate to store variables
Sakth1 Oct 27, 2025
fe570a9
Merge pull request #60 from Sakth1/main
Sakth1 Oct 27, 2025
67d632b
started on video page
Sakth1 Oct 27, 2025
2d87dcf
ui improvements
Sakth1 Oct 27, 2025
2d76d12
Merge branch 'Dev-official' of https://github.com/Sakth1/youtube_tran…
Sakth1 Oct 27, 2025
5d8a824
proxies not working
Sakth1 Oct 27, 2025
4837270
got the proxies working
Sakth1 Oct 28, 2025
d657abb
proxy is working now
Sakth1 Oct 28, 2025
18fb1a5
added threaded proxy validation and initiated thumbnail download
Sakth1 Oct 28, 2025
287f4a9
add update db when UNIQUE constraint
Sakth1 Oct 28, 2025
ad098a3
fixed downloading bug
Sakth1 Oct 28, 2025
6ae227f
Revert "Bump aiofiles from 24.1.0 to 25.1.0"
Sakth1 Oct 28, 2025
a38a77a
Merge pull request #66 from Sakth1/revert-65-dependabot/pip/aiofiles-…
Sakth1 Oct 28, 2025
6bae921
Add target branch for Dependabot updates
Sakth1 Oct 28, 2025
168ba69
sync (#67)
Sakth1 Oct 28, 2025
0854081
sync (#68)
Sakth1 Oct 28, 2025
3152ffa
corrections
Sakth1 Oct 29, 2025
a8251ab
Merge branch 'Dev-official' of https://github.com/Sakth1/youtube_tran…
Sakth1 Oct 29, 2025
83d6f9a
reworking search
Sakth1 Oct 29, 2025
a2cbd4a
added centralized proxy url in app_state
Sakth1 Oct 29, 2025
8988a11
Tried to work out utilizing proxies
Sakth1 Oct 29, 2025
b6600c6
Merge branch 'Dev-official' of https://github.com/Sakth1/youtube_tran…
Sakth1 Oct 29, 2025
6cd292c
fixed conflict
Sakth1 Oct 29, 2025
774fc95
Tried to fix proxy handling
Sakth1 Oct 29, 2025
154ebe3
minor changes to proxy
Sakth1 Oct 30, 2025
ea7d788
minor changes to proxy
Sakth1 Oct 30, 2025
726c517
Merge branch 'Dev-official' of https://github.com/Sakth1/youtube_tran…
Sakth1 Oct 30, 2025
c61eab0
TODO: Can't get valid proxy from swiftshadow
Sakth1 Oct 30, 2025
eb0b672
Removed swiftshadow, scraping proxies manually
Sakth1 Oct 31, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ updates:
directory: "/" # Location of package manifests
schedule:
interval: "weekly"
target-branch: "Dev-official"
24 changes: 16 additions & 8 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
.vscode/
# Virtual environments
.venv/
UI/__pycache__
Backend/__pycache__
trial.py
audio/

# IDE and editor settings
.vscode/

# Python cache files
__pycache__/
UI/__pycache__/
Backend/__pycache__/
Data/__pycache__/
utils/__pycache__/

# Temporary or generated files
output/
try/
Data/__pycache__
utils/__pycache__
experiments/

# Notes and documentation drafts
notes.excalidraw
Dev_Dairy.md
30 changes: 18 additions & 12 deletions Backend/ScrapeChannel.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,21 @@
import scrapetube
from pathlib import Path
from datetime import datetime
import urllib.request

from Data.DatabaseManager import DatabaseManager
from utils.Proxy import Proxy

def download_with_proxy(url, save_path, proxy_url=None):
    """Download *url* to *save_path* through an HTTP(S) proxy.

    No-op when ``proxy_url`` is ``None``. Failures are best-effort: they are
    printed, not raised, and any partially written file is removed so callers
    never find a truncated download on disk.

    Args:
        url: Remote resource to fetch.
        save_path: Local filesystem path to write the bytes to.
        proxy_url: Proxy endpoint used for both http and https traffic.
    """
    if proxy_url is None:
        return

    # Imported lazily so the module stays importable without `requests`.
    import requests
    try:
        # `with` ensures the streamed connection is released even on error.
        with requests.get(url,
                          proxies={'http': proxy_url, 'https': proxy_url},
                          timeout=15.0, stream=True) as response:
            response.raise_for_status()
            with open(save_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
    except Exception as e:
        print(f"[ERROR] Failed to download {url}: {e}")
        # Remove any partial file so a half-written image is never kept.
        try:
            import os
            os.remove(save_path)
        except OSError:
            pass

class Search:
def __init__(self, db: DatabaseManager):
Expand All @@ -15,29 +25,25 @@ def __init__(self, db: DatabaseManager):
def search_channel(self, name: str = None, limit: int = 6):
if not name:
return {"None": {"title": None, "url": None}}

proxy = Proxy().get_proxy()
if proxy:
pass
else:
proxy = None

self.channels = {}
search_results = scrapetube.get_search(name, results_type="channel", limit=limit)
proxy_url = Proxy().get_working_proxy()

for ch in search_results:
title = ch.get("title", {}).get("simpleText")
sub_count = ch.get("videoCountText", {}).get("accessibility", {}).get("accessibilityData", {}).get("label")
desc = ch.get("descriptionSnippet", {}).get("runs")[0].get("text") if ch.get("descriptionSnippet") else None
channel_id = ch.get("channelId")
profile_url = "https:" + ch.get("thumbnail", {}).get("thumbnails")[0].get("url")

try:
profile_save_path = rf"{self.db.profile_pic_dir}/{channel_id}.png"
urllib.request.urlretrieve(profile_url, profile_save_path)
print(f'pic saved to {profile_save_path}')
download_with_proxy(profile_url, profile_save_path, proxy_url)
except Exception as e:
print(f"Failed to save profile picture: {e}")
import traceback
traceback.print_exc()

if channel_id:
url = f"https://www.youtube.com/channel/{channel_id}"
Expand Down
152 changes: 53 additions & 99 deletions Backend/ScrapeVideo.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,25 @@
import yt_dlp
from datetime import datetime
from pathlib import Path
import os

from utils.Proxy import Proxy
from Data.DatabaseManager import DatabaseManager # Your DB class

from Data.DatabaseManager import DatabaseManager


def download_with_proxy(url, save_path, proxy_url=None):
    """Download *url* to *save_path* through an HTTP(S) proxy.

    No-op when ``proxy_url`` is ``None``. Failures are best-effort: they are
    printed, not raised, and any partially written file is removed so callers
    never find a truncated download on disk.

    Args:
        url: Remote resource to fetch.
        save_path: Local filesystem path to write the bytes to.
        proxy_url: Proxy endpoint used for both http and https traffic.
    """
    if proxy_url is None:
        return

    # Imported lazily so the module stays importable without `requests`.
    import requests
    try:
        # `with` ensures the streamed connection is released even on error.
        with requests.get(url,
                          proxies={'http': proxy_url, 'https': proxy_url},
                          timeout=15.0, stream=True) as response:
            response.raise_for_status()
            with open(save_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
    except Exception as e:
        print(f"[ERROR] Failed to download {url}: {e}")
        # Remove any partial file so a half-written thumbnail is never kept.
        try:
            import os
            os.remove(save_path)
        except OSError:
            pass

class Videos:
def __init__(self, db: DatabaseManager):
Expand All @@ -24,12 +39,6 @@ def fetch_video_urls(self, channel_id: int, channel_url: str):
'quiet': True,
}

# Choose proxy
proxy = Proxy().get_proxy()
if proxy:
ydl_opts['proxy'] = proxy
print(f"[INFO] Using proxy for videos: {proxy}")

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
info = ydl.extract_info(channel_url, download=False)

Expand All @@ -41,102 +50,47 @@ def fetch_video_urls(self, channel_id: int, channel_url: str):

for entry in entries:
entry_name = entry.get('title')
proxy_url = Proxy().get_working_proxy()

# --- Normal Videos ---
if entry_name == f'{channel_name} - Videos':
video_entries = entry.get('entries')
for i, video_entry in enumerate(video_entries):
video_id = video_entry.get('id')
title = video_entry.get('title')
url = video_entry.get('url')
views = video_entry.get('view_count')
duration = video_entry.get('duration')

# Save JSON file for raw data
file_path = self.db.save_json_file(
self.db.video_dir, f"video_{video_id}", video_entry
)

# Insert into DB
self.db.insert("VIDEO", {
"channel_id": channel_id,
"title": title,
"desc": video_entry.get("description"),
"duration": duration,
"view_count": views,
"like_count": video_entry.get("like_count"),
"pub_date": video_entry.get("upload_date"),
"status": "active",
"created_at": datetime.now().isoformat(),
"file_path": str(file_path)
})

self.videos[i] = {
"title": title,
"url": url,
"views": views,
"duration": duration
}
self.video_url.append(url)

# --- Live Videos ---
elif entry_name == f'{channel_name} - Live':
live_entries = entry.get('entries')
for i, live_entry in enumerate(live_entries):
title = live_entry.get('title')
url = live_entry.get('url')
views = live_entry.get('view_count')
duration = live_entry.get('duration')

self.live[i] = {
"title": title,
"url": url,
"views": views,
"duration": duration
}

# --- Shorts ---
video_type = 'video'
elif entry_name == f'{channel_name} - Shorts':
shorts_entries = entry.get('entries')
for i, shorts_entry in enumerate(shorts_entries):
title = shorts_entry.get('title')
url = shorts_entry.get('url')
views = shorts_entry.get('view_count')
duration = shorts_entry.get('duration')

self.shorts[i] = {
"title": title,
"url": url,
"views": views,
"duration": duration
}

# Final structured dict (for immediate use)
self.content = {
"live": self.live,
"shorts": self.shorts,
"videos": self.videos,
"video_url": self.video_url
}

return self.content
video_type = 'shorts'
elif entry_name == f'{channel_name} - Live':
video_type = 'live'

video_entries = entry.get('entries')
for i, video_entry in enumerate(video_entries):
video_id = video_entry.get('id')
title = video_entry.get('title')
url = video_entry.get('url')
views = video_entry.get('view_count')
duration = video_entry.get('duration')

thumbnail_url = video_entry.get("thumbnails")[-1].get("url")
os.makedirs(f"{self.db.thumbnail_dir}/{channel_id}", exist_ok=True)
profile_save_path = rf"{self.db.thumbnail_dir}/{channel_id}/{video_id}.png"
download_with_proxy(thumbnail_url, profile_save_path, proxy_url)

# Insert into DB
self.db.insert("VIDEO", {
"video_id": video_id,
"channel_id": channel_id,
"video_type": video_type,
"video_url": url,
"title": title,
"desc": video_entry.get("description"),
"duration": duration,
"view_count": views,
"like_count": video_entry.get("like_count"),
"pub_date": video_entry.get("upload_date"),
})

return

except Exception as e:
import traceback
traceback.print_exc()
print(f"Error while fetching video URLs: {e}")
return {}


if __name__ == "__main__":
    # Manual smoke test: fetch videos for a known channel and show what was
    # persisted to the database.
    db = DatabaseManager()
    videos = Videos(db)

    # Assumes a CHANNEL row with this id was already inserted.
    channel_id = 1
    channel_url = "https://www.youtube.com/@mrbeast"

    results = videos.fetch_video_urls(channel_id, channel_url)

    # fetch_video_urls does not always return a populated dict (it returns {}
    # on error), so guard the lookup instead of indexing unconditionally.
    if isinstance(results, dict) and "videos" in results:
        print("Fetched:", results["videos"])
    print("Saved video entries in DB:", db.fetch("VIDEO", "channel_id=?", (channel_id,)))

    db.close()
41 changes: 34 additions & 7 deletions Data/DatabaseManager.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,16 @@ def __init__(self, base_dir: Optional[str] = None, db_name: str = "data.db"):
self.channel_dir = self.base_dir / "Channels"
self.profile_pic_dir = self.base_dir / "ProfilePics"
self.transcript_dir = self.base_dir / "Transcripts"
self.thumbnail_dir = self.base_dir / "Thumbnails"
self.comment_dir = self.base_dir / "Comments"
self.proxy_dir = self.base_dir / "Proxies"
self.video_dir = self.base_dir / "Videos"

# Ensure directories exist
for folder in [self.db_dir, self.transcript_dir, self.comment_dir, self.proxy_dir, self.video_dir, self.channel_dir, self.profile_pic_dir]:
for folder in [self.db_dir, self.transcript_dir, self.comment_dir,
self.proxy_dir, self.video_dir, self.channel_dir,
self.profile_pic_dir, self.thumbnail_dir]:
folder.mkdir(parents=True, exist_ok=True)
print(f"Created directory: {folder}")

# Thread-local storage for database connections
self._local = threading.local()
Expand Down Expand Up @@ -61,7 +63,9 @@ def _create_tables(self):

CREATE TABLE IF NOT EXISTS VIDEO (
video_id TEXT PRIMARY KEY,
channel_id TEXT,
channel_id TEXT,
video_type TEXT,
video_url TEXT,
title TEXT,
desc TEXT,
duration TEXT,
Expand Down Expand Up @@ -97,10 +101,33 @@ def insert(self, table: str, data: Dict[str, Any]) -> int:
values = tuple(data.values())
query = f"INSERT INTO {table} ({keys}) VALUES ({placeholders})"
cursor = conn.cursor()
cursor.execute(f"SELECT * FROM {table}")
cursor.execute(query, values)
conn.commit()
return cursor.lastrowid

try:
cursor.execute(query, values)
conn.commit()
return cursor.lastrowid
except sqlite3.IntegrityError as e:
# If it's a UNIQUE constraint error, try updating instead
if "UNIQUE constraint failed" in str(e):
# Extract the primary key column name from the error or table
if table == "VIDEO":
pk_column = "video_id"
elif table == "CHANNEL":
pk_column = "channel_id"
else:
# For other tables with auto-increment primary keys, re-raise
raise

# Get the primary key value from data
if pk_column in data:
pk_value = data[pk_column]
# Update instead of insert
update_data = {k: v for k, v in data.items() if k != pk_column}
return self.update(table, update_data, f"{pk_column}=?", (pk_value,))
else:
raise
else:
raise

def fetch(self, table: str, where: Optional[str] = None, params: Tuple = ()) -> List[Dict[str, Any]]:
conn = self._get_connection()
Expand Down
Loading