22
33from bs4 import BeautifulSoup
44from colorama import Fore
5-
5+ import requests
66import argparse
7- import html
8- import os
97import re
10- import requests
8+ import os
119
# Command-line options. Each one is optional; the parsed values are read
# from the module-level ``args`` namespace by the functions below.
parser = argparse.ArgumentParser(
    description='API to download lectures off msc-mu.com',
)
parser.add_argument(
    '-b', '--batch',
    type=int,
    metavar='',
    help='to specify batch number',
)
parser.add_argument(
    '-c', '--course',
    type=int,
    metavar='',
    help='to specify course number',
)
parser.add_argument(
    '-f', '--folder',
    type=str,
    metavar='',
    help='to specify destination folder',
)
args = parser.parse_args()
1715
# Default destination, relative to the user's home directory.
FOLDER = '/dox/med'

# Browser-like headers sent with the script's HTTP requests.
# The engine token must read "KHTML, like Gecko" to match a genuine Chrome
# User-Agent string; ``headers`` is kept as an alias of the same dict.
HEADERS = headers = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
}
2421
22+
2523def choose_batch ():
2624 batches = [
27- [1 , '2022 ' , 'https://msc-mu.com/level/17' ],
25+ [1 , 'Athar ' , 'https://msc-mu.com/level/17' ],
2826 [2 , 'Rou7' , 'https://msc-mu.com/level/16' ],
2927 [3 , 'Wateen' , 'https://msc-mu.com/level/15' ],
3028 [4 , 'Nabed' , 'https://msc-mu.com/level/14' ],
3129 [5 , 'Wareed' , 'https://msc-mu.com/level/13' ],
3230 [6 , 'Minors' , 'https://msc-mu.com/level/10' ],
33- [7 , 'Majors' , 'https://msc-mu.com/level/9' ]
31+ [7 , 'Majors' , 'https://msc-mu.com/level/9' ]
3432 ]
3533 print ('\n ' )
3634 if args .batch :
3735 batch_url = batches [args .batch - 1 ][2 ]
3836 print (Fore .GREEN + '\n [*] Searching' , batches [args .batch - 1 ][1 ] + '\' s batch...\n ' )
3937 return batch_url
4038 for batch in batches :
41- print (str (batch [0 ]) + ') ' + batch [1 ] )
39+ print (str (batch [0 ]) + ') ' + batch [1 ])
4240 selected_batch = input ('\n [*] Which batch are you?\n \n >> ' )
4341 try :
4442 selected_batch = int (selected_batch )
@@ -51,6 +49,7 @@ def choose_batch():
5149 print ('\n [*]Invalid Input\n ' )
5250 return choose_batch ()
5351
52+
5453def find_courses (url ):
5554 page = requests .get (url , headers = HEADERS )
5655 doc = BeautifulSoup (page .text , 'html.parser' )
@@ -63,18 +62,6 @@ def find_courses(url):
6362 courses .append ([x + 1 , course_name , course_number ])
6463 return courses
6564
66- def find_subject_folder (name , doc ):
67- if ''' not in name :
68- name = html .unescape (name )
69- else :
70- name = name .strip (''' )
71- name = html .unescape (name )
72- folder_source = doc .find_all ("a" , string = name )[0 ].parent .parent .parent .parent .parent .parent .parent .parent .parent .parent .parent .parent .parent .parent
73- folder = re .findall ('''</i>
74- (.*)
75-
76- </h6>''' , folder_source .decode ())
77- return folder [0 ]
7865
7966def choose_course (courses ):
8067 if args .course :
@@ -98,44 +85,6 @@ def choose_course(courses):
9885 print ('\n [*]Invalid Input\n ' )
9986 return choose_course (courses )
10087
101- def download_lectures (url , folder ):
102- course_page = requests .get (url , headers = HEADERS )
103- extensions = ['.pdf' , '.pptx' ]
104- for extension in extensions :
105- links = re .findall ('<a href="(.*)">.*' + extension + '</a>' , course_page .content .decode ())
106- names = re .findall ('<a href=".*">(.*)' + extension + '</a>' , course_page .content .decode ())
107- doc = BeautifulSoup (course_page .text , 'html.parser' )
108- y = 0
109- prev_sub_folder = None
110- subject_folders_list = []
111- for x , link in enumerate (links ):
112- link = link .strip () + extension
113- subject_folder = find_subject_folder (names [x ] + extension , doc )
114- if subject_folder != prev_sub_folder :
115- if subject_folder in subject_folders_list :
116- subject_folder = subject_folder + '-extras'
117- y = 0
118- new_name = str (y + 1 ) + '. ' + names [x ] + extension
119- y += 1
120- subject_folders_list .append (subject_folder )
121- prev_sub_folder = subject_folder
122- file_path = folder + subject_folder + '/' + new_name
123- if os .path .isfile (file_path ):
124- if new_name .startswith ('1.' ):
125- print ('\n ################ ' + subject_folder + ' ################\n ' )
126- print ( Fore .MAGENTA + new_name + ' <is already downloaded there XD>' + Fore .RESET )
127- continue
128- if not os .path .isdir (folder + subject_folder ):
129- os .makedirs (folder + subject_folder )
130- print ('\n ################ ' + subject_folder + ' ################\n ' )
131-
132- response = requests .get (link , headers = HEADERS )
133- with open (file_path , 'wb' ) as file :
134- file .write (response .content )
135- print ('[*] Downloaded ' + new_name )
136-
137-
138- # If not specified, prompt the user to input a folder
13988
14089def choose_folder ():
14190 folder = os .path .expanduser ("~" ) + FOLDER
@@ -144,15 +93,17 @@ def choose_folder():
14493 args .folder = os .path .expanduser (args .folder )
14594 if os .path .isdir (args .folder ):
14695 folder = args .folder
96+ if not folder [- 1 ] == os .path .sep :
97+ folder = folder + os .path .sep
14798 return folder
14899 else :
149100 print ('\n [*] Folder Not found! ' , end = '' )
150101 quit ()
151102 else :
152- answer = input ('[*] Your default destination is ' + folder + "\n [*] Do you want to keep that (Y/n): " )
103+ answer = input ('[*] Your default destination is ' + folder + "\n [*] Do you want to keep that (Y/n): " )
153104 if answer == 'n' or answer == 'no' or answer == 'N' :
154105 valid_folder = False
155- while valid_folder == False :
106+ while not valid_folder :
156107 selected_folder = input ('\n [*] Enter the Folder you want to save material in.\n \n >> ' )
157108 # Adds a seperator at the end if the user didn't
158109 if not selected_folder .endswith (os .path .sep ):
@@ -163,41 +114,103 @@ def choose_folder():
163114 valid_folder = True
164115 else :
165116 print ('\n [*] Folder Not found! ' , end = '' )
117+ if not folder [- 1 ] == os .path .sep :
118+ folder = folder + os .path .sep
166119 return folder
167120
168- # Gets the name of the course from the course number, and makes a folder with that name
121+
def create_nav_links_dictionary(soup):
    """Map each navigation tab's target id to its displayed title.

    Looks at every ``<li class="nav-item">`` in ``soup``. For items that
    contain an ``<h5>``, the stripped heading text is stored under the value
    of the item's first ``<a>`` tag's ``aria-controls`` attribute.

    Items without a heading, without an anchor, or without an
    ``aria-controls`` value are skipped instead of raising or producing a
    ``None`` key.

    Args:
        soup: Parsed course page (an object offering ``find_all``).

    Returns:
        dict mapping ``aria-controls`` values to tab titles.
    """
    navigate_dict = {}
    for nav_item in soup.find_all('li', attrs={"class": "nav-item"}):
        heading = nav_item.h5
        anchor = nav_item.a
        if not heading or anchor is None:
            continue
        pane_id = anchor.get('aria-controls')
        if pane_id is None:
            continue
        navigate_dict[pane_id] = heading.text.strip()
    return navigate_dict
131+
169132
def make_course_folder(courses, index, folder):
    """Create (if needed) and return the folder for the selected course.

    Args:
        courses: Sequence of ``[position, course_name, course_number]``
            entries.
        index: Course number to look up; compared against ``course[2]``.
        folder: Base destination folder, with or without a trailing
            separator.

    Returns:
        Path of the course folder, always ending with ``os.path.sep``.

    Raises:
        ValueError: If no entry in ``courses`` has the requested number.
    """
    course_name = next(
        (course[1] for course in courses if course[2] == index),
        None,
    )
    if course_name is None:
        raise ValueError('No course with number ' + repr(index) + ' was found')

    # Joining with a trailing '' keeps the final separator and works whether
    # or not ``folder`` already ends with one.
    new_folder = os.path.join(folder, course_name, '')
    # makedirs also creates a missing base folder and tolerates an existing one.
    os.makedirs(new_folder, exist_ok=True)
    return new_folder
181144
145+
def find_files_paths_and_links(navigation_dict, soup):
    """Collect every PDF/PPT link on the page with its relative folder.

    For each ``<a>`` whose text contains ``.pdf`` or ``.ppt`` the ancestors
    are walked up to the document root:

    * every ``<div class="mb-3">`` contributes its ``<h6>`` text as one
      folder level;
    * the ``id`` of the enclosing ``<div class="tab-pane">`` is looked up in
      ``navigation_dict`` to get the top-level folder.

    Links are returned once each, in document order. The tab id is tracked
    per file, so a link outside any tab pane does not inherit the previous
    file's tab. A pane id missing from ``navigation_dict`` or a section
    without an ``<h6>`` simply adds no folder level.

    Args:
        navigation_dict: Mapping of tab-pane ids to tab titles.
        soup: Parsed course page (an object offering ``find_all``).

    Returns:
        List of ``[relative_dir, href, link_text]`` entries, where
        ``relative_dir`` is empty or ends with ``os.path.sep``. Empty list
        (after printing a notice) when the page has no matching links.
    """
    file_tags = soup.find_all(
        'a',
        string=lambda text: bool(text) and ('.pdf' in text or '.ppt' in text),
    )
    if not file_tags:
        print('no pdf or pptx files!')
        return []

    files_list = []
    for file_tag in file_tags:
        sections = []  # innermost section first
        nav_link_id = None
        ancestor = file_tag.parent
        while ancestor is not None:
            if ancestor.name == 'div':
                classes = ancestor.get('class', [])
                if 'mb-3' in classes and ancestor.h6 is not None:
                    sections.append(ancestor.h6.text.strip())
                if 'tab-pane' in classes:
                    nav_link_id = ancestor.get('id')
            ancestor = ancestor.parent

        parts = []
        nav_name = navigation_dict.get(nav_link_id)
        if nav_name is not None:
            parts.append(nav_name)
        parts.extend(reversed(sections))

        # The trailing '' makes join() end the path with a separator.
        file_path = os.path.join(*parts, '')
        files_list.append([file_path, file_tag.get('href'), file_tag.text])
    return files_list
173+
174+
def download_from_dict(path_link_dict, folder):
    """Download every listed file into ``folder``, skipping existing ones.

    Args:
        path_link_dict: Iterable of ``(relative_dir, url, file_name)``
            entries.
        folder: Destination root; each file is written to
            ``folder + relative_dir + file_name``.

    A file that already exists is reported and skipped. An entry without a
    link, or whose request fails or returns an HTTP error status, is reported
    and skipped so an error page is never saved as a lecture and one bad
    link does not abort the remaining downloads.
    """
    for path, link, name in path_link_dict:
        target_dir = folder + path
        target_file = target_dir + name

        if os.path.isfile(target_file):
            print(Fore.MAGENTA + path + name + ' <is already downloaded there XD>' + Fore.RESET)
            continue

        if not link:
            print(Fore.RED + '[*] No link found for ' + name + Fore.RESET)
            continue

        try:
            # The timeout keeps a stalled connection from hanging the script.
            response = requests.get(link, headers=HEADERS, timeout=60)
            response.raise_for_status()
        except requests.RequestException as error:
            print(Fore.RED + '[*] Failed to download ' + name + ' (' + str(error) + ')' + Fore.RESET)
            continue

        # Create the directory only once there is something to store in it.
        os.makedirs(target_dir, exist_ok=True)
        with open(target_file, 'wb') as file:
            file.write(response.content)
        print('[*] Downloaded ' + name)
189+
190+
def main():
    """Run the full flow: pick folder, batch and course, then download.

    The destination folder is chosen first, then the batch and its course
    list, then the course. The course page is fetched and parsed, and every
    file found on it is downloaded into the course folder.
    """
    destination = choose_folder()
    courses = find_courses(choose_batch())
    course_number = choose_course(courses)
    course_folder = make_course_folder(courses, course_number, destination)

    response = requests.get(
        'https://msc-mu.com/courses/' + course_number,
        headers=HEADERS,
    )
    soup = BeautifulSoup(response.text, 'html.parser')

    files = find_files_paths_and_links(create_nav_links_dictionary(soup), soup)
    download_from_dict(files, course_folder)
204+
190205
if __name__ == '__main__':
    # Banner line shown before any prompt.
    banner = Fore.CYAN + '#' * 54 + Fore.RESET
    print(banner)
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl+C: say goodbye and leave without the final prompt.
        print(Fore.RED + '\n [*] KeyboardInterrupt')
        print(Fore.GREEN + '[*] Good bye!')
        quit()
    else:
        print(Fore.GREEN + '\n \n [*] Done...')
        print('[*] Goodbye!')
        # Wait for Enter before exiting.
        input('[*] Press anything to' + Fore.RED + ' exit')
0 commit comments