1
+ from selenium import webdriver
2
+ from selenium .webdriver .common .by import By
3
+ from selenium .webdriver .chrome .service import Service
4
+ from selenium .webdriver .support .ui import WebDriverWait
5
+ from selenium .webdriver .support import expected_conditions as EC
6
+ from selenium .webdriver .chrome .options import Options
7
+
8
+ """
9
+ Example usage:
10
+ python_scraper = Courses("python",5)
11
+ print(python_scraper.scrape_all())
12
+ """
13
class Courses:
    """Scrape Coursera search results for a keyword using headless Chrome.

    Every public method opens its own browser session, walks up to
    ``page_count`` result pages and returns a dict of the form
    ``{"data": ..., "message": ...}``. ``data`` is ``None`` when scraping
    failed; the browser is always closed before the method returns.
    """

    # CSS selector for the individual result cards on the search page.
    _COURSE_SELECTOR = 'main ul>li'
    # CSS selector for the pagination control that loads the next page.
    _NEXT_BUTTON_SELECTOR = 'button[aria-label="Next Page"]'

    # JavaScript executed against one result card (``arguments[0]``) to pull
    # out a single field. The optional chaining makes a missing node yield
    # ``null`` (``None`` in Python) instead of throwing.
    _FIELD_SCRIPTS = {
        "title": 'return arguments[0].querySelector("h3")?.innerText',
        "description": 'return arguments[0].querySelector("p>span")?.innerText',
        # "\\n\\n" reaches the browser as the JS escape "\n\n", so the first
        # blank line in the rating text is replaced by a star.
        "review": (
            'return arguments[0].querySelector("div:has(>svg)")'
            '?.innerText.replace("\\n\\n","⭐")'
        ),
        # String(...) turns a missing link into the text "undefined".
        "url": 'return String(arguments[0].querySelector("a")?.href)',
    }

    def __init__(self, keyword, page_count):
        """Store the search settings.

        Args:
            keyword: Search term sent to the Coursera search page.
            page_count: Maximum number of result pages to walk.
        """
        self.keyword = keyword
        self.page_count = page_count

    def _search_url(self):
        """Return the search URL with the keyword percent-encoded."""
        from urllib.parse import quote

        # Encode everything so characters such as "+", "#" and "&" stay part
        # of the query value instead of being read as URL syntax.
        return 'https://www.coursera.org/search?query=' + quote(self.keyword, safe='')

    def _success(self, data):
        """Build the response returned when scraping succeeded."""
        return {
            "data": data,
            "message": f"Course Titles for {self.keyword} ",
        }

    def _failure(self):
        """Build the response returned when scraping failed."""
        return {
            "data": None,
            "message": f"No courses found for {self.keyword} ",
        }

    def __scrape_page(self):
        """Start headless Chrome and open the search page for the keyword.

        Returns:
            A ``(wait, driver)`` tuple: a ``WebDriverWait`` with a 100 second
            timeout and the Chrome driver it is bound to. The caller owns the
            driver and must quit it.

        Raises:
            Whatever Selenium raises when the browser cannot be started or
            the page cannot be opened.
        """
        # NOTE(review): the driver path is empty in the original code;
        # presumably Selenium is expected to locate the driver itself - confirm.
        chromedriver_path = ''
        options = Options()
        options.add_argument("--headless")
        driver = webdriver.Chrome(service=Service(chromedriver_path), options=options)
        try:
            driver.get(self._search_url())
        except Exception:
            # Do not leave a browser process behind when navigation fails.
            driver.quit()
            raise
        return WebDriverWait(driver, 100), driver

    def _collect(self, fields):
        """Walk the result pages and extract ``fields`` from every card.

        Args:
            fields: Keys of ``_FIELD_SCRIPTS`` to extract, in output order.

        Returns:
            A list with one ``{field: value}`` dict per course card, or
            ``None`` if anything went wrong while scraping.
        """
        import logging

        wait, driver = self.__scrape_page()
        rows = []
        try:
            for page in range(self.page_count):
                courses = wait.until(
                    EC.visibility_of_all_elements_located(
                        (By.CSS_SELECTOR, self._COURSE_SELECTOR)
                    )
                )
                for course in courses:
                    rows.append({
                        field: driver.execute_script(self._FIELD_SCRIPTS[field], course)
                        for field in fields
                    })

                # find_elements returns an empty list instead of raising, so a
                # page without a pagination control ends the walk and keeps
                # the data collected so far.
                next_buttons = driver.find_elements(
                    By.CSS_SELECTOR, self._NEXT_BUTTON_SELECTOR
                )
                if not next_buttons or 'disabled' in (
                    next_buttons[0].get_attribute('class') or ''
                ):
                    print('There are no more pages')
                    break
                # Only navigate when another page will actually be read.
                # NOTE(review): nothing here waits for the old cards to be
                # replaced after the click - confirm the wait above is enough.
                if page + 1 < self.page_count:
                    next_buttons[0].click()
        except Exception:
            # Best effort, as before: report failure to the caller rather
            # than raising, but keep the traceback available for debugging.
            logging.getLogger(__name__).debug(
                "Scraping failed for %r", self.keyword, exc_info=True
            )
            return None
        finally:
            driver.quit()
        return rows

    def _scrape_field(self, field):
        """Return the standard response holding one field for every course."""
        rows = self._collect((field,))
        if rows is None:
            return self._failure()
        return self._success([row[field] for row in rows])

    def scrape_all(self):
        """Return id, title, description, review and url for every course.

        Returns:
            ``{"data": [...], "message": ...}`` where each entry is a dict
            with the keys ``id`` (running index starting at 0), ``title``,
            ``description``, ``review`` and ``url``; ``data`` is ``None`` on
            failure.
        """
        rows = self._collect(("title", "description", "review", "url"))
        if rows is None:
            return self._failure()
        return self._success([{"id": index, **row} for index, row in enumerate(rows)])

    def course_titles(self):
        """Return the title of every course found (``data`` is a list)."""
        return self._scrape_field("title")

    def course_description(self):
        """Return the description of every course found (``data`` is a list)."""
        return self._scrape_field("description")

    def course_reviews(self):
        """Return the rating text of every course found (``data`` is a list)."""
        return self._scrape_field("review")

    def course_urls(self):
        """Return the link of every course found (``data`` is a list)."""
        return self._scrape_field("url")
144
# Demo run: search for "python", walk up to five result pages and print
# the combined result dict.
python_scraper = Courses(keyword="python", page_count=5)
print(python_scraper.scrape_all())
0 commit comments