Skip to content

Commit a83d9da

Browse files
committed
Added script to scrape coursera
1 parent 2717a2c commit a83d9da

File tree

3 files changed

+180
-0
lines changed

3 files changed

+180
-0
lines changed

Coursera Scraper/README.md

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Coursera Courses Scraper
2+
## Description
3+
A simple module that retrieves Coursera courses by web scraping. Its purpose is to provide an alternative way to get a list of courses from Coursera.
4+
5+
### Language
6+
- [X] Python
7+
8+
### Usage
9+
To access the `courses`, this application imports the following modules.
10+
```python
11+
import selenium
12+
```
13+
14+
### Instructions to run this application
15+
16+
1. Fork the repository and open `courses.py`
17+
2. Initialize the `Courses` class with
18+
```python
19+
c = Courses("<Course_Name>", <No_of_pages>)  # <No_of_pages> is an integer, e.g. 5
20+
```
21+
3. Use any of the functions to get required data like
22+
```python
23+
c.scrape_all()
24+
```
25+
4. It will return a dictionary containing the list of courses
26+
27+
##### Example Output
28+
The functions will return -
29+
```
30+
{
31+
data : [<List of Dictionaries>],
32+
message : Course Titles for <Keyword>
33+
}
34+
```

Coursera Scraper/courses.py

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
from selenium import webdriver
2+
from selenium.webdriver.common.by import By
3+
from selenium.webdriver.chrome.service import Service
4+
from selenium.webdriver.support.ui import WebDriverWait
5+
from selenium.webdriver.support import expected_conditions as EC
6+
from selenium.webdriver.chrome.options import Options
7+
8+
"""
9+
Example code :
10+
python_scraper = Courses("python",5)
11+
print(python_scraper.scrape_all())
12+
"""
13+
class Courses:
    """Scrape Coursera search results for a keyword with headless Chrome.

    Every public method opens its own browser session, walks up to
    ``page_count`` result pages, closes the browser again and returns a
    dictionary with two keys:

    * ``"data"`` -- the scraped values, or ``None`` when scraping failed.
    * ``"message"`` -- a human-readable status string.
    """

    _SEARCH_URL = 'https://www.coursera.org/search?query='
    _COURSE_SELECTOR = 'main ul>li'
    _NEXT_BUTTON_SELECTOR = 'button[aria-label="Next Page"]'

    # JavaScript snippets, each evaluated with one result <li> element as
    # ``arguments[0]``. Insertion order is the order of the keys in the
    # records returned by ``scrape_all``.
    _FIELD_SCRIPTS = {
        "title": 'return arguments[0].querySelector("h3")?.innerText',
        "description": 'return arguments[0].querySelector("p>span")?.innerText',
        "review": 'return arguments[0].querySelector("div:has(>svg)")?.innerText.replace("\\n\\n","⭐")',
        "url": 'return String(arguments[0].querySelector("a")?.href)',
    }

    def __init__(self, keyword, page_count):
        """Store the search keyword and the maximum number of pages to visit.

        Args:
            keyword: Search term sent to Coursera.
            page_count: Maximum number of result pages to scrape. An integer,
                or anything ``int()`` accepts (e.g. ``"5"``).
        """
        self.keyword = keyword
        self.page_count = page_count

    def _search_url(self):
        """Return the search URL with the keyword percent-encoded."""
        # Local import keeps this class self-contained within the module.
        from urllib.parse import quote

        # Without encoding, characters such as '#' or '&' (e.g. "C#", "R&D")
        # would cut the query short or be parsed as extra parameters.
        return self._SEARCH_URL + quote(self.keyword, safe='')

    def __scrape_page(self):
        """Start headless Chrome on the search page.

        Returns:
            A ``(wait, driver)`` tuple: a ``WebDriverWait`` with a 100 second
            timeout and the driver it is bound to. The caller owns the driver
            and must call ``driver.quit()``.
        """
        chromedriver_path = ''
        options = Options()
        options.add_argument("--headless")
        driver = webdriver.Chrome(service=Service(chromedriver_path), options=options)
        try:
            wait = WebDriverWait(driver, 100)
            driver.get(self._search_url())
        except BaseException:
            # Do not leave an orphaned browser behind if the page load fails
            # or is interrupted; the error itself is still propagated.
            driver.quit()
            raise
        return wait, driver

    def _scrape(self, fields):
        """Collect the requested fields for every course on every page.

        Args:
            fields: Iterable of keys of ``_FIELD_SCRIPTS`` to extract.

        Returns:
            A list with one ``{field: value}`` dict per course, or ``None``
            if anything went wrong while scraping. Errors raised while
            starting the browser are not caught and propagate to the caller.
        """
        wait, driver = self.__scrape_page()
        rows = []
        try:
            page_total = int(self.page_count)
            for page in range(page_total):
                courses = wait.until(
                    EC.visibility_of_all_elements_located(
                        (By.CSS_SELECTOR, self._COURSE_SELECTOR)
                    )
                )
                for course in courses:
                    rows.append({
                        field: driver.execute_script(self._FIELD_SCRIPTS[field], course)
                        for field in fields
                    })
                # find_elements returns an empty list instead of raising, so
                # a missing pagination button does not throw away the
                # courses that were already collected.
                next_buttons = driver.find_elements(
                    By.CSS_SELECTOR, self._NEXT_BUTTON_SELECTOR
                )
                if not next_buttons or 'disabled' in (
                    next_buttons[0].get_attribute('class') or ''
                ):
                    print('There are no more pages')
                    break
                # Only advance when another page will actually be read.
                if page + 1 < page_total:
                    next_buttons[0].click()
        except Exception:
            # Best-effort contract: failures are reported through the result
            # dictionary. KeyboardInterrupt/SystemExit are not swallowed.
            return None
        finally:
            # Always release the browser process, on success and on failure.
            driver.quit()
        return rows

    def _result(self, data):
        """Wrap scraped data (or ``None`` for failure) in the response dict."""
        if data is None:
            return {
                "data": None,
                "message": f"No courses found for {self.keyword}"
            }
        # The success message is the same for every method; it is kept as-is
        # so existing callers that inspect it keep working.
        return {
            "data": data,
            "message": f"Course Titles for {self.keyword}"
        }

    def _column(self, field):
        """Scrape a single field and return it as a flat list response."""
        rows = self._scrape((field,))
        if rows is None:
            return self._result(None)
        return self._result([row[field] for row in rows])

    def scrape_all(self):
        """Return one record per course.

        Each record is a dict with the keys ``id`` (running index starting at
        0), ``title``, ``description``, ``review`` and ``url``.
        """
        rows = self._scrape(tuple(self._FIELD_SCRIPTS))
        if rows is None:
            return self._result(None)
        return self._result(
            [{"id": index, **row} for index, row in enumerate(rows)]
        )

    def course_titles(self):
        """Return the list of course titles."""
        return self._column("title")

    def course_description(self):
        """Return the list of course descriptions."""
        return self._column("description")

    def course_reviews(self):
        """Return the list of course review strings."""
        return self._column("review")

    def course_urls(self):
        """Return the list of course URLs."""
        return self._column("url")
144+
if __name__ == "__main__":
    # Demo run: scrape up to 5 result pages for "python" and print the result.
    # Guarded so that importing this module does not launch a browser.
    python_scraper = Courses("python", 5)
    print(python_scraper.scrape_all())

Coursera Scraper/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
selenium==4.9.1

0 commit comments

Comments
 (0)