Merge pull request #311 from realpython/python-web-scraping-practical-introduction

KateFinegan · web-flow · commit fe609dd67ab4 · 2022-10-11T13:45:20.000-06:00
Add materials for Python Web Scraping tutorial
diff --git a/python-web-scraping-practical-introduction/README.md b/python-web-scraping-practical-introduction/README.md
@@ -0,0 +1,26 @@
+# A Practical Introduction to Web Scraping in Python
+
+This repository holds the code for the Real Python [A Practical Introduction to Web Scraping in Python](https://realpython.com/python-web-scraping-practical-introduction/) tutorial.
+
+## Dependencies
+
+To run the examples in this repository, you need to have the dependencies installed. You should first create a virtual environment:
+
+```console
+$ python -m venv venv
+$ source venv/bin/activate
+```
+
+Then, navigate into the `python-web-scraping-practical-introduction/` folder and install the requirements with `pip`:
+
+```console
+(venv) $ python -m pip install -r requirements.txt
+```
+
+## Author
+
+- [David Amos](https://realpython.com/team/damos/)
+
+## License
+
+Distributed under the MIT license. See [`LICENSE`](../LICENSE) for more information.
diff --git a/python-web-scraping-practical-introduction/beauty_soup.py b/python-web-scraping-practical-introduction/beauty_soup.py
@@ -0,0 +1,20 @@
+"""Print the tag names of the page's two <img> tags and the page title."""
+
+from urllib.request import urlopen
+
+from bs4 import BeautifulSoup
+
+url = "http://olympus.realpython.org/profiles/dionysus"
+
+# Close the HTTP response as soon as the body has been read.
+with urlopen(url) as page:
+    html = page.read().decode("utf-8")
+
+soup = BeautifulSoup(html, "html.parser")
+
+# Unpacking raises ValueError unless the page has exactly two <img> tags.
+image1, image2 = soup.find_all("img")
+
+print(image1.name)
+print(image2.name)
+print(soup.title.string)
diff --git a/python-web-scraping-practical-introduction/mech_soup.py b/python-web-scraping-practical-introduction/mech_soup.py
@@ -0,0 +1,17 @@
+"""Fetch the dice page four times, pausing ten seconds between requests."""
+
+import time
+
+import mechanicalsoup
+
+browser = mechanicalsoup.Browser()
+
+for attempt in range(1, 5):
+    # Pause before every request except the first, so the script doesn't
+    # sleep after the final roll.
+    if attempt > 1:
+        time.sleep(10)
+
+    response = browser.get("http://olympus.realpython.org/dice")
+    roll = response.soup.select("#result")[0].text
+    print(f"The result of your dice roll is: {roll}")
diff --git a/python-web-scraping-practical-introduction/regex_soup.py b/python-web-scraping-practical-introduction/regex_soup.py
@@ -0,0 +1,22 @@
+"""Print a page's <title> text, extracted with regular expressions."""
+
+import re
+from urllib.request import urlopen
+
+url = "http://olympus.realpython.org/profiles/dionysus"
+
+# Close the HTTP response as soon as the body has been read.
+with urlopen(url) as page:
+    html = page.read().decode("utf-8")
+
+# Non-greedy wildcards tolerate attributes or stray characters inside the
+# opening and closing tags; IGNORECASE accepts <TITLE> as well.
+pattern = r"<title.*?>.*?</title.*?>"
+match_results = re.search(pattern, html, re.IGNORECASE)
+if match_results is None:
+    # Fail with a readable message instead of an AttributeError on None.
+    raise SystemExit(f"No <title> element found at {url}")
+
+title = re.sub(r"<.*?>", "", match_results.group())  # Remove HTML tags
+
+print(title)
diff --git a/python-web-scraping-practical-introduction/requirements.txt b/python-web-scraping-practical-introduction/requirements.txt
@@ -0,0 +1,9 @@
+beautifulsoup4==4.11.1
+certifi==2022.9.24
+charset-normalizer==2.1.1
+idna==3.4
+lxml==4.9.1
+MechanicalSoup==1.2.0
+requests==2.28.1
+soupsieve==2.3.2.post1
+urllib3==1.26.12