
Commit 8a9853c

2 parents 2328db2 + 0171cee commit 8a9853c

2 files changed, 84 insertions(+), 0 deletions(-)
Lines changed: 27 additions & 0 deletions

# Web Scraping with Beautiful Soup

This script performs web scraping on a CodeChef problem-statement webpage using the Beautiful Soup library in Python.

## Description

The Python script uses the `requests` and `BeautifulSoup` libraries to extract information from a CodeChef problem-statement webpage. It demonstrates the following actions (a minimal sketch of the core fetch-and-parse step follows the list):

- Printing the title of the webpage.
- Finding and printing all links on the page.
- Extracting the text of all paragraphs.
- Extracting image URLs.
- Counting and categorizing HTML tags.
- Filtering and printing valid (absolute `http`/`https`) links.
- Saving the extracted data to a text file.
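A minimal sketch of the fetch-and-parse pattern the script is built around (the URL is the same one used in the script included in this commit):

```python
import requests
from bs4 import BeautifulSoup

# Fetch the page and parse it with Python's built-in html.parser.
url = 'https://www.codechef.com/problems/TWORANGES?tab=statement'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# soup.title can be None if the page has no <title> tag.
print(soup.title.text if soup.title else 'No <title> found')
```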
## Prerequisites

Ensure you have the following libraries installed:

- `requests`
- `beautifulsoup4`

You can install them with the following command:

```bash
pip install requests beautifulsoup4
```
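To verify the installation, you can import both packages and print their versions (a quick sanity check, not part of the original script):

```python
# Sanity check: both packages expose a __version__ attribute.
import requests
import bs4

print(f"requests {requests.__version__}, beautifulsoup4 {bs4.__version__}")
```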
Lines changed: 57 additions & 0 deletions

import requests
from bs4 import BeautifulSoup
import re

url = 'https://www.codechef.com/problems/TWORANGES?tab=statement'
response = requests.get(url)
response.raise_for_status()  # Fail early on HTTP errors (4xx/5xx)
soup = BeautifulSoup(response.content, 'html.parser')

# Print the title of the webpage
print(f"Title: {soup.title.text}\n")

# Find and print all links on the page
print("Links on the page:")
for link in soup.find_all('a'):
    print(link.get('href'))

# Extract text from paragraphs
print("\nText from paragraphs:")
for paragraph in soup.find_all('p'):
    print(paragraph.text)

# Extract image URLs (skip <img> tags without a src attribute)
print("\nImage URLs:")
for img in soup.find_all('img'):
    img_url = img.get('src')
    if img_url:
        print(img_url)

# Count and categorize tags; every Tag returned by find_all() has a name
print("\nTag counts:")
tag_counts = {}
for tag in soup.find_all():
    tag_counts[tag.name] = tag_counts.get(tag.name, 0) + 1

for tag, count in tag_counts.items():
    print(f"{tag}: {count}")

# Filter and print valid links (absolute http/https URLs only)
print("\nValid links:")
for link in soup.find_all('a'):
    href = link.get('href')
    if href and re.match(r'^https?://', href):
        print(href)

# Save the extracted data to a file
with open('webpage_data.txt', 'w', encoding='utf-8') as file:
    file.write(f"Title: {soup.title.text}\n\n")
    file.write("Links on the page:\n")
    for link in soup.find_all('a'):
        file.write(f"{link.get('href')}\n")
    file.write("\nText from paragraphs:\n")
    for paragraph in soup.find_all('p'):
        file.write(f"{paragraph.text}\n")

print("\nData saved to 'webpage_data.txt'")
