|
| 1 | +import requests |
| 2 | +from bs4 import BeautifulSoup |
| 3 | +import re |
| 4 | + |
"""Scrape a CodeChef problem page and report its title, all link hrefs,
paragraph text, image URLs, and per-tag-name counts, then save the title,
links, and paragraph text to 'webpage_data.txt'."""

url = 'https://www.codechef.com/problems/TWORANGES?tab=statement'

# Bound the request so the script cannot hang forever, and fail loudly on
# HTTP errors instead of silently parsing a 4xx/5xx error page as content.
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# soup.title is None when the document has no <title>; guard before .text
# (the original would raise AttributeError here and again in the file write).
title = soup.title.text if soup.title else '(no title)'
print(f"Title: {title}\n")

# Walk the tree once per element type and reuse the results; the original
# re-ran find_all('a') three times and find_all('p') twice.
hrefs = [link.get('href') for link in soup.find_all('a')]
paragraphs = [p.text for p in soup.find_all('p')]

# Print every href, including None for anchors without one (matches the
# original's unfiltered first listing).
print("Links on the page:")
for href in hrefs:
    print(href)

print("\nText from paragraphs:")
for text in paragraphs:
    print(text)

# Image URLs; <img> tags lacking a src attribute are skipped.
print("\nImage URLs:")
for img in soup.find_all('img'):
    img_url = img.get('src')
    if img_url:
        print(img_url)

# Frequency of each tag name in the document. Counter preserves
# first-encounter order, matching the original dict-based counting.
print("\nTag counts:")
tag_counts = Counter(tag.name for tag in soup.find_all() if tag.name)
for tag, count in tag_counts.items():
    print(f"{tag}: {count}")

# "Valid" here means an absolute http(s) URL; relative paths and
# javascript:/mailto: schemes are filtered out.
print("\nValid links:")
for href in hrefs:
    if href and re.match(r'^https?://', href):
        print(href)

# Persist title, links, and paragraph text. Explicit UTF-8 avoids the
# platform-dependent default encoding (e.g. cp1252 on Windows) mangling or
# rejecting non-ASCII page text.
with open('webpage_data.txt', 'w', encoding='utf-8') as file:
    file.write(f"Title: {title}\n\n")
    file.write("Links on the page:\n")
    for href in hrefs:
        file.write(f"{href}\n")
    file.write("\nText from paragraphs:\n")
    for text in paragraphs:
        file.write(f"{text}\n")

print("\nData saved to 'webpage_data.txt'")
0 commit comments