qanda.py
import requests
from bs4 import BeautifulSoup
import time
import random
from requests.exceptions import RequestException
import ssl
# A list of common User-Agent strings
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/89.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0'
]
# Disable SSL warnings for this script
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# Define the output file name
output_file = "all_faqs.txt"

def get_faqs_from_url(url):
    """
    Scrapes a given URL to find FAQ content and saves it to a file.
    """
    try:
        # Select a random User-Agent from the list
        headers = {
            'User-Agent': random.choice(user_agents),
            'Accept-Language': 'en-US,en;q=0.9',
            'Referer': url
        }

        # Add a random delay before making the request
        time.sleep(random.uniform(2, 5))

        response = requests.get(url, headers=headers, timeout=15, verify=False)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        faq_keywords = ['faq', 'frequently asked questions', 'q&a', 'help center', 'preguntas frecuentes']
        faq_selectors = ['div', 'section', 'ul', 'article', 'main']

        found_faqs = False
        for selector in faq_selectors:
            for element in soup.find_all(selector):
                if any(keyword in element.get_text().lower() for keyword in faq_keywords) or \
                   any(keyword in str(element.get('id', '')).lower() for keyword in faq_keywords) or \
                   any(keyword in str(element.get('class', '')).lower() for keyword in faq_keywords):
                    faq_content = element.get_text(separator="\n", strip=True)

                    # Append the found FAQs to the text file
                    with open(output_file, "a", encoding="utf-8") as f:
                        f.write(f"--- FAQ content from {url} ---\n")
                        f.write(faq_content)
                        f.write("\n" + "-" * 50 + "\n\n")

                    print(f"--- Successfully saved FAQ content from {url} ---")
                    found_faqs = True
                    break
            if found_faqs:
                break

        if not found_faqs:
            print(f"--- No clear FAQ section found on {url} ---")

    except requests.exceptions.HTTPError as errh:
        print(f"--- HTTP Error accessing {url}: {errh} ---")
    except RequestException as e:
        print(f"--- An unexpected error occurred while accessing {url}: {e} ---")

# Define the array of URLs you want to scrape
urls_to_scrape = [
    'https://www.apple.com/shop/help/payments',
    'https://www.amazon.com/gp/help/customer/display.html?nodeId=GN7B6F3E689C8G6Z'
]

# Iterate through the URL array and call the scraper function
for url in urls_to_scrape:
    get_faqs_from_url(url.strip())
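
# --- Optional illustration (a minimal sketch, not part of the scraping flow above) ---
# The keyword check in get_faqs_from_url() wraps element.get('class', '') in str()
# because BeautifulSoup returns the class attribute as a list of class names rather
# than a single string. The snippet below uses a made-up HTML fragment and
# illustrative variable names (demo_soup, demo_element) to show why the str(...)
# conversion still lets the substring test work:
demo_soup = BeautifulSoup('<section class="faq-list accordion" id="help">Q: How do I pay?</section>', 'html.parser')
demo_element = demo_soup.find('section')
print(demo_element.get('class', ''))                        # ['faq-list', 'accordion']
print('faq' in str(demo_element.get('class', '')).lower())  # True
print('faq' in str(demo_element.get('id', '')).lower())     # False ('help' does not contain 'faq')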