# Source: Shanghai-LiveIdol-Approval-Inquiry-Query/main.py (jesse1108/Shanghai-LiveIdol-Approval-Inquiry-Query on GitHub)

import requests
import csv
from bs4 import BeautifulSoup
import re
import time

def get_links_and_titles(page_url, retries=3, timeout=30):
    """Fetch one listing page and collect the detail links found on it.

    Every ``<tr align="left">`` row is inspected; the first ``<td>`` of the
    row is searched for an ``<a target="blank">`` anchor. Rows without a
    cell, without such an anchor, or whose anchor has no ``href`` are skipped.

    Args:
        page_url: URL of the listing page to download.
        retries: Maximum number of download attempts.
        timeout: Seconds to wait for the server before an attempt is
            treated as failed.

    Returns:
        A list of ``(url, title)`` tuples, one per link found. An empty list
        is returned when the page has no matching links or when every
        download attempt failed.
    """
    from urllib.parse import urljoin

    site_root = 'http://wsbs.wgj.sh.gov.cn'

    for _ in range(retries):
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(page_url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error: {e}")
            time.sleep(5)
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        links_and_titles = []

        for tr in soup.find_all('tr', align='left'):
            td = tr.find('td')
            if td is None:
                # Row without any cell: nothing to extract.
                continue
            link = td.find('a', target='blank')
            if link is None:
                continue
            href = link.get('href')
            if not href:
                continue
            title = link.text.strip()
            # urljoin yields the same result as plain concatenation for
            # hrefs starting with "/", and also copes with relative or
            # already-absolute hrefs.
            links_and_titles.append((urljoin(site_root, href), title))

        return links_and_titles
    return []

def check_address_and_date_in_page(url, addresses, year, retries=3, timeout=30):
    """Check whether a detail page mentions a watched address and a recent date.

    The page text is searched for the first entry of ``addresses`` that
    occurs in it, and for a four-digit year following the label
    ``演出日期：``.

    Args:
        url: URL of the detail page to download.
        addresses: Iterable of address strings to look for, in priority order.
        year: Minimum year (inclusive) the page's date must have.
        retries: Maximum number of download attempts.
        timeout: Seconds to wait for the server before an attempt is
            treated as failed.

    Returns:
        The first matching address when the page also carries a date whose
        year is ``>= year``; otherwise ``None`` (no address, no date, date
        too old, or every download attempt failed).
    """
    for _ in range(retries):
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error: {e}")
            time.sleep(5)
            continue

        text = BeautifulSoup(response.text, 'html.parser').text

        found_address = next(
            (address for address in addresses if address in text), None
        )
        if not found_address:
            return None

        match = re.search(r'演出日期：\s*(\d{4})', text)
        if match and int(match.group(1)) >= year:
            return found_address

        # The page was fetched successfully; a missing or too-old date is a
        # definitive answer, so do not download the same page again.
        return None
    return None

def main():
    """Walk the listing pages and record matching notices in ``output.csv``.

    Pages are requested starting at page 1 and the walk stops at the first
    page that yields no links (which is also what a page that failed every
    download attempt yields). Each link is checked for one of the watched
    addresses and a date of ``year_to_check`` or later; matches are printed
    and written as ``Address, Title, URL`` rows. Whenever the next page
    number is a multiple of 11 the walk pauses for 30 seconds.
    """
    listing_url = 'http://wsbs.wgj.sh.gov.cn/shwgj_ywtb/core/web/welcome/index!toResultNotice.action?flag=1'
    watched_addresses = [
        '宜昌路179号', '延安西路1735号', '凯旋路851号', '愚园路1398号', '愚园路1250号',
        '万航渡后路19号', '虹许路731号', '长宁路999号', '重庆南路308号'
    ]
    min_year = 2023
    csv_path = 'output.csv'

    def listing_pages():
        # Yield the link list of each successive page until one is empty.
        current = 1
        while True:
            print(f'Checking page {current}...')
            entries = get_links_and_titles(f'{listing_url}&pageDoc.pageNo={current}')
            if not entries:
                return
            yield entries

            current += 1
            if current % 11 == 0:
                print('Waiting for 30 seconds...')
                time.sleep(30)

    with open(csv_path, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.writer(handle)
        writer.writerow(['Address', 'Title', 'URL'])

        for entries in listing_pages():
            for url, title in entries:
                hit = check_address_and_date_in_page(url, watched_addresses, min_year)
                if not hit:
                    continue
                print(f'Found address {hit} and date in: {title}\nURL: {url}\n')
                writer.writerow([hit, title, url])

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()