-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawling.py
More file actions
119 lines (94 loc) · 3.13 KB
/
crawling.py
File metadata and controls
119 lines (94 loc) · 3.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# -*- coding: utf-8 -*-
import re
import time

import pymysql
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

import config
def get_book_titles(maximum=20):
    """Crawl the fairy-tale listing pages and collect titles and detail texts.

    Starts a headless Chrome session and visits listing pages 1..``maximum``
    (category ``07``, traditional fairy tales). On each page it records the
    link text of every element with class ``title``, then opens up to five
    list entries one by one and records the text of every element with class
    ``content`` on the opened detail page.

    Args:
        maximum: Number of listing pages to visit. Defaults to 20.

    Returns:
        A ``(titles, details)`` tuple of lists. ``titles`` holds the collected
        titles with every character outside the Hangul syllable range
        (``가``-``힣``) removed, which also strips spaces, digits and
        punctuation. ``details`` holds the raw detail texts.

    Raises:
        selenium.common.exceptions.TimeoutException: If clicking an entry does
            not produce a second browser window within 10 seconds.

    NOTE(review): titles and details are gathered independently, so the two
    lists line up index-for-index only if every listing page exposes exactly
    five ``title`` links and every detail page has exactly one ``content``
    element - confirm against the live site before pairing them.
    """
    title_list = []
    detail_list = []

    # Set options for Chrome
    chrome_options = Options()
    for flag in ('--headless', '--disable-gpu', '--no-sandbox'):
        chrome_options.add_argument(flag)

    # Crawl each page for traditional fairy tales
    with webdriver.Chrome(options=chrome_options) as driver:
        # The implicit wait is a session-wide setting, not a one-off pause,
        # so it only needs to be configured once.
        driver.implicitly_wait(10)

        for i in tqdm(range(1, maximum + 1)):
            url = f"http://18children.president.pa.go.kr/our_space/fairy_tales.php?srh%5Bcategory%5D=07&srh%5Bpage%5D={i}"
            driver.get(url)

            # Get all titles; elements without a link are skipped instead of
            # raising IndexError.
            for element in driver.find_elements(By.CLASS_NAME, "title"):
                anchors = element.find_elements(By.TAG_NAME, "a")
                if anchors:
                    title_list.append(anchors[0].text)

            # Get all details (at most five entries per listing page)
            for j in range(1, 6):
                links = driver.find_elements(
                    By.XPATH,
                    f'//*[@id="content"]/div[2]/div[1]/ul/li[{j}]/dl/dt/a',
                )
                if not links:
                    # Fewer than five entries on this page: nothing left here.
                    break

                listing_window = driver.current_window_handle
                links[0].click()

                # The entry opens in a new tab. Wait for it to exist and pick
                # it by handle, because the order of window_handles is not
                # guaranteed.
                WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(2))
                detail_window = next(
                    handle
                    for handle in driver.window_handles
                    if handle != listing_window
                )
                driver.switch_to.window(detail_window)

                detail_list.extend(
                    element.text
                    for element in driver.find_elements(By.CLASS_NAME, 'content')
                )

                # Close the detail tab and return to the listing tab.
                driver.close()
                driver.switch_to.window(listing_window)

    # Remove non-Korean characters from titles
    title_replace = [re.sub(r"[^가-힣]", "", title) for title in title_list]
    return title_replace, detail_list
# Script body: crawl the book titles/details and reload the `book` table with
# them, then report the elapsed wall-clock time.
start = time.time()

# Connect to database (a single connection; it is closed in the `finally`
# below even when crawling or a query fails).
db = pymysql.connect(
    host='localhost',
    user='jenga',
    password=config.database_password,
    database='jenga',
    charset='utf8',
)
try:
    # Schema of the target table, kept for reference:
    # create_table_query = """
    # CREATE TABLE book (
    #     id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
    #     title VARCHAR(255),
    #     detail TEXT
    # )
    # """
    # cursor.execute(create_table_query)

    # Get book titles and details BEFORE touching the table, so a failed
    # crawl leaves the existing rows in place.
    titles, details = get_book_titles()
    print("Titles: ", titles)
    print("Details: ", details)

    with db.cursor() as cursor:
        # Clear book table in database
        sql = "TRUNCATE TABLE book"
        cursor.execute(sql)

        # Insert titles and details into database.
        # zip() pairs the lists by position and stops at the shorter one.
        sql = "INSERT INTO book (title, detail) VALUES (%s, %s)"
        cursor.executemany(sql, list(zip(titles, details)))
    db.commit()
finally:
    db.close()

end = time.time()
print(f"{end - start:.5f} sec")