-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
70 lines (55 loc) · 2.42 KB
/
scraper.py
File metadata and controls
70 lines (55 loc) · 2.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from urllib.parse import unquote

import requests
from lxml import html
# Scrape the "Aggittivi siciliani" category of the Sicilian Wiktionary.
#
# For every category page, the first WORDS_PER_PAGE list links are followed,
# the italic paragraph text and the list/definition link texts of each entry
# page are collected, and one line "<word>\t<comma-separated texts>\n" is
# appended to adj_translations.txt. Pagination follows the "pagefrom" link
# until none is left.

BASE_URL = "https://scn.wiktionary.org"
START_URL = BASE_URL + "/wiki/Catigur%C3%ACa:Aggittivi_siciliani"
WIKI_PREFIX = "/wiki/"
# Number of leading list links per category page that are followed as entries.
# NOTE(review): on a page with fewer real entries this slice may still include
# other list links from the page -- confirm against the live markup.
WORDS_PER_PAGE = 200
# Seconds to wait for the server; without it a stalled connection hangs forever.
REQUEST_TIMEOUT = 30

count = 0
# Explicit UTF-8: the words contain accented letters that a non-UTF-8 default
# locale encoding could not write.
with open("adj_translations.txt", "a", encoding="utf-8") as translation_file:
    # One session so the many requests to the same host reuse a connection.
    with requests.Session() as session:
        next_link = START_URL
        while next_link:
            try:
                response = session.get(next_link, timeout=REQUEST_TIMEOUT)
                response.raise_for_status()
                parsed_body = html.fromstring(response.text)
            except Exception as err:
                # Stop instead of looping: retrying the same URL on every
                # failure would never terminate if the error is permanent.
                print(err)
                break

            # Find words on the page
            for href in parsed_body.xpath('//ul/li/a/@href')[:WORDS_PER_PAGE]:
                # Only site-relative article links can be joined onto BASE_URL.
                if not href.startswith(WIKI_PREFIX):
                    continue
                # Decode every percent-escape, not just a fixed handful.
                word = unquote(href[len(WIKI_PREFIX):].strip())
                if not word:
                    continue

                # Best-effort per entry: a single broken page is reported and
                # skipped, and must not restart the whole category page.
                try:
                    entry = session.get(BASE_URL + href, timeout=REQUEST_TIMEOUT)
                    entry.raise_for_status()
                    parsed = html.fromstring(entry.text)
                except Exception as err:
                    print("Skipping", href, "-", err)
                    continue

                grammar = parsed.xpath("//div/p/i/text()")
                first_section = parsed.xpath("//ul/li/a/text()")
                second_section = parsed.xpath("//dl/dd/a/text()")
                # Same column order as before: italics, list links, dd links.
                fields = grammar + first_section + second_section

                translation_file.write(word + "\t" + ",".join(fields) + "\n")
                count += 1
                if count % 50 == 0:
                    print(count, "Words extracted. Last word:", word)

            # Look for the next page: first "pagefrom" link in document order.
            following = next(
                (BASE_URL + link
                 for link in parsed_body.xpath('//div/a/@href')
                 if "pagefrom" in link),
                None,
            )
            # If there is no next page (or it points back to this one), stop.
            if following is None or following == next_link:
                break
            next_link = following
            print(next_link)
            print("NEXT PAGE")