app.py
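"""Scrape the paginated word tables from kamusi.appsmata.com into words.csv.

Selenium handles the login and page loads, BeautifulSoup and pandas turn
each rendered table into a DataFrame, and the per-page results are
concatenated into a single CSV.
"""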
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
base_url = "http://kamusi.appsmata.com"


def login(email, password):
    # Log in to the website; the XPaths are tied to the current page layout
    driver.get(f"{base_url}/login")
    time.sleep(5)  # crude wait for the login form to render
    driver.find_element(
        By.XPATH, "/html/body/div/div[2]/div/div/div[1]/div/form/div[1]/div/input"
    ).send_keys(email)
    driver.find_element(
        By.XPATH, "/html/body/div/div[2]/div/div/div[1]/div/form/div[2]/div/input"
    ).send_keys(password)
    driver.find_element(
        By.XPATH, "/html/body/div/div[2]/div/div/div[1]/div/form/div[4]/button"
    ).click()
    print("Logged in")
    return driver
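

# Selenium drives a real browser, so driver.page_source already reflects any
# JavaScript-rendered content by the time BeautifulSoup parses it.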
def get_soup(url):
    # Load the page and parse the rendered HTML
    driver.get(url)
    html = driver.page_source
    return BeautifulSoup(html, "html.parser")


def generate_all_urls(max_page=833):
    # Build the url of every paginated word listing
    return [f"{base_url}/words?page={i}" for i in range(1, max_page + 1)]
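

# pandas.read_html can parse an HTML <table> directly into a DataFrame,
# so there is no need to walk the rows and cells by hand.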
def scrape_page(url):
    # Parse the first (and only) table on the page into a DataFrame
    soup = get_soup(url)
    table = soup.find("table")
    df = pd.read_html(str(table))[0]
    return df


def scrape_all_urls(urls):
    # Scrape every listing page, pausing briefly between requests
    all_df = []
    for url in urls:
        all_df.append(scrape_page(url))
        time.sleep(1)  # be gentle on the server
    return all_df


if __name__ == "__main__":
    # Log in first (replace the placeholders with real credentials)
    driver = login("email", "password")
    # Generate the listing urls and scrape each one
    urls = generate_all_urls()
    all_df = scrape_all_urls(urls)
    # Combine the per-page dataframes and save the result
    df = pd.concat(all_df, ignore_index=True)
    df.to_csv("words.csv", index=False)
    driver.quit()