scrapy/beforward.py at master · florencemayo/scrapy · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#!/usr/bin/env python3
import csv
import math
import re
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Single Chrome session reused by every scraping step in this script.
driver = webdriver.Chrome()

# Car makes to scrape.  Each key is the number substituted into the `make=`
# segment of the stock-list URL built below; the dict is iterated in the
# order written here, so MITSUBISHI is requested first and TOYOTA last.
car_make_dict = {
    5: 'MITSUBISHI',
    4: 'MAZDA',
    3: 'NISSAN',
    2: 'HONDA',
    1: 'TOYOTA',
}


# DETAILED CSV: opened once in write mode (replacing any previous file) and
# given its header row straight away.
csv_detailed_file = open('beforward_toyota_detailed.csv', mode='w', encoding='utf-8', newline='')
writer_detailed = csv.writer(csv_detailed_file)
header_row = [
    'Car Make', 'Location', 'Chassis No', 'Version/Class',
    'Model Code', 'Milleage', 'Engine Size (CC)', 'Engine Code',
    'Drive', 'Steering', 'Transmission', 'External Color',
    'Registration Year/Month', 'Fuel', 'Manufacture Year/Month', 'Price($)',
]
writer_detailed.writerow(header_row)

# START FETCHING LINKS FOR EACH VEHICLE
# One stock-list URL per make, 100 results per page.  The mfg_year_from /
# mfg_year_to segments carry 2015 and 2019 (presumably limiting results to
# that manufacture-year range; dropping both segments gives the unfiltered
# variant of the same URL).
STOCKLIST_URL_TEMPLATE = 'https://www.beforward.jp/stocklist/from_stocklist=1/kmode=and/make={}/mfg_year_from=2015/mfg_year_to=2019/sortkey=n/view_cnt=100'

result_urls = [STOCKLIST_URL_TEMPLATE.format(make_id) for make_id in car_make_dict]

# Collect the detail-page URL of every listed vehicle, one make at a time.
# Selenium 4.3 removed the find_element_by_* helpers, so every lookup goes
# through find_element(s)(By.XPATH, ...), which older releases accept too.
CAR_CELL_XPATH = './/div[@class="cars-box"]/table/tbody/tr/td[@class="make-model-td"]'
# Evaluated relative to a make/model cell: step up to its row, then look for
# an "under offer" cell anywhere in that row.
UNDER_OFFER_XPATH = '..//td[@class="is-underoffer-td"]'
NEXT_PAGE_XPATH = '//li/a[@class="pagination-next"]'
# Consecutive timeouts tolerated on one result page before the make is abandoned.
MAX_PAGE_LOAD_ATTEMPTS = 3

links_list = []
for url in result_urls:
    driver.get(url)

    # The results banner carries the hit count; every digit run in its text
    # is joined into one integer (so "1,234" is read as 1234).
    total_cars_raw = driver.find_element(By.XPATH, '//div[@class="results-hits"]').text
    totals_cars = int(''.join(re.findall(r'\d+', total_cars_raw)))
    print(totals_cars)

    cars_fetched = 0
    failed_loads = 0

    while cars_fetched < totals_cars:
        try:
            cars = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.XPATH, CAR_CELL_XPATH)))
        except TimeoutException:
            # Refresh and retry, but only a bounded number of times so a page
            # that never shows any rows cannot stall the script forever.
            failed_loads += 1
            if failed_loads >= MAX_PAGE_LOAD_ATTEMPTS:
                print('Giving up on {}: result rows never appeared'.format(url))
                break
            driver.refresh()
            continue

        failed_loads = 0
        print(len(cars))
        cars_fetched += len(cars)

        # Harvest the current page BEFORE deciding whether to stop, so the
        # last (or only) page of results is not thrown away.
        for car in cars:
            try:
                # find_elements returns an empty list instead of raising, so
                # a missing cell simply means "not under offer".
                if car.find_elements(By.XPATH, UNDER_OFFER_XPATH):
                    continue

                titles = car.find_elements(By.XPATH, './/span[@class="model-title"]')
                if titles:
                    title = titles[0].text
                    # First four digits found in the title are shown as the year.
                    year = ''.join(re.findall(r'\d+', title))[:4]
                    print(title)
                    print('Year : ' + year)

                anchors = car.find_elements(By.XPATH, './/a')
                link = anchors[0].get_attribute("href") if anchors else None
            except WebDriverException as e:
                # Best effort: a cell that cannot be read (e.g. it went stale)
                # is reported and skipped rather than aborting the whole run.
                print(e)
                continue

            # Anchors without an href yield None; never queue those.
            if link:
                links_list.append(link)

        if cars_fetched >= totals_cars:
            break

        try:
            page_next = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, NEXT_PAGE_XPATH)))
            # Click through JavaScript, as the original did, instead of a
            # native click.
            driver.execute_script("arguments[0].click();", page_next)
            # Wait until the old rows are detached so the next iteration
            # cannot re-read the page that was just processed.
            WebDriverWait(driver, 10).until(EC.staleness_of(cars[0]))
        except WebDriverException as e:
            # Without a working "next" control the same page would be counted
            # again and its links duplicated, so stop paging this make.
            print(e)
            break

# END FETCHING LINKS FOR EACH VEHICLE

# -----------------------------------
# Report what was collected: a divider, the number of links, then each URL.
divider = '-' * 50
print(divider)
link_total = len(links_list)
print('Total Links:' + str(link_total))
for collected_link in links_list:
    print('Link: ' + collected_link)

# -----------------------------------

# BEGIN EXTRACTING SPECIFICATIONS FOR EACH CAR COLLECTED EARLIER

# START FETCHING CAR SPECIFICATIONS FOR EACH LINK COLLECTED ABOVE
time.sleep(3)

# Specification headers deliberately kept out of the CSV row.  Headers that
# contain 'Dimension' and empty headers are dropped too (checked in the loop).
SKIPPED_SPEC_HEADERS = {
    'Ref No', 'Sub Ref No', 'Auction Grade', 'Max Loading Capacity',
    'Weight', 'M3', 'Doors', 'Seats',
}

# Visit every collected detail page and write one CSV row per vehicle.
# The try/finally makes sure the CSV is flushed and closed and the browser is
# shut down even when a page raises an unexpected error part-way through.
try:
    for counter, car_link in enumerate(links_list, start=1):
        print("Car {}".format(counter))
        print("Link: " + car_link)

        driver.get(car_link)
        time.sleep(3)

        try:
            # Heading text with every digit removed is used as the car make.
            car_make = driver.find_element(By.XPATH, '//div[@class="car-info-area cf"]/h1').text
            car_make = ''.join(re.findall(r'\D+', car_make))

            # Only the digits of the USD price are kept.
            price = driver.find_element(By.XPATH, '//span[@class="ip-usd-price"]').text
            price = ''.join(re.findall(r'\d+', price))

            # All rows of the specification table.
            spec_rows = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.XPATH, './/div[@class="cf"]/table/tbody/tr')))
        except WebDriverException as exc:
            # A page missing the heading, price or spec table is skipped, as
            # before, but the reason is now reported.
            print('Skipping car: ' + type(exc).__name__)
            continue

        print('Number of headers in specification table: {}'.format(len(spec_rows)))

        print('-'*50)
        row_dict = {}
        row_dict['car_make'] = car_make

        # A for-loop always advances to the next row; the previous manual
        # index was only incremented on success, so one failing row made the
        # loop spin forever.
        for spec_row in spec_rows:
            try:
                th_texts = [th.text.strip() for th in spec_row.find_elements(By.XPATH, './/th')]
                td_texts = [td.text.strip() for td in spec_row.find_elements(By.XPATH, './/td')]
            except WebDriverException:
                # Row could not be read (e.g. it went stale); skip just it.
                continue

            # zip pairs each header with the cell at the same position and
            # stops at the shorter list, so a row with more headers than
            # cells can no longer raise IndexError.
            for th_value, td_value in zip(th_texts, td_texts):
                if not th_value or th_value in SKIPPED_SPEC_HEADERS or 'Dimension' in th_value:
                    continue
                row_dict[th_value] = td_value if td_value else 'N/A'
                print(th_value + ' : ' + td_value)

        print('Price : ' + price)
        print('-'*50)
        row_dict['Price'] = price
        # NOTE(review): values are written in the order the page lists them,
        # so they only line up with the CSV header row if the site's spec
        # table follows that same order and omits nothing -- confirm.
        writer_detailed.writerow(row_dict.values())
finally:
    csv_detailed_file.close()
    driver.quit()