-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
48 lines (35 loc) · 1.21 KB
/
main.py
File metadata and controls
48 lines (35 loc) · 1.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from ref_caradisiac.spiders.caradisiac import RefModeleSpider, RefMarqueSpider
from scrapy.crawler import CrawlerProcess
import re, io, json, os, time
# Name the Scrapy settings module for this project. setdefault() only writes
# the variable when it is not already present, so a value supplied by the
# caller's environment is kept.
os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'ref_caradisiac.settings')
from scrapy.utils.project import get_project_settings
# Settings object handed to CrawlerProcess by the script below.
# NOTE(review): presumably get_project_settings() reads SCRAPY_SETTINGS_MODULE,
# which is why the environment is prepared first -- confirm against Scrapy docs.
settings = get_project_settings()
def remove(filename):
    """Delete ``filename``, doing nothing when the file is already absent.

    The deletion is attempted directly instead of checking for existence
    first, so a file that disappears between a check and the delete cannot
    make this function fail.

    Args:
        filename: Path of the file to delete.

    Raises:
        OSError: For failures other than a missing file (for example the
            path is a directory, or permission is denied).
    """
    try:
        os.remove(filename)
    except FileNotFoundError:
        # Nothing to delete: that is the state the caller asked for.
        pass
# Directory under which the per-brand 'modeles/<marque>.json' feeds are written.
BASE_DIR = './'

# Load the brand entries; every entry must provide an 'href' key.
# The context manager closes the file even when JSON parsing fails.
with open('modeles.json', encoding='utf-8') as f:
    data = json.load(f)

process = CrawlerProcess(settings)

# Captures the text between '/auto--' and the last '/' of an href.
# Raw string: the escaped slashes belong to the regex, not to Python string
# escapes. Compiled once because it is applied to every entry.
_MARQUE_RE = re.compile(r'(?<=\/auto--)(.*)(?=\/)')

for item in data:
    print(item['href'])
    match = _MARQUE_RE.search(item['href'])
    if not match:
        # Not a brand URL of the expected shape: nothing to crawl.
        continue
    # Keep only the brand slug, dropping a trailing '/modeles...' part.
    marque = match.group(1).split('/modeles')[0]

    json_path = os.path.join(BASE_DIR, 'modeles', '%s.json' % marque)
    # NOTE(review): relies on each crawl() picking up the settings as they are
    # at call time, so every spider gets its own feed path -- confirm for the
    # Scrapy version in use.
    process.settings.set('FEED_URI', json_path)
    print(10*'*')
    print(marque)
    print(10*'*')
    # Remove output left by a previous run, both in the working directory and
    # at the feed path, before scheduling the crawl for this brand.
    remove('%s.json' % marque)
    remove(json_path)
    process.crawl(RefModeleSpider, marque=marque)

process.start()  # the script will block here until the crawling is finished