#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Courier-tracking crawler.

This module holds the three classes that the original change set added as
separate files:

* ``download.py``  -> :class:`Download` (HTTP fetching through a proxy)
* ``kuaidi_1.py``  -> :class:`Kuaidu`   (crawl and persist tracking records)
* ``orderno.py``   -> :class:`Orderno`  (order-number lookup in MySQL)

The third-party drivers (``requests``, ``pymysql``) are imported inside the
methods that need them, so the module itself can be imported without them.
"""
import datetime
import random
import re
import time


# ---------------------------------------------------------------------------
# download.py
# ---------------------------------------------------------------------------
class Download:
    """Download a page using a random User-Agent and a random HTTP proxy."""

    def __init__(self):
        """Build the pools of User-Agent strings, static headers and proxies."""
        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
        ]
        self.head_connection = ['keep-alive']
        self.head_accept_language = ['zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3']
        self.head_accept = ['text/css,*/*;q=0.1']
        # NOTE(review): hard-coded "host:port" proxy endpoints; presumably
        # short-lived -- verify they are still reachable before relying on them.
        self.proxy = [
            '106.5.173.163:3276',
            '118.117.138.173:2645',
            '111.74.232.220:9756',
            '60.17.248.204:2121',
            '116.208.96.24:3154',
            '100.18.21.221:8110',
            '117.68.145.78:2644',
            '182.111.49.213:4162',
            '36.34.14.53:6436',
            '114.226.135.105:9287',
            '222.189.89.180:5638',
            '36.34.15.96:6436',
            '117.57.170.138:3852',
            '112.85.10.250:1131',
            '115.219.76.29:2316',
            '60.173.24.251:6890',
            '117.71.152.248:2319',
            '106.110.249.222:3456',
            '60.187.145.145:2315',
            '117.90.2.47:3217',
            '222.163.253.2:2862',
            '123.189.48.142:9706',
            '60.160.186.100:7654',
            '36.33.18.1:6436',
            '171.215.203.35:2645',
            '59.62.194.171:6344',
            '114.99.22.214:6890',
            '111.77.20.64:4162',
            '182.100.162.23:4162',
            '60.168.23.241:2644',
            '42.54.231.82:3529',
            '115.153.104.137:2314',
            '117.68.242.119:2644',
            '106.5.5.120:9756',
            '100.18.25.49:8110',
            '182.111.98.113:2314',
            '49.67.138.134:2137',
            '117.68.242.186:2644',
            '223.215.149.202:2319',
            '175.151.220.99:1767',
            '183.145.53.113:2315',
            '117.90.2.51:3217',
            '36.45.194.35:3215',
            '123.152.37.190:2682',
            '117.70.137.207:6436',
        ]

    def get_url(self, url, timeout, num_retries=3):
        """Fetch *url* and return the response, retrying only on failure.

        Each attempt picks a fresh random User-Agent and proxy.

        :param url: address to request
        :param timeout: timeout in seconds handed to ``requests.get``
        :param num_retries: how many extra attempts to make after a failed
            request (values below zero are treated as zero)
        :return: the ``requests`` response of the first attempt that does not
            raise, or ``None`` when every attempt failed
        """
        # Local import: keeps the module importable when requests is absent.
        import requests

        attempts = max(num_retries, 0) + 1
        for retries_left in range(attempts - 1, -1, -1):
            headers = {
                'Connection': self.head_connection[0],
                'Accept': self.head_accept[0],
                'Accept-Language': self.head_accept_language[0],
                'User-Agent': random.choice(self.user_agent_list),
            }
            # requests expects proxy URLs to carry an explicit scheme.
            proxies = {'http': 'http://' + random.choice(self.proxy)}
            try:
                # Return straight away on success instead of re-fetching.
                return requests.get(url, timeout=timeout, headers=headers, proxies=proxies)
            except requests.RequestException:
                print("获取网页出错")
                if retries_left > 0:
                    print('获取页面倒数第%s次' % retries_left)
        return None


# ---------------------------------------------------------------------------
# kuaidi_1.py
# ---------------------------------------------------------------------------
class Kuaidu:
    """Crawl courier tracking records and store them in the database."""

    def __init__(self, request_delay=0.8):
        """Prepare the URL pieces, the order lookup and the downloader.

        :param request_delay: seconds to pause after each tracking request
        """
        self.bash_url = 'http://www.kuaidi.com/index-ajaxselectcourierinfo-'
        self.bash_html = '-.html'
        self.orderno = Orderno()
        # One downloader is enough; it only holds static header/proxy pools.
        self.download = Download()
        self.request_delay = request_delay

    def data(self):
        """Build the tracking URL of every pending order and crawl it."""
        print('Begin')
        # The lookup may yield nothing when the database query failed.
        orderno_list = self.orderno.shipping_orderno() or []
        print('今天要爬取的采购单数目: %s' % len(orderno_list))
        for order_no in orderno_list:
            url = self.bash_url + order_no + self.bash_html
            self.get_text(order_no, url)

    def get_text(self, shippingorderno, url):
        """Download the tracking records of one order and save each of them.

        :param shippingorderno: courier order number being tracked
        :param url: tracking URL for that order number
        :return: None; every record found is passed to :meth:`save_date`
        """
        response = self.download.get_url(url, 5)
        if response:
            try:
                payload = response.json()
            except ValueError:
                # Body was not JSON (requests' decode errors are ValueErrors).
                payload = None
            else:
                print("开始获取%s的信息 " % shippingorderno)
            # Tolerate answers that carry no 'data' list at all.
            records = payload.get('data') if isinstance(payload, dict) else None
            for record in records or []:
                track_date = record['time'].strip()
                description = record['context'].strip()
                self.save_date(shippingorderno, description, track_date)
        # NOTE(review): the nesting of this pause was lost in the flattened
        # source; it is applied once per fetched order here -- confirm.
        time.sleep(self.request_delay)

    def save_date(self, shippingorderno, description, track_date):
        """Insert one tracking record unless the same record already exists.

        :param shippingorderno: courier order number
        :param description: tracking event text (scraped, hence untrusted)
        :param track_date: timestamp of the tracking event as scraped
        :return: None
        """
        order = self.orderno
        db = order.db
        cursor = order.cursor
        create_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        data = {
            'shippingorderno': shippingorderno,
            'description': description,
            'create_date': create_date,
            'track_date': track_date,
        }
        table = 'shippingtrackdetail'
        keys = ', '.join(data)
        values = ', '.join(['%s'] * len(data))
        sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(table=table, keys=keys, values=values)
        # Skip records that are already stored.  The values are bound as
        # parameters: the description is scraped text and may contain quotes.
        cursor.execute(
            "SELECT 1 FROM shippingtrackdetail "
            "WHERE shippingorderno=%s AND description=%s LIMIT 1",
            (shippingorderno, description),
        )
        if cursor.rowcount:
            print('%s:%s 已经存在' % (shippingorderno, description))
            return
        try:
            cursor.execute(sql, tuple(data.values()))
            db.commit()
        except Exception:
            # Best effort: one bad row must not abort the whole crawl.
            print('Failed')
            db.rollback()
        else:
            print('%s:%s save successful' % (shippingorderno, description))


# ---------------------------------------------------------------------------
# orderno.py
# ---------------------------------------------------------------------------
class Orderno:
    """Look up the courier order numbers that still need to be tracked."""

    # Order numbers are runs of at least eight digits inside a text column.
    _ORDER_NO_PATTERN = re.compile(r'[0-9]{8,}')

    def __init__(self, host='your dress', user='username', password='password',
                 port=3306, db='', charset='utf8'):
        """Open the MySQL connection and a cursor on it.

        The defaults are the placeholder values from the original source and
        must be replaced with real connection settings.
        """
        # Local import: keeps the module importable when pymysql is absent.
        import pymysql

        self.db = pymysql.connect(host=host, user=user, password=password, port=port, db=db,
                                  charset=charset)
        self.cursor = self.db.cursor()

    def _pending_order_numbers(self, list_sql, finished_sql):
        """Return the order numbers found by *list_sql* that are not finished.

        *list_sql* is executed once and the first column of every row is
        scanned for order numbers.  *finished_sql* is executed once per
        number; a number is kept when that query matches no row.
        """
        cursor = self.cursor
        cursor.execute(list_sql)
        rows = cursor.fetchall()
        pending = []
        for row in rows:
            text = row[0]
            if not text:
                continue
            for number in self._ORDER_NO_PATTERN.findall(text):
                # NOTE(review): the statement text is blank in the source;
                # presumably it filters on `number` -- when restoring it, bind
                # the number as a query parameter rather than formatting it in.
                cursor.execute(finished_sql)
                if not cursor.rowcount:
                    pending.append(number)
        return pending

    def repleni_orderno(self):
        """Return the unfinished order numbers from the first source query.

        :return: list of order-number strings; empty when the lookup fails
        """
        # TODO: both statements are blank in the source and must be filled in.
        sql = ""
        finished_sql = ""
        try:
            return self._pending_order_numbers(sql, finished_sql)
        except Exception as exc:
            print('Failed to load order numbers: %s' % exc)
            return []

    def shipping_orderno(self):
        """Return the unfinished order numbers of both sources, merged.

        The numbers from :meth:`repleni_orderno` come first, followed by the
        ones found by this method's own query.

        :return: list of order-number strings; empty when the lookup fails
        """
        # TODO: both statements are blank in the source and must be filled in.
        sql = ""
        finished_sql = ""
        orderno = self.repleni_orderno()
        try:
            orderno.extend(self._pending_order_numbers(sql, finished_sql))
        except Exception as exc:
            print('Failed to load order numbers: %s' % exc)
            return []
        return orderno