jerryjunpy · jerryjunpy · Nov 3, 2017 · Nov 3, 2017 · Nov 3, 2017 · Nov 3, 2017
diff --git a/a.md b/a.md
diff --git a/download.py b/download.py
@@ -0,0 +1,110 @@
+import random
+import requests
+
+
+class Download:
+    """Fetch web pages using a randomly chosen User-Agent and HTTP proxy."""
+
+    def __init__(self):
+        """Set up the fixed header values and the pools each request draws from."""
+        # Browser User-Agent strings; one is picked at random for every attempt.
+        self.user_agent_list = [
+            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
+            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
+            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
+            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
+            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
+            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
+            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
+            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
+            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
+            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
+            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
+            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
+            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
+            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
+            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
+            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
+            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
+            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0",
+            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
+        ]
+        self.head_connection = ['keep-alive']
+        self.head_accept_language = ['zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3']
+        self.head_accept = ['text/css,*/*;q=0.1']
+        # Hard-coded "host:port" proxies; one is picked at random for every attempt.
+        self.proxy = [
+            '106.5.173.163:3276',
+            '118.117.138.173:2645',
+            '111.74.232.220:9756',
+            '60.17.248.204:2121',
+            '116.208.96.24:3154',
+            '100.18.21.221:8110',
+            '117.68.145.78:2644',
+            '182.111.49.213:4162',
+            '36.34.14.53:6436',
+            '114.226.135.105:9287',
+            '222.189.89.180:5638',
+            '36.34.15.96:6436',
+            '117.57.170.138:3852',
+            '112.85.10.250:1131',
+            '115.219.76.29:2316',
+            '60.173.24.251:6890',
+            '117.71.152.248:2319',
+            '106.110.249.222:3456',
+            '60.187.145.145:2315',
+            '117.90.2.47:3217',
+            '222.163.253.2:2862',
+            '123.189.48.142:9706',
+            '60.160.186.100:7654',
+            '36.33.18.1:6436',
+            '171.215.203.35:2645',
+            '59.62.194.171:6344',
+            '114.99.22.214:6890',
+            '111.77.20.64:4162',
+            '182.100.162.23:4162',
+            '60.168.23.241:2644',
+            '42.54.231.82:3529',
+            '115.153.104.137:2314',
+            '117.68.242.119:2644',
+            '106.5.5.120:9756',
+            '100.18.25.49:8110',
+            '182.111.98.113:2314',
+            '49.67.138.134:2137',
+            '117.68.242.186:2644',
+            '223.215.149.202:2319',
+            '175.151.220.99:1767',
+            '183.145.53.113:2315',
+            '117.90.2.51:3217',
+            '36.45.194.35:3215',
+            '123.152.37.190:2682',
+            '117.70.137.207:6436',
+        ]
+
+    def _build_headers(self):
+        """Return request headers with a User-Agent drawn at random from the pool."""
+        return {
+            'Connection': self.head_connection[0],
+            'Accept': self.head_accept[0],
+            # The header names were previously misspelled ('Acccept-Language',
+            # 'Use-Agent'), so the random User-Agent was never actually applied.
+            'Accept-Language': self.head_accept_language[0],
+            'User-Agent': random.choice(self.user_agent_list),
+        }
+
+    def get_url(self, url, timeout, num_retries=3):
+        """
+        GET ``url`` through a random proxy, retrying when the request fails.
+
+        Every attempt draws a fresh User-Agent and a fresh proxy.
+
+        :param url: address to fetch
+        :param timeout: timeout in seconds, passed straight to ``requests.get``
+        :param num_retries: extra attempts allowed after the first one fails
+        :return: the ``requests`` response, or None when every attempt failed
+        """
+        # One initial attempt plus up to ``num_retries`` retries; a negative
+        # ``num_retries`` still gets the single initial attempt.
+        for retries_left in range(max(num_retries, 0), -1, -1):
+            headers = self._build_headers()
+            # Only the 'http' scheme is mapped, so only http:// URLs go through the proxy.
+            proxies = {'http': random.choice(self.proxy)}
+            try:
+                return requests.get(
+                    url,
+                    timeout=timeout,
+                    headers=headers,
+                    proxies=proxies,
+                )
+            except requests.RequestException:
+                print("获取网页出错")
+                if retries_left > 0:
+                    print('获取页面倒数第%s次' % retries_left)
+        return None
diff --git a/kuaidi_1.py b/kuaidi_1.py
@@ -0,0 +1,85 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+from download import Download
+from orderno import Orderno
+import time
+import datetime
+
+
+class Kuaidu:
+    """Crawl courier tracking entries from kuaidi.com and store them in MySQL."""
+
+    def __init__(self):
+        """Prepare the URL pieces, the order-number source and the downloader."""
+        self.bash_url = 'http://www.kuaidi.com/index-ajaxselectcourierinfo-'
+        self.bash_html = '-.html'
+        self.orderno = Orderno()
+        # One downloader serves every request; its attributes are only set in its constructor.
+        self.download = Download()
+
+    def data(self):
+        """
+        Build the tracking URL for every pending order number and crawl it.
+
+        :return: None; each order number is handed to :meth:`get_text`
+        """
+        print('Begin')
+        # shipping_orderno() may return None when its query fails; treat that
+        # as "nothing to crawl" instead of crashing on len(None).
+        orderno_list = self.orderno.shipping_orderno() or []
+        print('今天要爬取的采购单数目: %s' % len(orderno_list))
+        for shippingorderno in orderno_list:
+            url = self.bash_url + shippingorderno + self.bash_html
+            self.get_text(shippingorderno, url)
+
+    def get_text(self, shippingorderno, url):
+        """
+        Download the tracking JSON for one order and save each tracking entry.
+
+        :param shippingorderno: courier order number the URL was built from
+        :param url: tracking URL for that order number
+        :return: None; entries are persisted through :meth:`save_date`
+        """
+        response = self.download.get_url(url, 5)
+        # None means the download failed; a requests response with an HTTP
+        # error status is falsy as well. Skip both.
+        if not response:
+            return
+        try:
+            payload = response.json()
+        except ValueError:
+            # The body was not valid JSON; there is nothing to save for this order.
+            payload = None
+        else:
+            print("开始获取%s的信息 " % shippingorderno)
+        # A payload without a 'data' list is treated as "no tracking entries".
+        entries = payload.get('data') if isinstance(payload, dict) else None
+        for entry in entries or []:
+            track_date = entry['time'].strip()
+            description = entry['context'].strip()
+            self.save_date(shippingorderno, description, track_date)
+        # Pause between requests so the site is not hammered.
+        time.sleep(0.8)
+
+    def save_date(self, shippingorderno, description, track_date):
+        """
+        Insert one tracking entry unless an identical one is already stored.
+
+        :param shippingorderno: courier order number
+        :param description: tracking text scraped from the site
+        :param track_date: timestamp string reported for the tracking event
+        :return: None
+        """
+        db = self.orderno.db
+        cursor = self.orderno.cursor
+        # ``description`` is scraped text and may contain quotes, so the lookup
+        # is parameterized rather than formatted into the SQL string.
+        cursor.execute(
+            'SELECT 1 FROM shippingtrackdetail '
+            'WHERE shippingorderno=%s AND description=%s LIMIT 1',
+            (shippingorderno, description),
+        )
+        if cursor.fetchone() is not None:
+            # Entries that already exist are not stored again.
+            print('%s:%s 已经存在' % (shippingorderno, description))
+            return
+
+        data = {
+            'shippingorderno': shippingorderno,
+            'description': description,
+            'create_date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+            'track_date': track_date,
+        }
+        table = 'shippingtrackdetail'
+        keys = ', '.join(data)
+        values = ', '.join(['%s'] * len(data))
+        sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(table=table, keys=keys, values=values)
+        try:
+            cursor.execute(sql, tuple(data.values()))
+            db.commit()
+        except Exception as exc:
+            # Best effort: report the failure, undo the partial write, keep crawling.
+            print('Failed: %s' % exc)
+            db.rollback()
+        else:
+            # Report success only after the commit actually went through.
+            print('%s:%s save successful' % (shippingorderno, description))
diff --git a/orderno.py b/orderno.py
@@ -0,0 +1,76 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+import pymysql
+import re
+
+
+class Orderno:
+    """Collect the courier order numbers whose tracking info still has to be crawled."""
+
+    def __init__(self):
+        """Open the MySQL connection and the cursor used by the queries below."""
+        # Host, credentials and database name are placeholders to fill in before running.
+        self.db = pymysql.connect(host='your dress', user='username', password='password', port=3306, db='',
+                                  charset='utf8')
+        self.cursor = self.db.cursor()
+
+    def _pending_numbers(self, select_sql, exists_sql):
+        """
+        Extract order numbers from a query result and drop the finished ones.
+
+        Runs ``select_sql``, pulls every run of 8 or more digits out of the
+        first column of each row, and keeps a number only when ``exists_sql``
+        reports no matching row.
+
+        :param select_sql: query whose first column holds text containing order numbers
+        :param exists_sql: query run once per number to check whether it is already done
+        :return: list of order-number strings, in the order they were found
+        """
+        cursor = self.cursor
+        cursor.execute(select_sql)
+        # Read all rows first: the per-number check below reuses the same cursor.
+        texts = [row[0] for row in cursor.fetchall() if row[0]]
+        pending = []
+        for text in texts:
+            for number in re.findall(r'[0-9]{8,}', text):
+                # Orders that are already completed are not queried again.
+                cursor.execute(exists_sql)
+                if not cursor.rowcount:
+                    pending.append(number)
+        return pending
+
+    def repleni_orderno(self):
+        """
+        Get the list of pending order numbers from the first source query.
+
+        :return: list of order-number strings; empty when the database query fails
+        """
+        # NOTE(review): both statements are blank placeholders in this file.
+        # The existence check presumably needs to filter on the extracted
+        # number - pass it as a query parameter rather than formatting it in.
+        select_sql = ""
+        exists_sql = ""
+        try:
+            return self._pending_numbers(select_sql, exists_sql)
+        except pymysql.MySQLError as exc:
+            # Report the failure and return an empty list so callers can
+            # still iterate or take len() of the result.
+            print('repleni_orderno query failed: %s' % exc)
+            return []
+
+    def shipping_orderno(self):
+        """
+        Get the pending order numbers from the second source and merge both lists.
+
+        :return: the numbers from :meth:`repleni_orderno` followed by the ones
+                 found here; empty when both database queries fail
+        """
+        # NOTE(review): blank placeholders, see repleni_orderno().
+        select_sql = ""
+        exists_sql = ""
+        try:
+            shipping = self._pending_numbers(select_sql, exists_sql)
+        except pymysql.MySQLError as exc:
+            print('shipping_orderno query failed: %s' % exc)
+            shipping = []
+        return self.repleni_orderno() + shipping
+