Skip to content

Latest commit

 

History

History
200 lines (163 loc) · 7 KB

File metadata and controls

200 lines (163 loc) · 7 KB
Error in user YAML: (<unknown>): could not find expected ':' while scanning a simple key at line 3 column 1
---
- oeasy Python 0565
- 这是 oeasy 系统化 Python 教程,从基础一步步讲,扎实、完整、不跳步。愿意花时间学,就能真正学会。
本教程同步发布在: 

     个人网站: `https://oeasy.org` 
     蓝桥云课: `https://www.lanqiao.cn/courses/3584` 
     GitHub: `https://github.com/overmind1980/oeasy-python-tutorial` 
     Gitee: `https://gitee.com/overmind1980/oeasypython` 
---

GoPup

微博

import pandas as pd
import time
from DrissionPage import WebPage

# Search parameters for the Weibo keyword search.
keyword = '春晚'
page = 10000  # upper bound on the number of result pages to request
begindate = '2025-01-28'
enddate = '2025-01-29'
data = []

# Build the browser page object once and reuse it for every result page;
# constructing it inside the loop was loop-invariant work.
wp = WebPage()
for pa in range(1, page + 1):
    # NOTE(review): the "-20" / "-14" suffixes after the dates look like
    # hour-of-day bounds for the custom time scope -- confirm against the site.
    wp.get(f'https://s.weibo.com/weibo?q={keyword}&typeall=1&suball=1&timescope=custom%3A{begindate}-20%3A{enddate}-14&Refer=g&page={pa}')
    # Every search hit is a div tagged with action-type="feed_list_item".
    cars = wp.eles('xpath://div[@action-type="feed_list_item"]')

    # A page without any result cards means there is nothing left to collect
    # (or the site stopped serving results), so stop instead of requesting
    # the remaining pages up to the upper bound.
    if not cars:
        break

    for i in cars:
        nickname = i.ele('xpath://a[@class="name"]').text
        shijian = i.ele('xpath://div[@class="from"]/a').text  # post time text
        content = i.ele('xpath://p[@node-type="feed_list_content"]').text
        # One row per post: author, time, body text.
        data.append([nickname, shijian, content])
    # Pause between pages to avoid hitting the site too quickly.
    time.sleep(1)
    print(f'爬完了第{pa}页')

# Convert the collected rows into a DataFrame.
df = pd.DataFrame(data, columns=['Nickname', 'Time', 'Content'])

# Save the DataFrame to an Excel file.
df.to_excel('weibo_scraped_data.xlsx', index=False)

# Print a message when done.
print("Data saved to 'weibo_scraped_data.xlsx'")

抖音

from DrissionPage import ChromiumPage
from time import sleep
import datetime
import csv

# Open the output CSV in a context manager so the file is flushed and closed
# even if the scrape fails part-way (the handle was previously never closed).
with open('data.csv', mode='w', encoding='utf-8-sig', newline='') as f:
    csv_writer = csv.DictWriter(f, fieldnames=['昵称','地区','时间','评论内容','点赞数'])
    csv_writer.writeheader()

    # Create a ChromiumPage instance.
    driver = ChromiumPage()

    # Start capturing network responses whose URL contains the comment-list path.
    driver.listen.start('aweme/v1/web/comment/list/')

    # Open the Douyin video page.
    driver.get('https://www.douyin.com/user/self?modal_id=7362514241460407589&showTab=favorite_collection')

    for page in range(50):
        print(f'正在采集第{page+1}页的数据内容')
        # Scrolling to the bottom is what makes the page request more comments.
        driver.scroll.to_bottom()

        # Block until the next captured comment-list response arrives.
        resp = driver.listen.wait()

        json_data = resp.response.body
        # Treat a null "comments" value as an empty batch instead of failing
        # while iterating over None.
        comments = json_data['comments'] or []
        for index in comments:
            text = index['text']
            nickname = index['user']['nickname']
            create_time = index['create_time']
            digg_count = index['digg_count']
            # create_time is passed to fromtimestamp, i.e. handled as a Unix
            # timestamp and rendered in local time.
            date = str(datetime.datetime.fromtimestamp(create_time))
            ip_label = index['ip_label']
            dit = {
                '昵称': nickname,
                '地区': ip_label,
                '时间': date,
                '评论内容': text,
                '点赞数': digg_count,
            }

            csv_writer.writerow(dit)
            print(dit)

租房

import csv
import random
import time

from lxml import etree
import requests
from retry.api import retry_call
from DrissionPage import WebPage, ChromiumOptions


def open_web():
    """Create a browser page object with a visible, maximized window.

    Returns:
        A ``WebPage`` built from the configured ``ChromiumOptions`` with a
        10-second timeout.
    """
    options = ChromiumOptions()
    options.set_argument('--start-maximized')
    # Headless mode is switched off explicitly.
    options.headless(False)
    return WebPage(chromium_options=options, timeout=10)


def get_cookies():
    """Open a listing page in the browser and return its cookies as a header string.

    If the page shows the human-verification notice, the function waits for
    the operator to solve it and press Enter before reading the cookies.

    Returns:
        The browser cookies formatted as ``name=value`` pairs joined by ``'; '``.
    """
    browser = open_web()
    browser.get('https://bj.zu.ke.com/zufang/BJ2041545568089538560.html')
    time.sleep(2)

    captcha_shown = '本次访问已触发人机验证,请按指示操作' in browser.html
    if captcha_shown:
        # Blocks until the operator confirms the challenge has been solved.
        input('请在页面端完成验证码验证后,点击回车继续抓取数据:')

    pairs = []
    for entry in browser.cookies():
        pairs.append(f'{entry.get("name")}={entry.get("value")}')
    browser.close()

    return '; '.join(pairs)


def retry_request(crawl_url, req_type='get', retry_times=6):
    """Fetch *crawl_url* with retries and return the response text.

    Each attempt sends the module-level ``cookies`` string as the ``cookie``
    header. If the response contains the human-verification notice, the
    cookies are refreshed through ``get_cookies()`` and the attempt counts as
    failed. A random 1-3 second pause separates consecutive attempts.

    Args:
        crawl_url: URL to request.
        req_type: Name of the ``requests`` function to call (e.g. ``'get'``).
        retry_times: Maximum number of attempts. Zero or a negative value
            performs no request and yields the fallback document.

    Returns:
        The response body text on the first attempt that returns HTTP 200 with
        non-empty content; otherwise the placeholder ``'<html></html>'``.
    """
    global cookies
    # Counting down over a range bounds the loop for every input: a
    # non-positive retry_times gives an empty range rather than returning
    # None (zero) or never terminating (negative).
    for attempts_left in range(retry_times - 1, -1, -1):
        try:
            # Built per attempt because ``cookies`` may have been refreshed.
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
                "cookie": cookies
            }
            resp = retry_call(getattr(requests, req_type), fargs=[crawl_url],
                              fkwargs={"timeout": 10, "headers": headers}, delay=0, tries=1)
            if '本次访问已触发人机验证,请按指示操作' in resp.text:
                cookies = get_cookies()
                raise ValueError('触发验证码,已完成验证后重新获取cookies!!!')

            if resp.status_code == 200 and resp.content:
                return resp.text
        except Exception as e:
            # Best-effort fetch: report the failure and move on to the next attempt.
            print(e)
        if attempts_left:
            time.sleep(random.randint(1, 3))
    # Every attempt failed (or none was allowed): hand back an empty document
    # so callers can still parse the result.
    return '<html></html>'


def _detail_field(detail_tree, label):
    """Return the text of the ``<li>`` whose own text contains *label*, with the label removed."""
    texts = detail_tree.xpath(f'//li[contains(text(), "{label}")]//text()')
    return ''.join(texts).replace(label, '')


def crawl_rent_house():
    """Crawl rental listing pages 1-100 and append one CSV row per listing.

    For every listing on a result page the function reads the title, price,
    link and description, fetches the listing's detail page for area,
    orientation, lease term, floor and tags, prints the row and appends it to
    ``楼盘租房房源信息.csv``.
    """
    for page in range(1, 101):
        print(f'抓取第:{page} 页出租房屋!!!')
        page_href = f'https://bj.zu.ke.com/zufang/pg{page}/'
        page_html = retry_request(page_href)
        house_items = etree.HTML(page_html).xpath('.//div[@class="content__list--item"]')

        for item in house_items:
            title = ''.join(item.xpath('./*[@class="content__list--item--main"]/p[1]/a[1]//text()')).strip()
            rent_price = ''.join(item.xpath('./div[@class="content__list--item--main"]/*[@class="content__list--item-price"]//text()')).strip()
            house_href = 'https://bj.zu.ke.com' + item.xpath('./*[@class="content__list--item--main"]/p[1]/a[1]/@href')[0]
            house_info = ''.join(item.xpath('./div[@class="content__list--item--main"]/*[@class="content__list--item--des"]//text()')).strip()
            # The description is a "/"-separated list; keep segments longer
            # than one character, with inner spaces removed.
            house_labels = [i.strip().replace(' ', '') for i in house_info.split('/') if len(i.strip()) > 1] if '/' in house_info else []
            park_addr = house_labels[0] if house_labels else ''

            # Parse the detail page once and reuse the tree for every field
            # instead of re-parsing the same HTML for each lookup.
            detail_tree = etree.HTML(retry_request(house_href))
            house_area = _detail_field(detail_tree, '面积:')
            forward = _detail_field(detail_tree, '朝向:')
            rent_time = _detail_field(detail_tree, '租期:')
            floor = _detail_field(detail_tree, '楼层:')
            house_tags = ' '.join(
                [i for i in detail_tree.xpath('//*[@class="content__aside--tags"]/i//text()') if len(i) > 1]
            )

            # Layout: the first label containing all three of 室, 厅 and 卫.
            hx = next(
                (value for value in house_labels
                 if ('室' in value) and ('厅' in value) and ('卫' in value)),
                '',
            )
            data = [house_href, title, rent_price, park_addr, house_area, forward, hx, rent_time, floor, house_tags]
            print(data)
            # Append per listing so rows already collected survive a later failure.
            with open('楼盘租房房源信息.csv', 'a', encoding='utf-8', newline='', errors='ignore') as fl:
                csv.writer(fl).writerow(data)


if __name__ == '__main__':
    # Bind the module-level ``cookies`` that retry_request() declares as a
    # global and sends with each request; it must be set before crawling starts.
    cookies = get_cookies()
    # Run the listing crawl.
    crawl_rent_house()


  • 本文来自 oeasy Python 系统教程。
  • 想完整、扎实学 Python,
  • 搜索 oeasy 即可。