Error in user YAML: (<unknown>): could not find expected ':' while scanning a simple key at line 3 column 1

---
- oeasy Python 0565
- 这是 oeasy 系统化 Python 教程，从基础一步步讲，扎实、完整、不跳步。愿意花时间学，就能真正学会。
- 本教程同步发布在：
- 个人网站： `https://oeasy.org`
- 蓝桥云课： `https://www.lanqiao.cn/courses/3584`
- GitHub： `https://github.com/overmind1980/oeasy-python-tutorial`
- Gitee： `https://gitee.com/overmind1980/oeasypython`
---

GoPup

微博

import pandas as pd
import time
from DrissionPage import WebPage

# Weibo keyword search crawl: walk the paginated search results for `keyword`
# inside the [begindate, enddate] window and save nickname / time / content of
# every post to an Excel file.
keyword = '春晚'
page = 10000  # upper bound on the number of result pages to request
begindate = '2025-01-28'
enddate = '2025-01-29'
data = []

# Build the page object once and reuse it for every request instead of
# constructing a new WebPage on each loop iteration.
wp = WebPage()
try:
    for pa in range(1, page + 1):
        wp.get(f'https://s.weibo.com/weibo?q={keyword}&typeall=1&suball=1&timescope=custom%3A{begindate}-20%3A{enddate}-14&Refer=g&page={pa}')
        # Each search hit is a div carrying action-type="feed_list_item".
        cars = wp.eles('xpath://div[@action-type="feed_list_item"]')

        # A page without any post means there is nothing further to collect;
        # stop instead of requesting the remaining pages up to `page`.
        if not cars:
            print(f'No posts found on page {pa}; stopping.')
            break

        for i in cars:
            nickname = i.ele('xpath://a[@class="name"]').text
            shijian = i.ele('xpath://div[@class="from"]/a').text
            content = i.ele('xpath://p[@node-type="feed_list_content"]').text
            data.append([nickname, shijian, content])
        # Pause between pages to keep the request rate low.
        time.sleep(1)
        print(f'爬完了第{pa}页')
finally:
    # Persist whatever was collected, even when the crawl is interrupted or
    # raises part-way through, so earlier pages are not lost.
    df = pd.DataFrame(data, columns=['Nickname', 'Time', 'Content'])
    df.to_excel('weibo_scraped_data.xlsx', index=False)
    print("Data saved to 'weibo_scraped_data.xlsx'")

抖音

from DrissionPage import ChromiumPage
from time import sleep
import datetime
import csv

# Douyin comment crawl: scroll a video page, capture the comment-list API
# responses the page triggers, and write one CSV row per comment.
#
# The CSV is opened in a `with` block so the handle is flushed and closed
# even if the crawl raises part-way through.
with open('data.csv', mode='w', encoding='utf-8-sig', newline='') as f:
    csv_writer = csv.DictWriter(f, fieldnames=['昵称', '地区', '时间', '评论内容', '点赞数'])
    csv_writer.writeheader()

    # Create a ChromiumPage instance
    driver = ChromiumPage()

    # Start listening before navigating so requests whose URL contains the
    # comment-list path are captured.
    driver.listen.start('aweme/v1/web/comment/list/')

    # Open the Douyin video page
    driver.get('https://www.douyin.com/user/self?modal_id=7362514241460407589&showTab=favorite_collection')

    for page in range(50):
        print(f'正在采集第{page+1}页的数据内容')
        # Scrolling to the bottom makes the page request the next batch.
        driver.scroll.to_bottom()

        resp = driver.listen.wait()

        # The response body is indexed like a dict below.
        json_data = resp.response.body
        comments = json_data['comments']
        for comment in comments:
            text = comment['text']
            nickname = comment['user']['nickname']
            create_time = comment['create_time']
            digg_count = comment['digg_count']
            # create_time is passed to fromtimestamp, i.e. treated as a
            # POSIX timestamp and rendered in local time.
            date = str(datetime.datetime.fromtimestamp(create_time))
            # NOTE(review): presumably some comments carry no region label;
            # fall back to an empty string rather than aborting the crawl.
            ip_label = comment.get('ip_label', '')
            dit = {
                '昵称': nickname,
                '地区': ip_label,
                '时间': date,
                '评论内容': text,
                '点赞数': digg_count
            }

            csv_writer.writerow(dit)
            print(dit)

租房

import csv
import random
import time

from lxml import etree
import requests
from retry.api import retry_call
from DrissionPage import WebPage, ChromiumOptions


def open_web():
    """Create a WebPage driven by a maximized, non-headless Chromium.

    Returns:
        A ``WebPage`` built from those options with ``timeout=10``.
    """
    options = ChromiumOptions()
    # Visible, maximized window: get_cookies() may ask the user to solve a
    # verification challenge in it.
    options.set_argument('--start-maximized')
    options.headless(False)
    return WebPage(chromium_options=options, timeout=10)


def get_cookies():
    """Open a listing page in a browser and return its cookies as a header string.

    If the page shows the human-verification notice, the function blocks on
    ``input()`` until the user confirms the challenge was solved in the
    browser window.

    Returns:
        The cookies joined as ``name=value; name=value``, suitable for a
        ``cookie`` request header.
    """
    driver = open_web()
    try:
        driver.get('https://bj.zu.ke.com/zufang/BJ2041545568089538560.html')
        # Give the page a moment to load before inspecting its HTML.
        time.sleep(2)
        if '本次访问已触发人机验证，请按指示操作' in driver.html:
            input('请在页面端完成验证码验证后，点击回车继续抓取数据：')

        return '; '.join(
            f'{c.get("name")}={c.get("value")}' for c in driver.cookies()
        )
    finally:
        # Always release the browser page, including when loading the page
        # or reading the cookies raises.
        driver.close()


def retry_request(crawl_url, req_type='get', retry_times=6):
    """Fetch *crawl_url* with retries and return the response text.

    Each attempt sends the module-level ``cookies`` string as the ``cookie``
    header. When the response contains the human-verification notice, the
    cookies are refreshed through ``get_cookies()`` and the attempt counts
    as failed.

    Args:
        crawl_url: URL to request.
        req_type: Name of the ``requests`` function to call (e.g. ``'get'``).
        retry_times: Maximum number of attempts. Zero or a negative value
            means no request is made at all.

    Returns:
        The response text of the first attempt that returns status 200 with
        a non-empty body, or the placeholder ``'<html></html>'`` when every
        attempt fails. The result is always a string, so callers can pass it
        straight to an HTML parser.
    """
    global cookies
    # Counting down over a range (instead of `while retry_times`) guarantees
    # termination for negative values and a string result for zero.
    for attempts_left in range(retry_times - 1, -1, -1):
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
                "cookie": cookies
            }
            # tries=1: retry_call makes a single call here; the retrying is
            # done by this loop.
            resp = retry_call(getattr(requests, req_type), fargs=[crawl_url],
                              fkwargs={"timeout": 10, "headers": headers}, delay=0, tries=1)
            if '本次访问已触发人机验证，请按指示操作' in resp.text:
                cookies = get_cookies()
                raise ValueError('触发验证码，已完成验证后重新获取cookies！！！')

            if resp.status_code == 200 and resp.content:
                return resp.text
        except Exception as e:
            # Best-effort fetch: report the failure and move on to the next
            # attempt instead of aborting the crawl.
            print(e)
        if attempts_left:
            # Random back-off between attempts; skipped after the last one.
            time.sleep(random.randint(1, 3))
    return '<html></html>'


def _detail_field(detail_tree, label):
    """Return the text of the ``<li>`` containing *label*, with the label removed."""
    texts = detail_tree.xpath(f'//li[contains(text(), "{label}")]//text()')
    return ''.join(texts).replace(label, '')


def crawl_rent_house():
    """Crawl listing pages 1-100 and append one CSV row per rental listing.

    For every listing on a page the detail page is fetched as well. Each row
    holds: detail URL, title, price, first description label, area,
    orientation, layout, lease term, floor and tags. Rows are appended to
    ``楼盘租房房源信息.csv`` one at a time, so rows already collected remain
    on disk if the crawl stops early.
    """
    main_xpath = './div[@class="content__list--item--main"]'
    for page in range(1, 101):
        print(f'抓取第：{page} 页出租房屋！！！')
        page_href = f'https://bj.zu.ke.com/zufang/pg{page}/'
        page_html = retry_request(page_href)
        house_items = etree.HTML(page_html).xpath('.//div[@class="content__list--item"]')

        for item in house_items:
            hrefs = item.xpath('./*[@class="content__list--item--main"]/p[1]/a[1]/@href')
            if not hrefs:
                # An item without a detail link cannot be followed; skip it
                # rather than letting an IndexError abort the whole crawl.
                continue
            house_href = 'https://bj.zu.ke.com' + hrefs[0]

            title = ''.join(item.xpath('./*[@class="content__list--item--main"]/p[1]/a[1]//text()')).strip()
            rent_price = ''.join(item.xpath(main_xpath + '/*[@class="content__list--item-price"]//text()')).strip()
            house_info = ''.join(item.xpath(main_xpath + '/*[@class="content__list--item--des"]//text()')).strip()

            # The description is a '/'-separated list; keep the parts longer
            # than one character, with inner spaces removed.
            house_labels = []
            if '/' in house_info:
                house_labels = [
                    part.strip().replace(' ', '')
                    for part in house_info.split('/')
                    if len(part.strip()) > 1
                ]
            park_addr = house_labels[0] if house_labels else ''

            # Parse the detail page once and reuse the tree for every field.
            detail_tree = etree.HTML(retry_request(house_href))
            house_area = _detail_field(detail_tree, '面积：')
            forward = _detail_field(detail_tree, '朝向：')
            rent_time = _detail_field(detail_tree, '租期：')
            floor = _detail_field(detail_tree, '楼层：')
            house_tags = ' '.join(
                tag for tag in detail_tree.xpath('//*[@class="content__aside--tags"]/i//text()')
                if len(tag) > 1
            )

            # Layout: the first label mentioning 室, 厅 and 卫 together.
            hx = next(
                (label for label in house_labels
                 if '室' in label and '厅' in label and '卫' in label),
                '',
            )

            data = [house_href, title, rent_price, park_addr, house_area, forward, hx, rent_time, floor, house_tags]
            print(data)
            with open('楼盘租房房源信息.csv', 'a', encoding='utf-8', newline='', errors='ignore') as fl:
                csv.writer(fl).writerow(data)


if __name__ == '__main__':
    # retry_request() reads the module-level `cookies` (and reassigns it
    # when the verification notice appears), so it must be set before the
    # crawl starts.
    cookies = get_cookies()
    crawl_rent_house()

本文来自 oeasy Python 系统教程。
想完整、扎实学 Python，
搜索 oeasy 即可。

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

GoPup

微博

抖音

租房

FilesExpand file tree

0565.md

Latest commit

History

0565.md

File metadata and controls

GoPup

微博

抖音

租房