Error in user YAML: (<unknown>): could not find expected ':' while scanning a simple key at line 3 column 1
---
- oeasy Python 0565
- 这是 oeasy 系统化 Python 教程,从基础一步步讲,扎实、完整、不跳步。愿意花时间学,就能真正学会。
本教程同步发布在:
个人网站: `https://oeasy.org`
蓝桥云课: `https://www.lanqiao.cn/courses/3584`
GitHub: `https://github.com/overmind1980/oeasy-python-tutorial`
Gitee: `https://gitee.com/overmind1980/oeasypython`
---
import pandas as pd
import time
from DrissionPage import WebPage
# Crawl Weibo search results for a keyword inside a custom date window and
# store (nickname, time, content) rows in an Excel workbook.

# Search parameters.
keyword = '春晚'
page = 10000  # highest result-page number that will be requested
begindate = '2025-01-28'
enddate = '2025-01-29'
data = []

# Create the page object once and reuse it for every request instead of
# constructing a new one per result page.
wp = WebPage()
for page_no in range(1, page + 1):
    # The date window has to be sent as "&timescope=custom:<begin>-20:<end>-14".
    # The earlier "×cope" was "&timescope" with "&times" collapsed into the
    # multiplication sign, which left the filter out of the query string.
    wp.get(
        f'https://s.weibo.com/weibo?q={keyword}&typeall=1&suball=1'
        f'&timescope=custom%3A{begindate}-20%3A{enddate}-14&Refer=g&page={page_no}'
    )
    # Select every element flagged as a feed list item on the result page.
    cards = wp.eles('xpath://div[@action-type="feed_list_item"]')
    for card in cards:
        nickname = card.ele('xpath://a[@class="name"]').text
        shijian = card.ele('xpath://div[@class="from"]/a').text
        content = card.ele('xpath://p[@node-type="feed_list_content"]').text
        data.append([nickname, shijian, content])
    # Pause between result pages.
    time.sleep(1)
    print(f'爬完了第{page_no}页')

# Convert the collected rows into a DataFrame and write it out.
df = pd.DataFrame(data, columns=['Nickname', 'Time', 'Content'])
df.to_excel('weibo_scraped_data.xlsx', index=False)
print("Data saved to 'weibo_scraped_data.xlsx'")
from DrissionPage import ChromiumPage
from time import sleep
import datetime
import csv
# Collect comments from a Douyin video page by listening for the comment-list
# API responses while scrolling, and write one CSV row per comment.
#
# The output file is managed by a context manager so it is flushed and closed
# when the crawl finishes or fails part-way; the previous bare open() never
# closed the handle.
with open('data.csv', mode='w', encoding='utf-8-sig', newline='') as f:
    csv_writer = csv.DictWriter(f, fieldnames=['昵称', '地区', '时间', '评论内容', '点赞数'])
    csv_writer.writeheader()

    # Create the browser page and start capturing comment-list responses
    # before navigating, so the first batch is not missed.
    driver = ChromiumPage()
    driver.listen.start('aweme/v1/web/comment/list/')
    driver.get('https://www.douyin.com/user/self?modal_id=7362514241460407589&showTab=favorite_collection')

    for page in range(50):
        print(f'正在采集第{page+1}页的数据内容')
        # Scroll to the bottom, then wait for the next captured response.
        driver.scroll.to_bottom()
        resp = driver.listen.wait()
        json_data = resp.response.body
        comments = json_data['comments']
        for comment in comments:
            # create_time is passed to fromtimestamp(), i.e. treated as a
            # Unix timestamp and rendered in local time.
            date = str(datetime.datetime.fromtimestamp(comment['create_time']))
            row = {
                '昵称': comment['user']['nickname'],
                '地区': comment['ip_label'],
                '时间': date,
                '评论内容': comment['text'],
                '点赞数': comment['digg_count'],
            }
            csv_writer.writerow(row)
            print(row)
import csv
import random
import time
from lxml import etree
import requests
from retry.api import retry_call
from DrissionPage import WebPage, ChromiumOptions
def open_web():
    """Build and return a browser-backed ``WebPage``.

    The page is configured through ``ChromiumOptions`` with the
    ``--start-maximized`` argument, headless mode switched off, and a
    timeout value of 10.
    """
    options = ChromiumOptions()
    options.set_argument('--start-maximized')
    options.headless(False)
    return WebPage(chromium_options=options, timeout=10)
def get_cookies():
    """Open a listing page in a browser and return its cookies as a header string.

    If the page HTML contains the human-verification notice, the function
    blocks on ``input()`` until the operator confirms the check has been
    completed in the browser window.

    Returns:
        The browser cookies joined as ``"name=value; name=value; ..."``.
    """
    driver = open_web()
    try:
        driver.get('https://bj.zu.ke.com/zufang/BJ2041545568089538560.html')
        # Give the page a moment to load before inspecting its HTML.
        time.sleep(2)
        if '本次访问已触发人机验证,请按指示操作' in driver.html:
            input('请在页面端完成验证码验证后,点击回车继续抓取数据:')
        return '; '.join(
            f'{cookie.get("name")}={cookie.get("value")}' for cookie in driver.cookies()
        )
    finally:
        # Close the page even when navigation or cookie extraction fails, so a
        # failed attempt does not leave it open.
        driver.close()
def retry_request(crawl_url, req_type='get', retry_times=6):
    """Fetch ``crawl_url`` and return the response text, retrying on failure.

    The request is sent with a fixed User-Agent and the module-level
    ``cookies`` string, which the caller must set before the first call.
    When the response contains the human-verification notice, ``cookies`` is
    refreshed through ``get_cookies()`` and the attempt counts as failed.

    Args:
        crawl_url: URL to request.
        req_type: Name of the ``requests`` function to call (e.g. ``'get'``).
        retry_times: Maximum number of attempts. Zero or a negative number
            means no attempt is made.

    Returns:
        The response text of the first attempt that yields status 200 with a
        non-empty body, or ``'<html></html>'`` when every attempt fails, so
        callers can always parse the result.
    """
    global cookies
    captcha_marker = '本次访问已触发人机验证,请按指示操作'
    # A bounded for-loop replaces the manual countdown, which returned None
    # for retry_times=0 and never terminated for negative values.
    for attempt in range(retry_times):
        try:
            # Rebuilt on every attempt because ``cookies`` may have been
            # refreshed by a previous captcha round.
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
                "cookie": cookies
            }
            resp = retry_call(getattr(requests, req_type), fargs=[crawl_url],
                              fkwargs={"timeout": 10, "headers": headers}, delay=0, tries=1)
            if captcha_marker in resp.text:
                cookies = get_cookies()
                # Raising routes this attempt through the common failure path.
                raise ValueError('触发验证码,已完成验证后重新获取cookies!!!')
            if resp.status_code == 200 and resp.content:
                return resp.text
        except Exception as e:
            # Best-effort crawl: report the failure and move on to the next attempt.
            print(e)
        # Back off before the next attempt, but not after the final one.
        if attempt < retry_times - 1:
            time.sleep(random.randint(1, 3))
    return '<html></html>'
def _detail_field(detail_tree, label):
    """Return the text of ``<li>`` nodes whose own text contains ``label``, minus the label."""
    texts = detail_tree.xpath(f'//li[contains(text(), "{label}")]//text()')
    return ''.join(texts).replace(label, '')


def crawl_rent_house():
    """Crawl rental listing pages 1-100 and append one CSV row per listing.

    For every listing card the function reads the title, price and
    description from the list page, then fetches the listing's detail page
    for area, orientation, lease term, floor and tags. Each row is appended
    to ``楼盘租房房源信息.csv`` as soon as it has been assembled.
    """
    for page in range(1, 101):
        print(f'抓取第:{page} 页出租房屋!!!')
        page_href = f'https://bj.zu.ke.com/zufang/pg{page}/'
        page_html = retry_request(page_href)
        house_items = etree.HTML(page_html).xpath('.//div[@class="content__list--item"]')
        for item in house_items:
            hrefs = item.xpath('./*[@class="content__list--item--main"]/p[1]/a[1]/@href')
            if not hrefs:
                # A card without a detail link cannot be followed; skip it
                # instead of failing with IndexError.
                continue
            house_href = 'https://bj.zu.ke.com' + hrefs[0]
            title = ''.join(item.xpath('./*[@class="content__list--item--main"]/p[1]/a[1]//text()')).strip()
            rent_price = ''.join(item.xpath('./div[@class="content__list--item--main"]/*[@class="content__list--item-price"]//text()')).strip()
            house_info = ''.join(item.xpath('./div[@class="content__list--item--main"]/*[@class="content__list--item--des"]//text()')).strip()

            # The description is split on "/" into labels; fragments of one
            # character or less are dropped.
            if '/' in house_info:
                house_labels = [
                    part.strip().replace(' ', '')
                    for part in house_info.split('/')
                    if len(part.strip()) > 1
                ]
            else:
                house_labels = []
            park_addr = house_labels[0] if house_labels else ''
            # Layout: the first label mentioning rooms, halls and bathrooms.
            hx = next(
                (label for label in house_labels
                 if '室' in label and '厅' in label and '卫' in label),
                '',
            )

            # Parse the detail page once and reuse the tree for every field.
            detail_tree = etree.HTML(retry_request(house_href))
            house_area = _detail_field(detail_tree, '面积:')
            forward = _detail_field(detail_tree, '朝向:')
            rent_time = _detail_field(detail_tree, '租期:')
            floor = _detail_field(detail_tree, '楼层:')
            house_tags = ' '.join(
                tag for tag in detail_tree.xpath('//*[@class="content__aside--tags"]/i//text()')
                if len(tag) > 1
            )

            data = [house_href, title, rent_price, park_addr, house_area, forward, hx, rent_time, floor, house_tags]
            print(data)
            # Append per row so rows already collected are on disk if the
            # crawl stops part-way.
            with open('楼盘租房房源信息.csv', 'a', encoding='utf-8', newline='', errors='ignore') as fl:
                csv.writer(fl).writerow(data)
if __name__ == '__main__':
    # retry_request() reads the module-level ``cookies`` name, so it has to be
    # populated before the crawl starts.
    cookies = get_cookies()
    crawl_rent_house()
- 本文来自 oeasy Python 系统教程。
- 想完整、扎实学 Python,
- 搜索 oeasy 即可。