Skip to content

Commit 05a1782

Browse files
authored
Merge pull request #764 from yangtao210/main
新增存储到mongoDB
2 parents 45ec4b4 + ef6948b commit 05a1782

File tree

25 files changed (+1381 -679 lines changed)

25 files changed (+1381 -679 lines changed)

config/base_config.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,7 @@
5555
CDP_HEADLESS = False
5656

5757
# 浏览器启动超时时间(秒)
58-
BROWSER_LAUNCH_TIMEOUT = 30
58+
BROWSER_LAUNCH_TIMEOUT = 60
5959

6060
# 是否在程序结束时自动关闭浏览器
6161
# 设置为False可以保持浏览器运行,便于调试

config/db_config.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,4 +42,19 @@
4242

4343
sqlite_db_config = {
4444
"db_path": SQLITE_DB_PATH
45+
}
46+
47+
# mongodb config
48+
MONGODB_HOST = os.getenv("MONGODB_HOST", "localhost")
49+
MONGODB_PORT = os.getenv("MONGODB_PORT", 27017)
50+
MONGODB_USER = os.getenv("MONGODB_USER", "")
51+
MONGODB_PWD = os.getenv("MONGODB_PWD", "")
52+
MONGODB_DB_NAME = os.getenv("MONGODB_DB_NAME", "media_crawler")
53+
54+
mongodb_config = {
55+
"host": MONGODB_HOST,
56+
"port": int(MONGODB_PORT),
57+
"user": MONGODB_USER,
58+
"password": MONGODB_PWD,
59+
"db_name": MONGODB_DB_NAME,
4560
}

media_platform/bilibili/core.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -497,11 +497,12 @@ async def launch_browser(
497497
"height": 1080
498498
},
499499
user_agent=user_agent,
500+
channel="chrome", # 使用系统的Chrome稳定版
500501
)
501502
return browser_context
502503
else:
503504
# type: ignore
504-
browser = await chromium.launch(headless=headless, proxy=playwright_proxy)
505+
browser = await chromium.launch(headless=headless, proxy=playwright_proxy, channel="chrome")
505506
browser_context = await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)
506507
return browser_context
507508

media_platform/kuaishou/core.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -333,10 +333,11 @@ async def launch_browser(
333333
proxy=playwright_proxy, # type: ignore
334334
viewport={"width": 1920, "height": 1080},
335335
user_agent=user_agent,
336+
channel="chrome", # 使用系统的Chrome稳定版
336337
)
337338
return browser_context
338339
else:
339-
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
340+
browser = await chromium.launch(headless=headless, proxy=playwright_proxy, channel="chrome") # type: ignore
340341
browser_context = await browser.new_context(
341342
viewport={"width": 1920, "height": 1080}, user_agent=user_agent
342343
)

media_platform/tieba/core.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -614,10 +614,11 @@ async def launch_browser(
614614
proxy=playwright_proxy, # type: ignore
615615
viewport={"width": 1920, "height": 1080},
616616
user_agent=user_agent,
617+
channel="chrome", # 使用系统的Chrome稳定版
617618
)
618619
return browser_context
619620
else:
620-
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
621+
browser = await chromium.launch(headless=headless, proxy=playwright_proxy, channel="chrome") # type: ignore
621622
browser_context = await browser.new_context(
622623
viewport={"width": 1920, "height": 1080}, user_agent=user_agent
623624
)

media_platform/weibo/core.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -343,10 +343,11 @@ async def launch_browser(
343343
"height": 1080
344344
},
345345
user_agent=user_agent,
346+
channel="chrome", # 使用系统的Chrome稳定版
346347
)
347348
return browser_context
348349
else:
349-
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
350+
browser = await chromium.launch(headless=headless, proxy=playwright_proxy, channel="chrome") # type: ignore
350351
browser_context = await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)
351352
return browser_context
352353

media_platform/zhihu/core.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -429,10 +429,11 @@ async def launch_browser(
429429
proxy=playwright_proxy, # type: ignore
430430
viewport={"width": 1920, "height": 1080},
431431
user_agent=user_agent,
432+
channel="chrome", # 使用系统的Chrome稳定版
432433
)
433434
return browser_context
434435
else:
435-
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
436+
browser = await chromium.launch(headless=headless, proxy=playwright_proxy, channel="chrome") # type: ignore
436437
browser_context = await browser.new_context(
437438
viewport={"width": 1920, "height": 1080}, user_agent=user_agent
438439
)

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ dependencies = [
1616
"httpx==0.28.1",
1717
"jieba==0.42.1",
1818
"matplotlib==3.9.0",
19+
"motor>=3.3.0",
1920
"opencv-python>=4.11.0.86",
2021
"pandas==2.2.3",
2122
"parsel==1.9.1",

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,4 @@ cryptography>=45.0.7
2424
alembic>=1.16.5
2525
asyncmy>=0.2.10
2626
sqlalchemy>=2.0.43
27+
motor>=3.3.0

store/bilibili/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,14 @@ class BiliStoreFactory:
2828
"db": BiliDbStoreImplement,
2929
"json": BiliJsonStoreImplement,
3030
"sqlite": BiliSqliteStoreImplement,
31+
"mongodb": BiliMongoStoreImplement,
3132
}
3233

3334
@staticmethod
3435
def create_store() -> AbstractStore:
3536
store_class = BiliStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
3637
if not store_class:
37-
raise ValueError("[BiliStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite ...")
38+
raise ValueError("[BiliStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb ...")
3839
return store_class()
3940

4041

0 commit comments

Comments
 (0)