Commit b9935b1

replace scrape_search max_videos param with max_pages and fix search scraper sessions
1 parent 0ba443e commit b9935b1

1 file changed: +18 −16 lines changed

tiktok-scraper/tiktok.py

Lines changed: 18 additions & 16 deletions
@@ -185,8 +185,13 @@ async def scrape_profiles(urls: List[str]) -> List[Dict]:
 
 def parse_search(response: ScrapeApiResponse) -> List[Dict]:
     """parse search data from the API response"""
-    data = json.loads(response.scrape_result["content"])
-    search_data = data["data"]
+    try:
+        data = json.loads(response.scrape_result["content"])
+        search_data = data["data"]
+    except Exception as e:
+        log.error(f"Failed to parse JSON from search API response: {e}")
+        return None
+
     parsed_search = []
     for item in search_data:
         if item["type"] == 1:  # get the item if it was item only
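The try/except above returns None whenever the search API hands back something other than the expected JSON (for example an HTML verification page). A minimal sketch of that failure mode, stdlib only; the content string stands in for response.scrape_result["content"]:

import json

def parse_or_none(content: str):
    """Mimics the guarded parse above: JSON payload -> data list, anything else -> None."""
    try:
        data = json.loads(content)
        return data["data"]
    except Exception as e:  # JSONDecodeError, KeyError, ...
        print(f"Failed to parse JSON from search API response: {e}")
        return None

print(parse_or_none('{"data": [{"type": 1}]}'))         # [{'type': 1}]
print(parse_or_none("<html>verification page</html>"))  # None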
@@ -212,7 +217,7 @@ def parse_search(response: ScrapeApiResponse) -> List[Dict]:
 
 async def obtain_session(url: str) -> str:
     """create a session to save the cookies and authorize the search API"""
-    session_id = "tiktok_search_session"
+    session_id = str(uuid.uuid4().hex)
     await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG, render_js=True, session=session_id))
     return session_id
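Switching from the fixed "tiktok_search_session" name to a uuid means concurrent search runs no longer share one cookie jar. (uuid.uuid4().hex already returns a str, so the outer str() is redundant but harmless.) A small sketch of the guarantee:

import uuid

def new_session_id() -> str:
    return uuid.uuid4().hex  # 32-char hex string, unique per call

a, b = new_session_id(), new_session_id()
assert a != b  # two runs -> two independent sessions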

@@ -265,7 +270,8 @@ def form_api_url(cursor: int):
     ]
     async for response in SCRAPFLY.concurrent_scrape(_other_pages):
         data = parse_search(response)
-        search_data.extend(data)
+        if data is not None:
+            search_data.extend(data)
 
     log.success(f"scraped {len(search_data)} from the search API from the keyword {keyword}")
     return search_data
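The None guard matters because list.extend(None) raises TypeError, so a single unparseable response previously crashed the whole concurrent loop. Verifiable in isolation:

search_data = []
search_data.extend([{"id": 1}])  # normal case
try:
    search_data.extend(None)     # what a failed parse used to trigger
except TypeError as e:
    print(e)  # 'NoneType' object is not iterable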
@@ -288,11 +294,11 @@ def parse_channel(videos: List[Dict]) -> List[Dict]:
         parsed_data.append(result)
     return parsed_data
 
-async def scrape_channel(url: str, max_videos: int = 100, max_videos_per_request: int = 18) -> List[Dict]:
+async def scrape_channel(url: str, max_pages: int = 5, max_videos_per_request: int = 18) -> List[Dict]:
     """scrape video data from a channel by calling the item_list API directly
     Args:
         url (str): The channel URL to scrape.
-        max_videos (int, optional): Maximum total number of videos to fetch. Defaults to 500.
+        max_pages (int, optional): Maximum number of pages to fetch. Defaults to 5.
         max_videos_per_request (int, optional): Number of videos to request per API call.
             recommend to be within (10, 20). Some channels may fail if this value is set higher.
     """
@@ -362,13 +368,14 @@ def build_api_url(cursor: int = 0) -> str:
     all_videos = []
     cursor = 0
     has_more = True
+    current_page = 0
 
     # Create a session to maintain cookies
     session_id = "tiktok_channel_session"
-    log.info(f"starting video fetch loop, max_videos={max_videos}")
+    log.info(f"starting video fetch loop, max_pages={max_pages}")
 
-    while has_more and len(all_videos) < max_videos:
-        log.info(f"fetching videos batch, cursor: {cursor}, current total: {len(all_videos)}")
+    while has_more and current_page < max_pages:
+        log.info(f"fetching videos batch, page: {current_page + 1}/{max_pages}, cursor: {cursor}, current total: {len(all_videos)}")
 
         api_response = await SCRAPFLY.async_scrape(
             ScrapeConfig(
@@ -393,16 +400,11 @@ def build_api_url(cursor: int = 0) -> str:
             # Update cursor for next page
             has_more = data.get("hasMore", False)
             cursor = data.get("cursor", 0)
-            log.debug(f"hasMore={has_more}, next cursor={cursor}")
+            current_page += 1
+            log.debug(f"hasMore={has_more}, next cursor={cursor}, current_page={current_page}")
         else:
             log.warning("no videos found in response, stopping pagination")
             break
-
-        # Stop if we've reached the desired count
-        if len(all_videos) >= max_videos:
-            all_videos = all_videos[:max_videos]
-            log.info(f"reached max_videos limit, truncating to {max_videos}")
-            break
 
     log.info(f"parsing {len(all_videos)} videos")
     # Parse the video data
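With the truncation block gone, the cap is now expressed purely in pages: at most max_pages × max_videos_per_request videos per run (5 × 18 = 90 with the defaults). A simplified skeleton of the reworked loop; fetch_page is a stand-in for the Scrapfly call, and its (videos, has_more, cursor) return shape is an assumption:

def paginate(fetch_page, max_pages: int = 5):
    all_videos, cursor, has_more, current_page = [], 0, True, 0
    while has_more and current_page < max_pages:
        videos, has_more, cursor = fetch_page(cursor)  # stand-in for the item_list API call
        if not videos:
            break  # mirrors the "no videos found" warning path
        all_videos.extend(videos)
        current_page += 1
    return all_videos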
