|
1 | | -import re |
2 | | -from collections import Counter |
3 | | -from datetime import datetime |
4 | | -from typing import Any |
5 | | - |
6 | | -import pandas as pd |
7 | | -from tqdm.auto import tqdm |
8 | | - |
9 | | -from media_impact_monitor.util.cache import get |
10 | | -from media_impact_monitor.util.env import RAPIDAPI_KEY |
11 | | - |
# Shared request headers for every call to the "tiktok-scraper7" RapidAPI
# endpoint; RAPIDAPI_KEY is read from the environment by util.env.
headers = {
    "x-rapidapi-key": RAPIDAPI_KEY,
    "x-rapidapi-host": "tiktok-scraper7.p.rapidapi.com",
}
16 | | - |
17 | | - |
def get_videos_for_keywords(
    keywords: str, n: int, cursor: int = 0
) -> list[dict[str, Any]]:
    """
    Get videos for a given set of keywords.
    Problem: This returns max ~150 videos, even for very popular keywords.
    Use hashtag query to get more videos.
    """
    url = "https://tiktok-scraper7.p.rapidapi.com/feed/search"
    collected: list[dict[str, Any]] = []
    while True:
        query = {
            "keywords": keywords,
            "region": "us",  # location of the proxy server
            "count": 30,  # API maximum per page
            "cursor": cursor,
            "publish_time": "0",  # 0 - ALL 1 - Past 24 hours 7 - This week 30 - This month 90 - Last 3 months 180 - Last 6 months
            "sort_type": "0",  # 0 - Relevance 1 - Like count 3 - Date posted
        }
        data = get(url, headers=headers, params=query).json()["data"]
        collected.extend(data["videos"])
        cursor = data["cursor"]
        # Stop once the API is exhausted or we have paged past ~n items.
        if not (data["hasMore"] and cursor < n):
            return collected
42 | | - |
43 | | - |
def get_hashtag_suggestions(keywords: str) -> Counter:
    """Count the hashtags that occur in titles of videos matching *keywords*."""
    counts: Counter = Counter()
    for video in get_videos_for_keywords(keywords, n=100):
        # Hashtags are "#" followed by word characters, e.g. "#climate".
        counts.update(re.findall(r"#(\w+)", video["title"]))
    return counts
51 | | - |
52 | | - |
def get_hashtag_id(hashtag: str) -> str:
    """Resolve a hashtag ("challenge") name to its numeric TikTok challenge id."""
    response = get(
        "https://tiktok-scraper7.p.rapidapi.com/challenge/info",
        headers=headers,
        params={"challenge_name": hashtag},
    )
    return response.json()["data"]["id"]
60 | | - |
61 | | - |
def get_videos_for_hashtag_id(
    hashtag_id: str, n: int, cursor: int = 0, verbose: bool = True
) -> list[dict[str, Any]]:
    """Page through the challenge/posts endpoint, collecting roughly *n* videos."""
    url = "https://tiktok-scraper7.p.rapidapi.com/challenge/posts"
    collected: list[dict[str, Any]] = []
    while True:
        query = {
            "challenge_id": hashtag_id,
            "count": 20,  # API maximum per page
            "cursor": cursor,
        }
        data = get(url, headers=headers, params=query).json()["data"]
        collected.extend(data["videos"])
        cursor = data["cursor"]
        # Stop once the API is exhausted or we have paged past ~n items.
        if not (data["hasMore"] and cursor < n):
            return collected
        if verbose:
            print(cursor)
83 | | - |
84 | | - |
def get_videos_for_hashtag(
    hashtag: str, n: int, cursor: int = 0, verbose: bool = True
) -> list[dict[str, Any]]:
    """Fetch up to roughly *n* videos posted under *hashtag* (resolved to an id first)."""
    return get_videos_for_hashtag_id(
        get_hashtag_id(hashtag), n=n, cursor=cursor, verbose=verbose
    )
90 | | - |
91 | | - |
def get_video_history_for_hashtag(
    hashtag: str, n: int, verbose: bool = True
) -> pd.DataFrame:
    """
    Get video history for a hashtag.
    Returns a time series of views and posts.
    Views are computed by summing the views of all videos that were posted in a given day -- that is, the views do not correspond to the dates when the videos were actually viewed. It is recommended to just use posts, or comments (see `get_comment_history_for_hashtag`).
    """
    videos = get_videos_for_hashtag(hashtag, n=n, verbose=verbose)
    frame = pd.DataFrame(
        {
            "date": pd.to_datetime(
                [datetime.fromtimestamp(v["create_time"]) for v in videos]
            ),
            "id": [v["video_id"] for v in videos],
            "title": [v["title"] for v in videos],
            "views": [v["play_count"] for v in videos],
        }
    ).sort_values("date")
    # One row per calendar day: total views of videos posted that day, and
    # the number of posts (counted via the "id" column).
    daily = (
        frame.resample("1D", on="date")
        .agg({"views": "sum", "id": "count"})
        .rename(columns={"id": "posts"})
    )
    # Fill gaps so days without any posts appear as explicit zero rows.
    full_range = pd.date_range(start=daily.index.min(), end=daily.index.max())
    return daily.reindex(full_range).fillna(0)
123 | | - |
124 | | - |
def get_comments_for_video(
    video_id: str, n: int, cursor: int = 0
) -> list[dict[str, Any]]:
    """Page through a video's comment list, collecting roughly *n* comments."""
    url = "https://tiktok-scraper7.p.rapidapi.com/comment/list"
    collected: list[dict[str, Any]] = []
    while True:
        query = {
            "url": video_id,
            "count": 50,  # max: 50 (?)
            "cursor": cursor,
        }
        data = get(url, headers=headers, params=query).json()["data"]
        collected.extend(data["comments"])
        cursor = data["cursor"]
        # Stop once the API is exhausted or we have paged past ~n items.
        if not (data["hasMore"] and cursor < n):
            return collected
140 | | - |
141 | | - |
def get_comment_history_for_hashtag(
    hashtag: str, n_posts: int, n_comments: int, verbose: bool = True
) -> pd.DataFrame:
    """Weekly time series of comment counts on videos posted under *hashtag*."""
    videos = get_videos_for_hashtag(hashtag, n=n_posts, verbose=verbose)
    all_comments: list[dict[str, Any]] = []
    for video in tqdm(videos):
        # Skip the comment-list request entirely for videos with no comments.
        if video["comment_count"] > 0:
            all_comments.extend(
                get_comments_for_video(video["video_id"], n=n_comments)
            )
    comments_df = pd.DataFrame(
        {
            "date": [
                datetime.fromtimestamp(c["create_time"]) for c in all_comments
            ],
            "text": [c["text"] for c in all_comments],
            "video_id": [c["video_id"] for c in all_comments],
        }
    )
    # One row per week: number of comments created in that week.
    weekly = (
        comments_df.resample("1W", on="date")
        .agg({"text": "count"})
        .rename(columns={"text": "comments"})
    )
    # Fill gaps so weeks without any comments appear as explicit zero rows.
    full_range = pd.date_range(start=weekly.index.min(), end=weekly.index.max())
    return weekly.reindex(full_range).fillna(0)
| 1 | +# import re |
| 2 | +# from collections import Counter |
| 3 | +# from datetime import datetime |
| 4 | +# from typing import Any |
| 5 | + |
| 6 | +# import pandas as pd |
| 7 | +# from tqdm.auto import tqdm |
| 8 | + |
| 9 | +# from media_impact_monitor.util.cache import get |
| 10 | +# from media_impact_monitor.util.env import RAPIDAPI_KEY |
| 11 | + |
| 12 | +# headers = { |
| 13 | +# "x-rapidapi-key": RAPIDAPI_KEY, |
| 14 | +# "x-rapidapi-host": "tiktok-scraper7.p.rapidapi.com", |
| 15 | +# } |
| 16 | + |
| 17 | + |
| 18 | +# def get_videos_for_keywords( |
| 19 | +# keywords: str, n: int, cursor: int = 0 |
| 20 | +# ) -> list[dict[str, Any]]: |
| 21 | +# """ |
| 22 | +# Get videos for a given set of keywords. |
| 23 | +# Problem: This returns max ~150 videos, even for very popular keywords. |
| 24 | +# Use hashtag query to get more videos. |
| 25 | +# """ |
| 26 | +# url = "https://tiktok-scraper7.p.rapidapi.com/feed/search" |
| 27 | +# query = { |
| 28 | +# "keywords": keywords, |
| 29 | +# "region": "us", # location of the proxy server |
| 30 | +# "count": 30, # max: 30 |
| 31 | +# "cursor": cursor, |
| 32 | +# "publish_time": "0", # 0 - ALL 1 - Past 24 hours 7 - This week 30 - This month 90 - Last 3 months 180 - Last 6 months |
| 33 | +# "sort_type": "0", # 0 - Relevance 1 - Like count 3 - Date posted |
| 34 | +# } |
| 35 | +# response = get(url, headers=headers, params=query) |
| 36 | +# # print(response.json()) |
| 37 | +# data = response.json()["data"] |
| 38 | +# videos, cursor, has_more = data["videos"], data["cursor"], data["hasMore"] |
| 39 | +# if has_more and cursor < n: |
| 40 | +# videos.extend(get_videos_for_keywords(keywords=keywords, n=n, cursor=cursor)) |
| 41 | +# return videos |
| 42 | + |
| 43 | + |
| 44 | +# def get_hashtag_suggestions(keywords: str) -> Counter: |
| 45 | +# videos = get_videos_for_keywords(keywords, n=100) |
| 46 | +# titles = [video["title"] for video in videos] |
| 47 | +# hashtags = [re.findall(r"#(\w+)", title) for title in titles] |
| 48 | +# hashtags = [item for sublist in hashtags for item in sublist] |
| 49 | +# hashtag_counts = Counter(hashtags) |
| 50 | +# return hashtag_counts |
| 51 | + |
| 52 | + |
| 53 | +# def get_hashtag_id(hashtag: str) -> str: |
| 54 | +# url = "https://tiktok-scraper7.p.rapidapi.com/challenge/info" |
| 55 | +# querystring = { |
| 56 | +# "challenge_name": hashtag, |
| 57 | +# } |
| 58 | +# response = get(url, headers=headers, params=querystring) |
| 59 | +# return response.json()["data"]["id"] |
| 60 | + |
| 61 | + |
| 62 | +# def get_videos_for_hashtag_id( |
| 63 | +# hashtag_id: str, n: int, cursor: int = 0, verbose: bool = True |
| 64 | +# ) -> list[dict[str, Any]]: |
| 65 | +# url = "https://tiktok-scraper7.p.rapidapi.com/challenge/posts" |
| 66 | +# query = { |
| 67 | +# "challenge_id": hashtag_id, |
| 68 | +# "count": 20, # max: 20 |
| 69 | +# "cursor": cursor, |
| 70 | +# } |
| 71 | +# response = get(url, headers=headers, params=query) |
| 72 | +# data = response.json()["data"] |
| 73 | +# videos, cursor, has_more = data["videos"], data["cursor"], data["hasMore"] |
| 74 | +# if has_more and cursor < n: |
| 75 | +# if verbose: |
| 76 | +# print(cursor) |
| 77 | +# videos.extend( |
| 78 | +# get_videos_for_hashtag_id( |
| 79 | +# hashtag_id=hashtag_id, n=n, cursor=cursor, verbose=verbose |
| 80 | +# ) |
| 81 | +# ) |
| 82 | +# return videos |
| 83 | + |
| 84 | + |
| 85 | +# def get_videos_for_hashtag( |
| 86 | +# hashtag: str, n: int, cursor: int = 0, verbose: bool = True |
| 87 | +# ) -> list[dict[str, Any]]: |
| 88 | +# hashtag_id = get_hashtag_id(hashtag) |
| 89 | +# return get_videos_for_hashtag_id(hashtag_id, n=n, cursor=cursor, verbose=verbose) |
| 90 | + |
| 91 | + |
| 92 | +# def get_video_history_for_hashtag( |
| 93 | +# hashtag: str, n: int, verbose: bool = True |
| 94 | +# ) -> pd.DataFrame: |
| 95 | +# """ |
| 96 | +# Get video history for a hashtag. |
| 97 | +# Returns a time series of views and posts. |
| 98 | +# Views are computed by summing the views of all videos that were posted in a given day -- that is, the views do not correspond to the dates when the videos were actually viewed. It is recommended to just use posts, or comments (see `get_comment_history_for_hashtag`). |
| 99 | +# """ |
| 100 | +# videos = get_videos_for_hashtag(hashtag, n=n, verbose=verbose) |
| 101 | +# df = pd.DataFrame( |
| 102 | +# { |
| 103 | +# "date": [datetime.fromtimestamp(video["create_time"]) for video in videos], |
| 104 | +# "id": [video["video_id"] for video in videos], |
| 105 | +# "title": [video["title"] for video in videos], |
| 106 | +# "views": [video["play_count"] for video in videos], |
| 107 | +# } |
| 108 | +# ) |
| 109 | +# df["date"] = pd.to_datetime(df["date"]) |
| 110 | +# df = df.sort_values("date") |
| 111 | +# ts = ( |
| 112 | +# df.resample("1D", on="date") |
| 113 | +# .agg( |
| 114 | +# { |
| 115 | +# "views": "sum", |
| 116 | +# "id": "count", |
| 117 | +# } |
| 118 | +# ) |
| 119 | +# .rename(columns={"id": "posts"}) |
| 120 | +# ) |
| 121 | +# ts = ts.reindex(pd.date_range(start=ts.index.min(), end=ts.index.max())).fillna(0) |
| 122 | +# return ts |
| 123 | + |
| 124 | + |
| 125 | +# def get_comments_for_video( |
| 126 | +# video_id: str, n: int, cursor: int = 0 |
| 127 | +# ) -> list[dict[str, Any]]: |
| 128 | +# url = "https://tiktok-scraper7.p.rapidapi.com/comment/list" |
| 129 | +# query = { |
| 130 | +# "url": video_id, |
| 131 | +# "count": 50, # max: 50 (?) |
| 132 | +# "cursor": cursor, |
| 133 | +# } |
| 134 | +# response = get(url, headers=headers, params=query) |
| 135 | +# data = response.json()["data"] |
| 136 | +# comments, cursor, has_more = data["comments"], data["cursor"], data["hasMore"] |
| 137 | +# if has_more and cursor < n: |
| 138 | +# comments.extend(get_comments_for_video(video_id, n=n, cursor=cursor)) |
| 139 | +# return comments |
| 140 | + |
| 141 | + |
| 142 | +# def get_comment_history_for_hashtag( |
| 143 | +# hashtag: str, n_posts: int, n_comments: int, verbose: bool = True |
| 144 | +# ) -> pd.DataFrame: |
| 145 | +# videos = get_videos_for_hashtag(hashtag, n=n_posts, verbose=verbose) |
| 146 | +# comments = [ |
| 147 | +# get_comments_for_video(video["video_id"], n=n_comments) |
| 148 | +# for video in tqdm(videos) |
| 149 | +# if video["comment_count"] > 0 |
| 150 | +# ] |
| 151 | +# comments = [comment for video_comments in comments for comment in video_comments] |
| 152 | +# comments_df = pd.DataFrame( |
| 153 | +# { |
| 154 | +# "date": [ |
| 155 | +# datetime.fromtimestamp(comment["create_time"]) for comment in comments |
| 156 | +# ], |
| 157 | +# "text": [comment["text"] for comment in comments], |
| 158 | +# "video_id": [comment["video_id"] for comment in comments], |
| 159 | +# } |
| 160 | +# ) |
| 161 | +# ts = ( |
| 162 | +# comments_df.resample("1W", on="date") |
| 163 | +# .agg( |
| 164 | +# { |
| 165 | +# "text": "count", |
| 166 | +# } |
| 167 | +# ) |
| 168 | +# .rename(columns={"text": "comments"}) |
| 169 | +# ) |
| 170 | +# ts = ts.reindex(pd.date_range(start=ts.index.min(), end=ts.index.max())).fillna(0) |
| 171 | +# return ts |
0 commit comments