Skip to content

Commit 49aec08

Browse files
refactor: disable TikTok
1 parent d77613d commit 49aec08

5 files changed

Lines changed: 284 additions & 294 deletions

File tree

backend-python/src/media_impact_monitor/cron.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def fill_cache():
5151
)
5252
except Exception as e:
5353
errors.append(f"events {data_source}: {e}")
54-
for media_source in ["news_online", "news_print", "social_tiktok", "web_google"]:
54+
for media_source in ["news_online", "news_print", "web_google"]:
5555
for trend_type in ["keywords", "sentiment"]:
5656
for aggregation in ["daily", "weekly"]:
5757
if aggregation == "daily" and media_source == "web_google":
Lines changed: 171 additions & 171 deletions
Original file line numberDiff line numberDiff line change
@@ -1,171 +1,171 @@
1-
import re
2-
from collections import Counter
3-
from datetime import datetime
4-
from typing import Any
5-
6-
import pandas as pd
7-
from tqdm.auto import tqdm
8-
9-
from media_impact_monitor.util.cache import get
10-
from media_impact_monitor.util.env import RAPIDAPI_KEY
11-
12-
# Headers attached to every request this module sends to the TikTok scraper
# API; the key is read from the project's environment settings.
_RAPIDAPI_HOST = "tiktok-scraper7.p.rapidapi.com"
headers = {
    "x-rapidapi-key": RAPIDAPI_KEY,
    "x-rapidapi-host": _RAPIDAPI_HOST,
}
16-
17-
18-
def get_videos_for_keywords(
    keywords: str, n: int, cursor: int = 0
) -> list[dict[str, Any]]:
    """
    Get videos for a given set of keywords.

    Pages through the `feed/search` endpoint, starting at `cursor`, until the
    API reports that there are no more results or the cursor it returns
    reaches `n`. Whole pages are appended, so the result can contain more
    than `n` videos.

    Problem: This returns max ~150 videos, even for very popular keywords.
    Use hashtag query to get more videos.

    Args:
        keywords: Search string passed to the API.
        n: Stop paging once the API's cursor is no longer below this value.
        cursor: Cursor to start paging from.

    Returns:
        The video dicts of all fetched pages, in the order they were returned.
    """
    url = "https://tiktok-scraper7.p.rapidapi.com/feed/search"
    videos: list[dict[str, Any]] = []
    # Loop instead of recursing once per page, so that a large `n` cannot
    # run into the interpreter's recursion limit.
    while True:
        query = {
            "keywords": keywords,
            "region": "us",  # location of the proxy server
            "count": 30,  # max: 30
            "cursor": cursor,
            "publish_time": "0",  # 0 - ALL 1 - Past 24 hours 7 - This week 30 - This month 90 - Last 3 months 180 - Last 6 months
            "sort_type": "0",  # 0 - Relevance 1 - Like count 3 - Date posted
        }
        response = get(url, headers=headers, params=query)
        data = response.json()["data"]
        videos.extend(data["videos"])
        next_cursor = data["cursor"]
        if not data["hasMore"] or next_cursor >= n:
            break
        if next_cursor <= cursor:
            # The cursor did not advance; requesting again would fetch the
            # same page forever, so stop with what we have.
            break
        cursor = next_cursor
    return videos
42-
43-
44-
def get_hashtag_suggestions(keywords: str) -> Counter:
    """
    Count the hashtags found in the titles of videos matching `keywords`.

    Runs a keyword search (with n=100) and tallies every `#word` occurrence
    in the titles of the returned videos; the leading `#` is not part of the
    counted key.
    """
    tag_counts = Counter()
    for video in get_videos_for_keywords(keywords, n=100):
        tag_counts.update(re.findall(r"#(\w+)", video["title"]))
    return tag_counts
51-
52-
53-
def get_hashtag_id(hashtag: str) -> str:
    """
    Return the id reported by the `challenge/info` endpoint for `hashtag`.

    The hashtag is sent as the `challenge_name` parameter and the id is read
    from `data.id` of the JSON response.
    """
    response = get(
        "https://tiktok-scraper7.p.rapidapi.com/challenge/info",
        headers=headers,
        params={"challenge_name": hashtag},
    )
    payload = response.json()
    return payload["data"]["id"]
60-
61-
62-
def get_videos_for_hashtag_id(
    hashtag_id: str, n: int, cursor: int = 0, verbose: bool = True
) -> list[dict[str, Any]]:
    """
    Get the videos posted under a hashtag, identified by its id.

    Pages through the `challenge/posts` endpoint, starting at `cursor`, until
    the API reports that there are no more results or the cursor it returns
    reaches `n`. Whole pages are appended, so the result can contain more
    than `n` videos.

    Args:
        hashtag_id: Id of the hashtag, sent as `challenge_id`.
        n: Stop paging once the API's cursor is no longer below this value.
        cursor: Cursor to start paging from.
        verbose: If true, print the cursor before each follow-up request.

    Returns:
        The video dicts of all fetched pages, in the order they were returned.
    """
    url = "https://tiktok-scraper7.p.rapidapi.com/challenge/posts"
    videos: list[dict[str, Any]] = []
    # Loop instead of recursing once per page: with 20 videos per page a
    # large `n` would otherwise run into the interpreter's recursion limit.
    while True:
        query = {
            "challenge_id": hashtag_id,
            "count": 20,  # max: 20
            "cursor": cursor,
        }
        response = get(url, headers=headers, params=query)
        data = response.json()["data"]
        videos.extend(data["videos"])
        next_cursor = data["cursor"]
        if not data["hasMore"] or next_cursor >= n:
            break
        if verbose:
            print(next_cursor)
        if next_cursor <= cursor:
            # The cursor did not advance; requesting again would fetch the
            # same page forever, so stop with what we have.
            break
        cursor = next_cursor
    return videos
83-
84-
85-
def get_videos_for_hashtag(
    hashtag: str, n: int, cursor: int = 0, verbose: bool = True
) -> list[dict[str, Any]]:
    """
    Get the videos posted under a hashtag, identified by its name.

    Resolves the name to an id with `get_hashtag_id` and forwards all other
    arguments unchanged to `get_videos_for_hashtag_id`.
    """
    return get_videos_for_hashtag_id(
        get_hashtag_id(hashtag), n=n, cursor=cursor, verbose=verbose
    )
90-
91-
92-
def get_video_history_for_hashtag(
    hashtag: str, n: int, verbose: bool = True
) -> pd.DataFrame:
    """
    Get video history for a hashtag.
    Returns a time series of views and posts.
    Views are computed by summing the views of all videos that were posted in a given day -- that is, the views do not correspond to the dates when the videos were actually viewed. It is recommended to just use posts, or comments (see `get_comment_history_for_hashtag`).

    Args:
        hashtag: Hashtag name to look up.
        n: Paging limit forwarded to `get_videos_for_hashtag`.
        verbose: Forwarded to `get_videos_for_hashtag`.

    Returns:
        A DataFrame indexed by day with the columns `views` and `posts`.
        If no videos are found, the frame is empty (same columns).
    """
    videos = get_videos_for_hashtag(hashtag, n=n, verbose=verbose)
    if not videos:
        # Without any rows the index min/max below would be NaT and
        # `pd.date_range` would raise, so return an empty series instead.
        return pd.DataFrame(
            {
                "views": pd.Series(dtype="int64"),
                "posts": pd.Series(dtype="int64"),
            },
            index=pd.DatetimeIndex([]),
        )
    df = pd.DataFrame(
        {
            # NOTE(review): fromtimestamp() converts to the machine's local
            # time zone -- confirm that this is intended.
            "date": [datetime.fromtimestamp(video["create_time"]) for video in videos],
            "id": [video["video_id"] for video in videos],
            "title": [video["title"] for video in videos],
            "views": [video["play_count"] for video in videos],
        }
    )
    df["date"] = pd.to_datetime(df["date"])
    df = df.sort_values("date")
    # One row per day: total views of the videos posted that day, and the
    # number of videos posted that day.
    ts = (
        df.resample("1D", on="date")
        .agg(
            {
                "views": "sum",
                "id": "count",
            }
        )
        .rename(columns={"id": "posts"})
    )
    ts = ts.reindex(pd.date_range(start=ts.index.min(), end=ts.index.max())).fillna(0)
    return ts
123-
124-
125-
def get_comments_for_video(
    video_id: str, n: int, cursor: int = 0
) -> list[dict[str, Any]]:
    """
    Get the comments of a video.

    Pages through the `comment/list` endpoint, starting at `cursor`, until
    the API reports that there are no more results or the cursor it returns
    reaches `n`. Whole pages are appended, so the result can contain more
    than `n` comments.

    Args:
        video_id: Id of the video; it is sent as the `url` parameter.
        n: Stop paging once the API's cursor is no longer below this value.
        cursor: Cursor to start paging from.

    Returns:
        The comment dicts of all fetched pages, in the order they were
        returned.
    """
    url = "https://tiktok-scraper7.p.rapidapi.com/comment/list"
    comments: list[dict[str, Any]] = []
    # Loop instead of recursing once per page, so that a large `n` cannot
    # run into the interpreter's recursion limit.
    while True:
        query = {
            "url": video_id,
            "count": 50,  # max: 50 (?)
            "cursor": cursor,
        }
        response = get(url, headers=headers, params=query)
        data = response.json()["data"]
        comments.extend(data["comments"])
        next_cursor = data["cursor"]
        if not data["hasMore"] or next_cursor >= n:
            break
        if next_cursor <= cursor:
            # The cursor did not advance; requesting again would fetch the
            # same page forever, so stop with what we have.
            break
        cursor = next_cursor
    return comments
140-
141-
142-
def get_comment_history_for_hashtag(
    hashtag: str, n_posts: int, n_comments: int, verbose: bool = True
) -> pd.DataFrame:
    """
    Get comment history for a hashtag.

    Fetches the videos posted under the hashtag, then the comments of every
    video whose `comment_count` is positive, and counts the comments per
    week of their creation time.

    Args:
        hashtag: Hashtag name to look up.
        n_posts: Paging limit forwarded to `get_videos_for_hashtag`.
        n_comments: Paging limit forwarded to `get_comments_for_video`.
        verbose: Forwarded to `get_videos_for_hashtag`.

    Returns:
        A DataFrame with a datetime index and a single `comments` column.
        If no comments are found, the frame is empty (same column).
    """
    videos = get_videos_for_hashtag(hashtag, n=n_posts, verbose=verbose)
    comments = [
        get_comments_for_video(video["video_id"], n=n_comments)
        for video in tqdm(videos)
        if video["comment_count"] > 0
    ]
    # Flatten the per-video lists into one list of comments.
    comments = [comment for video_comments in comments for comment in video_comments]
    if not comments:
        # Without any rows the resampling / date range below cannot be
        # computed and would raise, so return an empty series instead.
        return pd.DataFrame(
            {"comments": pd.Series(dtype="int64")},
            index=pd.DatetimeIndex([]),
        )
    comments_df = pd.DataFrame(
        {
            # NOTE(review): fromtimestamp() converts to the machine's local
            # time zone -- confirm that this is intended.
            "date": [
                datetime.fromtimestamp(comment["create_time"]) for comment in comments
            ],
            "text": [comment["text"] for comment in comments],
            "video_id": [comment["video_id"] for comment in comments],
        }
    )
    ts = (
        comments_df.resample("1W", on="date")
        .agg(
            {
                "text": "count",
            }
        )
        .rename(columns={"text": "comments"})
    )
    # NOTE(review): the counts are weekly, but `pd.date_range` defaults to a
    # daily frequency, so this reindex inserts zero-filled rows for the days
    # between the weekly labels. Confirm whether a weekly range was intended.
    ts = ts.reindex(pd.date_range(start=ts.index.min(), end=ts.index.max())).fillna(0)
    return ts
1+
# import re
2+
# from collections import Counter
3+
# from datetime import datetime
4+
# from typing import Any
5+
6+
# import pandas as pd
7+
# from tqdm.auto import tqdm
8+
9+
# from media_impact_monitor.util.cache import get
10+
# from media_impact_monitor.util.env import RAPIDAPI_KEY
11+
12+
# headers = {
13+
# "x-rapidapi-key": RAPIDAPI_KEY,
14+
# "x-rapidapi-host": "tiktok-scraper7.p.rapidapi.com",
15+
# }
16+
17+
18+
# def get_videos_for_keywords(
19+
# keywords: str, n: int, cursor: int = 0
20+
# ) -> list[dict[str, Any]]:
21+
# """
22+
# Get videos for a given set of keywords.
23+
# Problem: This returns max ~150 videos, even for very popular keywords.
24+
# Use hashtag query to get more videos.
25+
# """
26+
# url = "https://tiktok-scraper7.p.rapidapi.com/feed/search"
27+
# query = {
28+
# "keywords": keywords,
29+
# "region": "us", # location of the proxy server
30+
# "count": 30, # max: 30
31+
# "cursor": cursor,
32+
# "publish_time": "0", # 0 - ALL 1 - Past 24 hours 7 - This week 30 - This month 90 - Last 3 months 180 - Last 6 months
33+
# "sort_type": "0", # 0 - Relevance 1 - Like count 3 - Date posted
34+
# }
35+
# response = get(url, headers=headers, params=query)
36+
# # print(response.json())
37+
# data = response.json()["data"]
38+
# videos, cursor, has_more = data["videos"], data["cursor"], data["hasMore"]
39+
# if has_more and cursor < n:
40+
# videos.extend(get_videos_for_keywords(keywords=keywords, n=n, cursor=cursor))
41+
# return videos
42+
43+
44+
# def get_hashtag_suggestions(keywords: str) -> Counter:
45+
# videos = get_videos_for_keywords(keywords, n=100)
46+
# titles = [video["title"] for video in videos]
47+
# hashtags = [re.findall(r"#(\w+)", title) for title in titles]
48+
# hashtags = [item for sublist in hashtags for item in sublist]
49+
# hashtag_counts = Counter(hashtags)
50+
# return hashtag_counts
51+
52+
53+
# def get_hashtag_id(hashtag: str) -> str:
54+
# url = "https://tiktok-scraper7.p.rapidapi.com/challenge/info"
55+
# querystring = {
56+
# "challenge_name": hashtag,
57+
# }
58+
# response = get(url, headers=headers, params=querystring)
59+
# return response.json()["data"]["id"]
60+
61+
62+
# def get_videos_for_hashtag_id(
63+
# hashtag_id: str, n: int, cursor: int = 0, verbose: bool = True
64+
# ) -> list[dict[str, Any]]:
65+
# url = "https://tiktok-scraper7.p.rapidapi.com/challenge/posts"
66+
# query = {
67+
# "challenge_id": hashtag_id,
68+
# "count": 20, # max: 20
69+
# "cursor": cursor,
70+
# }
71+
# response = get(url, headers=headers, params=query)
72+
# data = response.json()["data"]
73+
# videos, cursor, has_more = data["videos"], data["cursor"], data["hasMore"]
74+
# if has_more and cursor < n:
75+
# if verbose:
76+
# print(cursor)
77+
# videos.extend(
78+
# get_videos_for_hashtag_id(
79+
# hashtag_id=hashtag_id, n=n, cursor=cursor, verbose=verbose
80+
# )
81+
# )
82+
# return videos
83+
84+
85+
# def get_videos_for_hashtag(
86+
# hashtag: str, n: int, cursor: int = 0, verbose: bool = True
87+
# ) -> list[dict[str, Any]]:
88+
# hashtag_id = get_hashtag_id(hashtag)
89+
# return get_videos_for_hashtag_id(hashtag_id, n=n, cursor=cursor, verbose=verbose)
90+
91+
92+
# def get_video_history_for_hashtag(
93+
# hashtag: str, n: int, verbose: bool = True
94+
# ) -> pd.DataFrame:
95+
# """
96+
# Get video history for a hashtag.
97+
# Returns a time series of views and posts.
98+
# Views are computed by summing the views of all videos that were posted in a given day -- that is, the views do not correspond to the dates when the videos were actually viewed. It is recommended to just use posts, or comments (see `get_comment_history_for_hashtag`).
99+
# """
100+
# videos = get_videos_for_hashtag(hashtag, n=n, verbose=verbose)
101+
# df = pd.DataFrame(
102+
# {
103+
# "date": [datetime.fromtimestamp(video["create_time"]) for video in videos],
104+
# "id": [video["video_id"] for video in videos],
105+
# "title": [video["title"] for video in videos],
106+
# "views": [video["play_count"] for video in videos],
107+
# }
108+
# )
109+
# df["date"] = pd.to_datetime(df["date"])
110+
# df = df.sort_values("date")
111+
# ts = (
112+
# df.resample("1D", on="date")
113+
# .agg(
114+
# {
115+
# "views": "sum",
116+
# "id": "count",
117+
# }
118+
# )
119+
# .rename(columns={"id": "posts"})
120+
# )
121+
# ts = ts.reindex(pd.date_range(start=ts.index.min(), end=ts.index.max())).fillna(0)
122+
# return ts
123+
124+
125+
# def get_comments_for_video(
126+
# video_id: str, n: int, cursor: int = 0
127+
# ) -> list[dict[str, Any]]:
128+
# url = "https://tiktok-scraper7.p.rapidapi.com/comment/list"
129+
# query = {
130+
# "url": video_id,
131+
# "count": 50, # max: 50 (?)
132+
# "cursor": cursor,
133+
# }
134+
# response = get(url, headers=headers, params=query)
135+
# data = response.json()["data"]
136+
# comments, cursor, has_more = data["comments"], data["cursor"], data["hasMore"]
137+
# if has_more and cursor < n:
138+
# comments.extend(get_comments_for_video(video_id, n=n, cursor=cursor))
139+
# return comments
140+
141+
142+
# def get_comment_history_for_hashtag(
143+
# hashtag: str, n_posts: int, n_comments: int, verbose: bool = True
144+
# ) -> pd.DataFrame:
145+
# videos = get_videos_for_hashtag(hashtag, n=n_posts, verbose=verbose)
146+
# comments = [
147+
# get_comments_for_video(video["video_id"], n=n_comments)
148+
# for video in tqdm(videos)
149+
# if video["comment_count"] > 0
150+
# ]
151+
# comments = [comment for video_comments in comments for comment in video_comments]
152+
# comments_df = pd.DataFrame(
153+
# {
154+
# "date": [
155+
# datetime.fromtimestamp(comment["create_time"]) for comment in comments
156+
# ],
157+
# "text": [comment["text"] for comment in comments],
158+
# "video_id": [comment["video_id"] for comment in comments],
159+
# }
160+
# )
161+
# ts = (
162+
# comments_df.resample("1W", on="date")
163+
# .agg(
164+
# {
165+
# "text": "count",
166+
# }
167+
# )
168+
# .rename(columns={"text": "comments"})
169+
# )
170+
# ts = ts.reindex(pd.date_range(start=ts.index.min(), end=ts.index.max())).fillna(0)
171+
# return ts

0 commit comments

Comments
 (0)