# hisum/convert.py at main · kalgod/hisum · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import argparse
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader
import time
import h5py
from tqdm import tqdm
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
import pickle

def get_video_title(video_id, timeout=10):
    """Fetch the title of a YouTube video by scraping its watch page.

    Args:
        video_id: YouTube video identifier, inserted into the watch URL.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The text of the page's <title> element with ' - YouTube' removed
        and surrounding whitespace stripped, or None when the request
        fails, the response status is not 200, or the page has no
        <title> element.
    """
    # Build the watch-page URL for this video.
    url = f'https://www.youtube.com/watch?v={video_id}'
    print(url)

    # Network failures follow the same "report and return None" path as a
    # bad status code, so one unreachable video does not abort a long scrape.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error fetching the video: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching the video: {response.status_code}")
        return None

    # Parse the HTML and locate the <title> element.
    soup = BeautifulSoup(response.text, 'html.parser')
    title_tag = soup.find('title')
    if title_tag is None:
        # find() returns None when the element is absent; avoid AttributeError.
        print(f"Error fetching the video: no <title> element in {url}")
        return None

    # Drop the site suffix that YouTube appends to every page title.
    return title_tag.text.replace(' - YouTube', '').strip()

def get_meta(meta_path="dataset/metadata.csv", checkpoint_path="tmp.pkl"):
    """Look up the YouTube title for every video listed in the metadata CSV.

    Args:
        meta_path: CSV file read with pandas; each row must provide the
            ``video_id`` and ``youtube_id`` columns.
        checkpoint_path: Pickle file that is rewritten with the mapping
            collected so far after every video.

    Returns:
        A dict mapping each ``video_id`` to the value returned by
        ``get_video_title`` for that row's ``youtube_id`` (may be None).
    """
    df = pd.read_csv(meta_path)
    names = {}
    for row in tqdm(df.itertuples(), total=len(df)):
        names[row.video_id] = get_video_title(row.youtube_id)
        # Checkpoint after every lookup so an interrupted run keeps the
        # titles that were already fetched.
        with open(checkpoint_path, 'wb') as pickle_file:
            pickle.dump(names, pickle_file)
    return names

# Timestamp at which the next generated vector starts; None until the
# first call to generate_time_vector().
last_time = None


def generate_time_vector(n):
    """Return ``n`` consecutive one-second timestamps as a NumPy string array.

    The first call starts at the current local time. Every later call
    resumes where the previous one stopped, so vectors produced by
    successive calls form one continuous sequence. The resume point is
    kept in the module-level ``last_time`` variable.

    Args:
        n: Number of timestamps to generate.

    Returns:
        ``np.array`` of strings formatted as ``YYYY/MM/DD HH:MM:SS``.
    """
    global last_time

    # Resume from the stored position, or from "now" on the first call.
    start = datetime.now() if last_time is None else last_time

    stamps = [
        (start + timedelta(seconds=offset)).strftime('%Y/%m/%d %H:%M:%S')
        for offset in range(n)
    ]

    # Remember the instant right after the last emitted timestamp.
    last_time = start + timedelta(seconds=len(stamps))

    return np.array(stamps)

def contains_nan_or_none(row):
    """Report whether the second element of ``row`` is not a usable number.

    Args:
        row: Indexable record whose element at index 1 is checked.

    Returns:
        True when ``row[1]`` is None, cannot be converted to ``float``,
        or converts to NaN; False otherwise.
    """
    try:
        value = float(row[1])
    except (ValueError, TypeError):
        # None and non-numeric text cannot be converted.
        return True
    # float('nan') and float(np.nan) convert without raising, so NaN has
    # to be tested explicitly.
    return bool(np.isnan(value))

def main(args):
    """Export the ``gtscore`` series of the first videos in the HDF5 file to CSV.

    Reads ``./dataset/mr_hisum.h5``, takes at most ``100000 // 250`` videos
    in key order, skips every video whose ``gtscore`` contains NaN, and
    writes one CSV row per score to ``./dataset/gtscore_10w.csv`` with the
    columns ``date``, ``OT``, ``video_id`` and ``video_name``.

    Args:
        args: Parsed command-line arguments. Not read by this function.

    Raises:
        ValueError: If no video with a NaN-free ``gtscore`` is found.
    """
    h5_path = "./dataset/mr_hisum.h5"
    csv_file_path = './dataset/gtscore_10w.csv'
    # Upper bound on the number of videos read from the file.
    # NOTE(review): presumably ~250 scores per video to reach ~100000 rows;
    # confirm against the dataset.
    max_videos = 100000 // 250
    # Titles are not fetched here (see get_meta); a placeholder is stored.
    name = "None"

    res = []
    # The context manager closes the HDF5 file even if a read fails.
    with h5py.File(h5_path, 'r') as video_data:
        for video_id in list(video_data.keys())[:max_videos]:
            gtscore = np.array(video_data[video_id]['gtscore'])
            # Skip bad videos before allocating timestamps so the date
            # column stays contiguous across the exported rows.
            if np.isnan(gtscore).any():
                continue
            date = generate_time_vector(gtscore.shape[0])
            # Repeat the id and name so every score row carries them.
            id_expanded = np.full(gtscore.shape, video_id)
            name_expanded = np.full(gtscore.shape, name)
            # Stack as rows, then transpose to one row per score.
            res.append(np.vstack((date, gtscore, id_expanded, name_expanded)).T)

    if not res:
        raise ValueError(f"no video with a NaN-free gtscore found in {h5_path}")

    res = np.vstack(res)
    print(res.shape)
    df = pd.DataFrame(res, columns=['date', 'OT', 'video_id', 'video_name'])
    # Write the table without the pandas index column.
    df.to_csv(csv_file_path, index=False)
    print(f"CSV 文件已保存到 {csv_file_path}")


if __name__ == "__main__":
    # Command-line options are parsed and handed to main() as a namespace.
    parser = argparse.ArgumentParser(description='Simple train function with args')
    parser.add_argument('-in_len', type=int, default=5, help='input length')
    parser.add_argument('-out_len', type=int, default=5, help='output length')
    parser.add_argument('-fea_len', type=int, default=1, help='feature length')
    parser.add_argument('-batch', type=int, default=32, help='batch size')
    parser.add_argument('-epochs', type=int, default=10, help='number of epochs')
    parser.add_argument('-lr', type=float, default=5e-4, help='learning rate')
    parser.add_argument('-device', type=str, default="cuda", help='device name, e.g. "cuda" or "cpu"')

    args = parser.parse_args()
    main(args)