diff --git a/SEBICops/Clusters.ipynb b/SEBICops/Clusters.ipynb new file mode 100644 index 0000000..1712c1e --- /dev/null +++ b/SEBICops/Clusters.ipynb @@ -0,0 +1,2225 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 38, + "id": "23b33f4a-706a-495d-a50c-cb91102e09e2", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.cluster import KMeans, DBSCAN\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import silhouette_score" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "23c03090-ccbc-4a74-99d4-8aa14fb97f7b", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "bf88dab2-1ac9-4b1b-86ec-5c3883e617db", + "metadata": {}, + "outputs": [], + "source": [ + "embedding_data = pd.read_pickle('./df_video_info.pkl')\n", + "feature_data = pd.read_feather('./sponsorTimes_filtered_videoID_level.feather')" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "34f5c609-23db-4d2b-9b24-987d8aa00102", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
videoIDfe_segment_duration_0fe_segment_duration_1fe_segment_duration_2views_0views_1views_2votes_0votes_1votes_2videoDurationfe_ratio_segment_duration_with_total_video_duration_0fe_ratio_segment_duration_with_total_video_duration_1fe_ratio_segment_duration_with_total_video_duration_2fe_more_than_35_perc_sponsorship_content
0--kZomtrtIQ5.7560.0000.000000000465.0460.0123770.0000000.0000000
1-2MyBawvlts5.9870.0000.000100000636.0640.0094130.0000000.0000000
2-3AfFa0rV6Q24.20222.07041.2690000002477.0000.0097710.0089100.0166610
3-3Q-k4WQTDI7.3840.0000.000000000242.0240.0305090.0000000.0000000
4-6WBAaHqT8g5.3340.0000.000000000242.0240.0220390.0000000.0000000
5-6Z1ISvWq1U12.2786.8110.000650000779.0780.0157600.0087420.0000000
6-8EtKbQg4Fk19.9380.0000.000000000384.0380.0519170.0000000.0000000
7-8fTqHiG_hc2.3200.0000.00000000032.0310.0724300.0000000.0000000
8-8hK4Y5h_HQ20.2020.0000.000000000321.0320.0629280.0000000.0000000
9-B-f2-b6Nr011.1670.0000.000000000473.0470.0236070.0000000.0000000
\n", + "
" + ], + "text/plain": [ + " videoID fe_segment_duration_0 fe_segment_duration_1 \\\n", + "0 --kZomtrtIQ 5.756 0.000 \n", + "1 -2MyBawvlts 5.987 0.000 \n", + "2 -3AfFa0rV6Q 24.202 22.070 \n", + "3 -3Q-k4WQTDI 7.384 0.000 \n", + "4 -6WBAaHqT8g 5.334 0.000 \n", + "5 -6Z1ISvWq1U 12.278 6.811 \n", + "6 -8EtKbQg4Fk 19.938 0.000 \n", + "7 -8fTqHiG_hc 2.320 0.000 \n", + "8 -8hK4Y5h_HQ 20.202 0.000 \n", + "9 -B-f2-b6Nr0 11.167 0.000 \n", + "\n", + " fe_segment_duration_2 views_0 views_1 views_2 votes_0 votes_1 \\\n", + "0 0.000 0 0 0 0 0 \n", + "1 0.000 1 0 0 0 0 \n", + "2 41.269 0 0 0 0 0 \n", + "3 0.000 0 0 0 0 0 \n", + "4 0.000 0 0 0 0 0 \n", + "5 0.000 6 5 0 0 0 \n", + "6 0.000 0 0 0 0 0 \n", + "7 0.000 0 0 0 0 0 \n", + "8 0.000 0 0 0 0 0 \n", + "9 0.000 0 0 0 0 0 \n", + "\n", + " votes_2 videoDuration \\\n", + "0 0 465.046 \n", + "1 0 636.064 \n", + "2 0 2477.000 \n", + "3 0 242.024 \n", + "4 0 242.024 \n", + "5 0 779.078 \n", + "6 0 384.038 \n", + "7 0 32.031 \n", + "8 0 321.032 \n", + "9 0 473.047 \n", + "\n", + " fe_ratio_segment_duration_with_total_video_duration_0 \\\n", + "0 0.012377 \n", + "1 0.009413 \n", + "2 0.009771 \n", + "3 0.030509 \n", + "4 0.022039 \n", + "5 0.015760 \n", + "6 0.051917 \n", + "7 0.072430 \n", + "8 0.062928 \n", + "9 0.023607 \n", + "\n", + " fe_ratio_segment_duration_with_total_video_duration_1 \\\n", + "0 0.000000 \n", + "1 0.000000 \n", + "2 0.008910 \n", + "3 0.000000 \n", + "4 0.000000 \n", + "5 0.008742 \n", + "6 0.000000 \n", + "7 0.000000 \n", + "8 0.000000 \n", + "9 0.000000 \n", + "\n", + " fe_ratio_segment_duration_with_total_video_duration_2 \\\n", + "0 0.000000 \n", + "1 0.000000 \n", + "2 0.016661 \n", + "3 0.000000 \n", + "4 0.000000 \n", + "5 0.000000 \n", + "6 0.000000 \n", + "7 0.000000 \n", + "8 0.000000 \n", + "9 0.000000 \n", + "\n", + " fe_more_than_35_perc_sponsorship_content \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "5 0 \n", + "6 0 \n", + "7 0 \n", + "8 0 \n", + "9 0 " + ] + }, + 
"execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feature_data.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "935e7a0b-513a-46fe-85d5-9b5399dda065", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
videoIDchannelIDtitlepublishednameembedding
02USGSuPe8SQUCsNxHPbaCWL1tKw2hxGQD6gStock Market के Basics, Risks और Returns - Sha...1.568938e+09AssetYogi[0.026454309, -0.009652234, -0.0023297989, 0.0...
13PGL5pkqwVMUCsNxHPbaCWL1tKw2hxGQD6gMutual Funds Investment Reality for Beginners ...1.578614e+09AssetYogi[0.082667224, -0.10355977, -0.069015354, 0.027...
2um42od-JW-MUCqW8jxh4tH1Z1sWPbkGWL4gMasterclass on communicating effectively | How...1.634342e+09AkshatZayn[0.08096711, -0.042743586, 0.10679469, -0.0306...
31SyX64uQTgMUCqW8jxh4tH1Z1sWPbkGWL4gOne stock that I REGRET not buying.1.634515e+09AkshatZayn[-0.0051836153, -0.015406725, -0.011564667, 0....
4OLj9sgfQPhAUCUMccND2H_CVS0dMZKCPCXAThe Economics Of Drugs 😵 | The Untold Truth Of...1.634429e+09namaskarprasad[0.020754067, 0.0007845427, -0.03164388, -0.03...
54dZPzbdScy8UCRzYN32xtBf3Yxsx5BvJWJwI INVESTED IN A FIXED DEPOSIT!1.634602e+09warikoo[0.05586297, 0.027604798, 0.017507043, 0.07515...
6CZxxlUf55FoUCwAdQUuPT6laN-AQR17fe1gFinancial Plan for your 1st income1.634170e+09pranjalkamra[-0.013564097, 0.025042024, -0.016356125, 0.03...
7GUlg076O89cUCe3qdG0A_gr-sEdat5y2twQCrores of Rupees Lost? | Explained by CA Racha...1.634602e+09CARachanaRanade[-0.0028898187, 0.027031748, -0.04745055, 0.07...
8R8ZWwkbS6WwUCe3qdG0A_gr-sEdat5y2twQShould I own a Credit Card? Explained by CA Ra...1.600733e+09CARachanaRanade[-0.0059682536, 0.024231985, -0.034983452, 0.0...
9dqXy59i5Zd4UCe3qdG0A_gr-sEdat5y2twQHow do I earn more interest on my money? | CA ...1.634688e+09CARachanaRanade[0.044167295, 0.003461873, -0.05475514, 0.0171...
\n", + "
" + ], + "text/plain": [ + " videoID channelID \\\n", + "0 2USGSuPe8SQ UCsNxHPbaCWL1tKw2hxGQD6g \n", + "1 3PGL5pkqwVM UCsNxHPbaCWL1tKw2hxGQD6g \n", + "2 um42od-JW-M UCqW8jxh4tH1Z1sWPbkGWL4g \n", + "3 1SyX64uQTgM UCqW8jxh4tH1Z1sWPbkGWL4g \n", + "4 OLj9sgfQPhA UCUMccND2H_CVS0dMZKCPCXA \n", + "5 4dZPzbdScy8 UCRzYN32xtBf3Yxsx5BvJWJw \n", + "6 CZxxlUf55Fo UCwAdQUuPT6laN-AQR17fe1g \n", + "7 GUlg076O89c UCe3qdG0A_gr-sEdat5y2twQ \n", + "8 R8ZWwkbS6Ww UCe3qdG0A_gr-sEdat5y2twQ \n", + "9 dqXy59i5Zd4 UCe3qdG0A_gr-sEdat5y2twQ \n", + "\n", + " title published \\\n", + "0 Stock Market के Basics, Risks और Returns - Sha... 1.568938e+09 \n", + "1 Mutual Funds Investment Reality for Beginners ... 1.578614e+09 \n", + "2 Masterclass on communicating effectively | How... 1.634342e+09 \n", + "3 One stock that I REGRET not buying. 1.634515e+09 \n", + "4 The Economics Of Drugs 😵 | The Untold Truth Of... 1.634429e+09 \n", + "5 I INVESTED IN A FIXED DEPOSIT! 1.634602e+09 \n", + "6 Financial Plan for your 1st income 1.634170e+09 \n", + "7 Crores of Rupees Lost? | Explained by CA Racha... 1.634602e+09 \n", + "8 Should I own a Credit Card? Explained by CA Ra... 1.600733e+09 \n", + "9 How do I earn more interest on my money? | CA ... 1.634688e+09 \n", + "\n", + " name embedding \n", + "0 AssetYogi [0.026454309, -0.009652234, -0.0023297989, 0.0... \n", + "1 AssetYogi [0.082667224, -0.10355977, -0.069015354, 0.027... \n", + "2 AkshatZayn [0.08096711, -0.042743586, 0.10679469, -0.0306... \n", + "3 AkshatZayn [-0.0051836153, -0.015406725, -0.011564667, 0.... \n", + "4 namaskarprasad [0.020754067, 0.0007845427, -0.03164388, -0.03... \n", + "5 warikoo [0.05586297, 0.027604798, 0.017507043, 0.07515... \n", + "6 pranjalkamra [-0.013564097, 0.025042024, -0.016356125, 0.03... \n", + "7 CARachanaRanade [-0.0028898187, 0.027031748, -0.04745055, 0.07... \n", + "8 CARachanaRanade [-0.0059682536, 0.024231985, -0.034983452, 0.0... \n", + "9 CARachanaRanade [0.044167295, 0.003461873, -0.05475514, 0.0171... 
" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "embedding_data.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "43c178dc-ac59-4cc7-b474-9fe985fc508c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['videoID', 'channelID', 'title', 'published', 'name', 'embedding',\n", + " 'fe_segment_duration_0', 'fe_segment_duration_1',\n", + " 'fe_segment_duration_2', 'views_0', 'views_1', 'views_2', 'votes_0',\n", + " 'votes_1', 'votes_2', 'videoDuration',\n", + " 'fe_ratio_segment_duration_with_total_video_duration_0',\n", + " 'fe_ratio_segment_duration_with_total_video_duration_1',\n", + " 'fe_ratio_segment_duration_with_total_video_duration_2',\n", + " 'fe_more_than_35_perc_sponsorship_content'],\n", + " dtype='object')" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = pd.merge(embedding_data,feature_data, on = 'videoID')\n", + "data.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "719bbe0d-61df-4ab9-8d8d-a751a7919b75", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(Index(['videoID', 'fe_segment_duration_0', 'fe_segment_duration_1',\n", + " 'fe_segment_duration_2', 'views_0', 'views_1', 'views_2', 'votes_0',\n", + " 'votes_1', 'votes_2', 'videoDuration',\n", + " 'fe_ratio_segment_duration_with_total_video_duration_0',\n", + " 'fe_ratio_segment_duration_with_total_video_duration_1',\n", + " 'fe_ratio_segment_duration_with_total_video_duration_2',\n", + " 'fe_more_than_35_perc_sponsorship_content'],\n", + " dtype='object'),\n", + " (2363, 15))" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feature_data.columns, feature_data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "ded996c0-3c6d-40da-8d43-d1d52d8bee53", + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
videoIDfe_segment_duration_0fe_segment_duration_1fe_segment_duration_2views_0views_1views_2votes_0votes_1votes_2videoDurationfe_ratio_segment_duration_with_total_video_duration_0fe_ratio_segment_duration_with_total_video_duration_1fe_ratio_segment_duration_with_total_video_duration_2fe_more_than_35_perc_sponsorship_content
0--kZomtrtIQ5.7560.0000.000000000465.0460.0123770.0000000.0000000
1-2MyBawvlts5.9870.0000.000100000636.0640.0094130.0000000.0000000
2-3AfFa0rV6Q24.20222.07041.2690000002477.0000.0097710.0089100.0166610
3-3Q-k4WQTDI7.3840.0000.000000000242.0240.0305090.0000000.0000000
4-6WBAaHqT8g5.3340.0000.000000000242.0240.0220390.0000000.0000000
................................................
2358zwLHJlNMlf40.00024.5990.00007600001092.0000.0000000.0225270.0000000
2359zxKURXHy6es46.0730.0000.000000000733.0730.0628490.0000000.0000000
2360zxdOcHOrAdE0.00020.9800.00000000061.0600.0000000.3435960.0000000
2361zxi7Rm-lWTg43.5200.00043.33736011100555.0000.0784140.0000000.0780850
2362zzJ0iHJ7_nk24.830117.1000.0008731500-10605.2010.0410280.1934890.0000000
\n", + "

2363 rows × 15 columns

\n", + "
" + ], + "text/plain": [ + " videoID fe_segment_duration_0 fe_segment_duration_1 \\\n", + "0 --kZomtrtIQ 5.756 0.000 \n", + "1 -2MyBawvlts 5.987 0.000 \n", + "2 -3AfFa0rV6Q 24.202 22.070 \n", + "3 -3Q-k4WQTDI 7.384 0.000 \n", + "4 -6WBAaHqT8g 5.334 0.000 \n", + "... ... ... ... \n", + "2358 zwLHJlNMlf4 0.000 24.599 \n", + "2359 zxKURXHy6es 46.073 0.000 \n", + "2360 zxdOcHOrAdE 0.000 20.980 \n", + "2361 zxi7Rm-lWTg 43.520 0.000 \n", + "2362 zzJ0iHJ7_nk 24.830 117.100 \n", + "\n", + " fe_segment_duration_2 views_0 views_1 views_2 votes_0 votes_1 \\\n", + "0 0.000 0 0 0 0 0 \n", + "1 0.000 1 0 0 0 0 \n", + "2 41.269 0 0 0 0 0 \n", + "3 0.000 0 0 0 0 0 \n", + "4 0.000 0 0 0 0 0 \n", + "... ... ... ... ... ... ... \n", + "2358 0.000 0 76 0 0 0 \n", + "2359 0.000 0 0 0 0 0 \n", + "2360 0.000 0 0 0 0 0 \n", + "2361 43.337 36 0 11 1 0 \n", + "2362 0.000 87 315 0 0 -1 \n", + "\n", + " votes_2 videoDuration \\\n", + "0 0 465.046 \n", + "1 0 636.064 \n", + "2 0 2477.000 \n", + "3 0 242.024 \n", + "4 0 242.024 \n", + "... ... ... \n", + "2358 0 1092.000 \n", + "2359 0 733.073 \n", + "2360 0 61.060 \n", + "2361 0 555.000 \n", + "2362 0 605.201 \n", + "\n", + " fe_ratio_segment_duration_with_total_video_duration_0 \\\n", + "0 0.012377 \n", + "1 0.009413 \n", + "2 0.009771 \n", + "3 0.030509 \n", + "4 0.022039 \n", + "... ... \n", + "2358 0.000000 \n", + "2359 0.062849 \n", + "2360 0.000000 \n", + "2361 0.078414 \n", + "2362 0.041028 \n", + "\n", + " fe_ratio_segment_duration_with_total_video_duration_1 \\\n", + "0 0.000000 \n", + "1 0.000000 \n", + "2 0.008910 \n", + "3 0.000000 \n", + "4 0.000000 \n", + "... ... \n", + "2358 0.022527 \n", + "2359 0.000000 \n", + "2360 0.343596 \n", + "2361 0.000000 \n", + "2362 0.193489 \n", + "\n", + " fe_ratio_segment_duration_with_total_video_duration_2 \\\n", + "0 0.000000 \n", + "1 0.000000 \n", + "2 0.016661 \n", + "3 0.000000 \n", + "4 0.000000 \n", + "... ... 
\n", + "2358 0.000000 \n", + "2359 0.000000 \n", + "2360 0.000000 \n", + "2361 0.078085 \n", + "2362 0.000000 \n", + "\n", + " fe_more_than_35_perc_sponsorship_content \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "... ... \n", + "2358 0 \n", + "2359 0 \n", + "2360 0 \n", + "2361 0 \n", + "2362 0 \n", + "\n", + "[2363 rows x 15 columns]" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feature_data\n", + "\n", + "# videoID, views_segment1, views_segment2, views_segment3, views_segment4, channelID, title, embedding, fe_segment1_duration,\n", + "# fe_segment2_duration, fe_segment3_duration, fe_segment4_duration, fe_ratio_segment1_duration_with_total_video_duration, \n", + "# fe_ratio_segment2_duration_with_total_video_duration, fe_ratio_segment3_duration_with_total_video_duration,\n", + "# fe_ratio_segment4_duration_with_total_video_duration, fe_more_than_30_perc_sponsorship_content, fe_count_sponsorships\n", + "\n", + "# videoID, ratios, embedding, " + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "7af97689-b3bd-44d2-99ae-41cc13d108e2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
143
videoID2USGSuPe8SQ
fe_segment_duration_068.680586
fe_segment_duration_10.0
fe_segment_duration_20.0
views_0151
views_10
views_20
votes_00
votes_10
votes_20
videoDuration1800.081
fe_ratio_segment_duration_with_total_video_duration_00.038154
fe_ratio_segment_duration_with_total_video_duration_10.0
fe_ratio_segment_duration_with_total_video_duration_20.0
fe_more_than_35_perc_sponsorship_content0
\n", + "
" + ], + "text/plain": [ + " 143\n", + "videoID 2USGSuPe8SQ\n", + "fe_segment_duration_0 68.680586\n", + "fe_segment_duration_1 0.0\n", + "fe_segment_duration_2 0.0\n", + "views_0 151\n", + "views_1 0\n", + "views_2 0\n", + "votes_0 0\n", + "votes_1 0\n", + "votes_2 0\n", + "videoDuration 1800.081\n", + "fe_ratio_segment_duration_with_total_video_dura... 0.038154\n", + "fe_ratio_segment_duration_with_total_video_dura... 0.0\n", + "fe_ratio_segment_duration_with_total_video_dura... 0.0\n", + "fe_more_than_35_perc_sponsorship_content 0" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feature_data[feature_data.videoID == '2USGSuPe8SQ'].head(1).T" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "2e1b8335-3c3e-4c09-8eed-f9846d4909a7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2419, 384)" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "embeddings_expanded = pd.DataFrame(np.array(embedding_data['embedding'].tolist())).astype('float64')\n", + "embeddings_expanded.columns = embeddings_expanded.columns.astype(str)\n", + "embeddings_expanded.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "7d86128a-23be-4ac0-b892-d08bc9637b55", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2363, 399)" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = feature_data.join(embeddings_expanded)\n", + "# print(np.isinf(data.fe_more_than_35_perc_sponsorship_content).values.sum()) # get number of inifinity values\n", + "data.replace([np.inf, -np.inf], np.nan, inplace = True)\n", + "data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "51f52fa4-b2fa-4c36-b6f4-66b9f6309586", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2363, 398)" + ] + }, + 
"execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# to discuss: Embedding 384th value is coming out as NaN in a lot of cases - resolved.\n", + "X_pd = data.drop(['videoID'], axis = 1) \n", + "# print(X_pd.isnull().sum().sum())\n", + "X_pd.dropna(axis = 0, inplace = True)\n", + "X_pd.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "8ec9daef-2aae-489b-8764-5565d08fe18d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
fe_segment_duration_0fe_segment_duration_1fe_segment_duration_2views_0views_1views_2votes_0votes_1votes_2videoDuration...374375376377378379380381382383
count2363.0000002363.0000002363.0000002363.0000002363.0000002363.0000002363.0000002363.0000002363.0000002363.000000...2363.0000002363.0000002363.0000002363.0000002363.0000002363.0000002363.0000002363.0000002363.0000002363.000000
mean20.13751127.2364758.20398614.88870153.5209485.7621670.019467-0.0025390.003386951.087211...0.0087160.000507-0.0261510.025948-0.035363-0.0064740.054771-0.058691-0.0361020.027509
std42.11080169.46002327.73202975.225058318.61072144.9393150.2109460.4422560.126794797.083870...0.0406850.0480890.0461120.0403460.0498480.0428340.0496730.0558220.0502800.042570
min0.0000000.0000000.0000000.0000000.0000000.000000-2.000000-15.000000-2.00000012.011000...-0.118398-0.160237-0.160714-0.109925-0.185898-0.139312-0.107283-0.209248-0.175477-0.110835
25%0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000447.362500...-0.019848-0.031422-0.058615-0.000969-0.067551-0.0366730.021013-0.100077-0.072208-0.001920
50%10.0740000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000832.000000...0.0082960.000408-0.0275430.025049-0.035819-0.0085520.055948-0.059038-0.0363810.028096
75%24.06360026.5235000.0000003.0000004.5000000.0000000.0000000.0000000.0000001185.750500...0.0353740.0345740.0048830.052254-0.0019740.0222510.088571-0.019058-0.0026500.056992
max1114.3890001101.220000499.8160002358.00000013089.0000001431.0000004.0000003.0000002.0000007811.000000...0.1488130.1456080.1246630.1719510.1357640.1309910.2158360.1597570.1389070.170253
\n", + "

8 rows × 398 columns

\n", + "
" + ], + "text/plain": [ + " fe_segment_duration_0 fe_segment_duration_1 fe_segment_duration_2 \\\n", + "count 2363.000000 2363.000000 2363.000000 \n", + "mean 20.137511 27.236475 8.203986 \n", + "std 42.110801 69.460023 27.732029 \n", + "min 0.000000 0.000000 0.000000 \n", + "25% 0.000000 0.000000 0.000000 \n", + "50% 10.074000 0.000000 0.000000 \n", + "75% 24.063600 26.523500 0.000000 \n", + "max 1114.389000 1101.220000 499.816000 \n", + "\n", + " views_0 views_1 views_2 votes_0 votes_1 \\\n", + "count 2363.000000 2363.000000 2363.000000 2363.000000 2363.000000 \n", + "mean 14.888701 53.520948 5.762167 0.019467 -0.002539 \n", + "std 75.225058 318.610721 44.939315 0.210946 0.442256 \n", + "min 0.000000 0.000000 0.000000 -2.000000 -15.000000 \n", + "25% 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "50% 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "75% 3.000000 4.500000 0.000000 0.000000 0.000000 \n", + "max 2358.000000 13089.000000 1431.000000 4.000000 3.000000 \n", + "\n", + " votes_2 videoDuration ... 374 375 376 \\\n", + "count 2363.000000 2363.000000 ... 2363.000000 2363.000000 2363.000000 \n", + "mean 0.003386 951.087211 ... 0.008716 0.000507 -0.026151 \n", + "std 0.126794 797.083870 ... 0.040685 0.048089 0.046112 \n", + "min -2.000000 12.011000 ... -0.118398 -0.160237 -0.160714 \n", + "25% 0.000000 447.362500 ... -0.019848 -0.031422 -0.058615 \n", + "50% 0.000000 832.000000 ... 0.008296 0.000408 -0.027543 \n", + "75% 0.000000 1185.750500 ... 0.035374 0.034574 0.004883 \n", + "max 2.000000 7811.000000 ... 
0.148813 0.145608 0.124663 \n", + "\n", + " 377 378 379 380 381 \\\n", + "count 2363.000000 2363.000000 2363.000000 2363.000000 2363.000000 \n", + "mean 0.025948 -0.035363 -0.006474 0.054771 -0.058691 \n", + "std 0.040346 0.049848 0.042834 0.049673 0.055822 \n", + "min -0.109925 -0.185898 -0.139312 -0.107283 -0.209248 \n", + "25% -0.000969 -0.067551 -0.036673 0.021013 -0.100077 \n", + "50% 0.025049 -0.035819 -0.008552 0.055948 -0.059038 \n", + "75% 0.052254 -0.001974 0.022251 0.088571 -0.019058 \n", + "max 0.171951 0.135764 0.130991 0.215836 0.159757 \n", + "\n", + " 382 383 \n", + "count 2363.000000 2363.000000 \n", + "mean -0.036102 0.027509 \n", + "std 0.050280 0.042570 \n", + "min -0.175477 -0.110835 \n", + "25% -0.072208 -0.001920 \n", + "50% -0.036381 0.028096 \n", + "75% -0.002650 0.056992 \n", + "max 0.138907 0.170253 \n", + "\n", + "[8 rows x 398 columns]" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_pd.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "45e84379-7683-49e2-b335-fe461d353121", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "fe_more_than_35_perc_sponsorship_content 0.135209\n", + "votes_0 0.210946\n", + "votes_1 0.442256\n", + "fe_segment_duration_2 27.732029\n", + "fe_segment_duration_0 42.110801\n", + "views_2 44.939315\n", + "fe_segment_duration_1 69.460023\n", + "views_0 75.225058\n", + "views_1 318.610721\n", + "videoDuration 797.083870\n", + "dtype: float64" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_pd.std().sort_values()[-10:]" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "963f67b9-ccbd-40a9-b4d9-ee8a142c7460", + "metadata": {}, + "outputs": [], + "source": [ + "norm_features = ['videoDuration', 'views_1', 'views_0', 'fe_segment_duration_1', 'views_2', 'fe_segment_duration_0', 'fe_segment_duration_2' ] # pick the 
non-embedding columns with high standard deviation (sigma)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "9413a22a-0660-4c35-9391-2e4f5f8d244f", + "metadata": {}, + "outputs": [], + "source": [ + "# min-max normalisation of norm_features (rescales each column to [0, 1])\n", + "for col in norm_features:\n", + " X_pd[col] = (X_pd[col] - X_pd[col].min()) / (X_pd[col].max() - X_pd[col].min() )\n", + "\n", + "# standardization - also yields similar results\n", + "# for col in norm_features:\n", + "# X_pd[col] = (X_pd[col] - X_pd[col].mean()) / X_pd[col].std()\n", + "# X_pd['videoDuration'] = (X_pd['videoDuration'] - X_pd['videoDuration'].mean()) / X_pd['videoDuration'].std()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "d464a5b0-d9b8-402a-82de-6422907f2bf5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
fe_segment_duration_0fe_segment_duration_1fe_segment_duration_2views_0views_1views_2votes_0votes_1votes_2videoDuration...374375376377378379380381382383
count2363.0000002363.0000002363.0000002363.0000002363.0000002363.0000002363.0000002363.0000002363.0000002363.000000...2363.0000002363.0000002363.0000002363.0000002363.0000002363.0000002363.0000002363.0000002363.0000002363.000000
mean0.0180700.0247330.0164140.0063140.0040890.0040270.019467-0.0025390.0033860.120410...0.0087160.000507-0.0261510.025948-0.035363-0.0064740.054771-0.058691-0.0361020.027509
std0.0377880.0630760.0554840.0319020.0243420.0314040.2109460.4422560.1267940.102203...0.0406850.0480890.0461120.0403460.0498480.0428340.0496730.0558220.0502800.042570
min0.0000000.0000000.0000000.0000000.0000000.000000-2.000000-15.000000-2.0000000.000000...-0.118398-0.160237-0.160714-0.109925-0.185898-0.139312-0.107283-0.209248-0.175477-0.110835
25%0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.055822...-0.019848-0.031422-0.058615-0.000969-0.067551-0.0366730.021013-0.100077-0.072208-0.001920
50%0.0090400.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.105140...0.0082960.000408-0.0275430.025049-0.035819-0.0085520.055948-0.059038-0.0363810.028096
75%0.0215940.0240860.0000000.0012720.0003440.0000000.0000000.0000000.0000000.150499...0.0353740.0345740.0048830.052254-0.0019740.0222510.088571-0.019058-0.0026500.056992
max1.0000001.0000001.0000001.0000001.0000001.0000004.0000003.0000002.0000001.000000...0.1488130.1456080.1246630.1719510.1357640.1309910.2158360.1597570.1389070.170253
\n", + "

8 rows × 398 columns

\n", + "
" + ], + "text/plain": [ + " fe_segment_duration_0 fe_segment_duration_1 fe_segment_duration_2 \\\n", + "count 2363.000000 2363.000000 2363.000000 \n", + "mean 0.018070 0.024733 0.016414 \n", + "std 0.037788 0.063076 0.055484 \n", + "min 0.000000 0.000000 0.000000 \n", + "25% 0.000000 0.000000 0.000000 \n", + "50% 0.009040 0.000000 0.000000 \n", + "75% 0.021594 0.024086 0.000000 \n", + "max 1.000000 1.000000 1.000000 \n", + "\n", + " views_0 views_1 views_2 votes_0 votes_1 \\\n", + "count 2363.000000 2363.000000 2363.000000 2363.000000 2363.000000 \n", + "mean 0.006314 0.004089 0.004027 0.019467 -0.002539 \n", + "std 0.031902 0.024342 0.031404 0.210946 0.442256 \n", + "min 0.000000 0.000000 0.000000 -2.000000 -15.000000 \n", + "25% 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "50% 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "75% 0.001272 0.000344 0.000000 0.000000 0.000000 \n", + "max 1.000000 1.000000 1.000000 4.000000 3.000000 \n", + "\n", + " votes_2 videoDuration ... 374 375 376 \\\n", + "count 2363.000000 2363.000000 ... 2363.000000 2363.000000 2363.000000 \n", + "mean 0.003386 0.120410 ... 0.008716 0.000507 -0.026151 \n", + "std 0.126794 0.102203 ... 0.040685 0.048089 0.046112 \n", + "min -2.000000 0.000000 ... -0.118398 -0.160237 -0.160714 \n", + "25% 0.000000 0.055822 ... -0.019848 -0.031422 -0.058615 \n", + "50% 0.000000 0.105140 ... 0.008296 0.000408 -0.027543 \n", + "75% 0.000000 0.150499 ... 0.035374 0.034574 0.004883 \n", + "max 2.000000 1.000000 ... 
0.148813 0.145608 0.124663 \n", + "\n", + " 377 378 379 380 381 \\\n", + "count 2363.000000 2363.000000 2363.000000 2363.000000 2363.000000 \n", + "mean 0.025948 -0.035363 -0.006474 0.054771 -0.058691 \n", + "std 0.040346 0.049848 0.042834 0.049673 0.055822 \n", + "min -0.109925 -0.185898 -0.139312 -0.107283 -0.209248 \n", + "25% -0.000969 -0.067551 -0.036673 0.021013 -0.100077 \n", + "50% 0.025049 -0.035819 -0.008552 0.055948 -0.059038 \n", + "75% 0.052254 -0.001974 0.022251 0.088571 -0.019058 \n", + "max 0.171951 0.135764 0.130991 0.215836 0.159757 \n", + "\n", + " 382 383 \n", + "count 2363.000000 2363.000000 \n", + "mean -0.036102 0.027509 \n", + "std 0.050280 0.042570 \n", + "min -0.175477 -0.110835 \n", + "25% -0.072208 -0.001920 \n", + "50% -0.036381 0.028096 \n", + "75% -0.002650 0.056992 \n", + "max 0.138907 0.170253 \n", + "\n", + "[8 rows x 398 columns]" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_pd.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "5b0b7b71-56aa-4568-a876-c20fc706bbf3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((2339, 398), (24, 398))" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Xtrain, Xtest = train_test_split(X, train_size = 0.8)\n", + "Xtrain_pd, Xtest_pd = train_test_split(X_pd, train_size = 0.99) \n", + "Xtrain_pd.shape, Xtest_pd.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "dd28f5e3-1fe8-4a67-a730-f50e7be1d80d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/apramod/.pyenv/versions/3.8.12/envs/sebicorps/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. 
Set the value of `n_init` explicitly to suppress the warning\n", + " super()._check_params_vs_input(X, default_n_init=10)\n" + ] + }, + { + "data": { + "text/html": [ + "
KMeans(n_clusters=3)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "KMeans(n_clusters=3)" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#KMeans\n", + "kmeans_algo = KMeans(n_clusters = 3)\n", + "kmeans_algo.fit(Xtrain_pd)" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "b3e51858-3e8c-4396-8399-4bbbb67a7147", + "metadata": {}, + "outputs": [], + "source": [ + "labels_knn = kmeans_algo.labels_.reshape(len(Xtrain_pd),1)\n", + "\n", + "Xtrain_labelled_knn_pd = pd.DataFrame(Xtrain_pd)\n", + "Xtrain_labelled_knn_pd.insert(Xtrain_labelled_knn_pd.shape[1], 'label', labels_knn)" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "b3326426-4f3d-4efc-8b35-cdda106fce48", + "metadata": {}, + "outputs": [], + "source": [ + "data_with_knnlabels = data.join(Xtrain_labelled_knn_pd['label'], how = 'left')" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "9f2f09e4-07c4-4183-affb-2053efb2d703", + "metadata": {}, + "outputs": [], + "source": [ + "analyse_groups = data_with_knnlabels.groupby('label').agg({'videoID':'count'})" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "e967bb15-c0ad-442a-99f0-b13a6312be97", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['videoID', 'fe_segment_duration_0', 'fe_segment_duration_1',\n", + " 'fe_segment_duration_2', 'views_0', 'views_1', 'views_2', 'votes_0',\n", + " 'votes_1', 'votes_2',\n", + " ...\n", + " '375', '376', '377', '378', '379', '380', '381', '382', '383', 'label'],\n", + " dtype='object', length=400)" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_with_knnlabels.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "db2cb875-f07a-4b9b-b743-5dee3f9e267c", + "metadata": {}, + "outputs": [], + "source": [ + "data_with_knnlabels = pd.merge(embedding_data, data_with_knnlabels , on = \"videoID\", how = \"inner\")" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 176, + "id": "45e6a7ca-99a2-4915-9b93-631dc6ade3cb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlevideoIDnamelabel
0Managing Your Money Like the Rich! | Ankur War...soMHbU3Uqjkwarikoo0.0
1Want to RETIRE before 30? WATCH THIS!1gWwlIgua2gwarikoo0.0
2Investing in 2 GREAT stocks!kriJ0QSOf1kAkshatZayn0.0
3Earn ₹50,000+ Per Month without Investment | H...KQv1_cuIQIkPushkarRajThakurOfficial0.0
4How to Make Money Online with #Upstox? | Earn ...1vyL5fDHJG4PushkarRajThakurOfficial0.0
\n", + "
" + ], + "text/plain": [ + " title videoID \\\n", + "0 Managing Your Money Like the Rich! | Ankur War... soMHbU3Uqjk \n", + "1 Want to RETIRE before 30? WATCH THIS! 1gWwlIgua2g \n", + "2 Investing in 2 GREAT stocks! kriJ0QSOf1k \n", + "3 Earn ₹50,000+ Per Month without Investment | H... KQv1_cuIQIk \n", + "4 How to Make Money Online with #Upstox? | Earn ... 1vyL5fDHJG4 \n", + "\n", + " name label \n", + "0 warikoo 0.0 \n", + "1 warikoo 0.0 \n", + "2 AkshatZayn 0.0 \n", + "3 PushkarRajThakurOfficial 0.0 \n", + "4 PushkarRajThakurOfficial 0.0 " + ] + }, + "execution_count": 176, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "analyse = data_with_knnlabels[['title', 'videoID', 'name', 'label']]\n", + "analyse[ analyse.label == 0.0 ].head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "cebe7938", + "metadata": {}, + "outputs": [], + "source": [ + "data_with_knnlabels.to_feather(\"./data_with_knnlabels.feather\")" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "2b63ae6b-5fc9-4480-b9b5-29f0910fe0a6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/apramod/.pyenv/versions/3.8.12/envs/sebicorps/lib/python3.8/site-packages/sklearn/utils/validation.py:1184: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " y = column_or_1d(y, warn=True)\n" + ] + }, + { + "data": { + "text/plain": [ + "0.7328993721500769" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "silhouette_score(Xtrain_pd, labels_knn)" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "f7f6d99d-03bc-4523-aaab-6824a7a8fb0d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
DBSCAN(eps=7, min_samples=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "DBSCAN(eps=7, min_samples=1)" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# DBSCAN\n", + "\n", + "dbscan_algo = DBSCAN(min_samples = 1, eps=7) #euclidean distance used here\n", + "dbscan_algo.fit(Xtrain_pd)" + ] + }, + { + "cell_type": "code", + "execution_count": 699, + "id": "59573070-3300-4031-84e6-138b5cbcf8c5", + "metadata": {}, + "outputs": [], + "source": [ + "labels_dbscan = dbscan_algo.labels_.reshape((len(Xtrain_pd),1))\n", + "\n", + "Xtrain_labelled_dbscan_pd = pd.DataFrame(Xtrain_pd)\n", + "Xtrain_labelled_dbscan_pd.insert(Xtrain_labelled_dbscan_pd.shape[1], 'label', labels_dbscan)" + ] + }, + { + "cell_type": "code", + "execution_count": 700, + "id": "32e8f87d-3be3-4dbb-bf5f-6664a86c9e54", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3" + ] + }, + "execution_count": 700, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(np.unique(dbscan_algo.labels_))" + ] + }, + { + "cell_type": "code", + "execution_count": 701, + "id": "11bb094f-e9bc-462a-ad7f-d389fab3572f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/apramod/.pyenv/versions/3.8.12/envs/sebicorps/lib/python3.8/site-packages/sklearn/utils/validation.py:1184: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", + " y = column_or_1d(y, warn=True)\n" + ] + }, + { + "data": { + "text/plain": [ + "0.8154968900301026" + ] + }, + "execution_count": 701, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "silhouette_score(Xtrain_pd, labels_dbscan)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "33b30399-1fd1-47a8-bd10-eb2e0b0fe21d", + "metadata": {}, + "outputs": [], + "source": [ + "analyse_groups_dbscan = Xtrain_labelled_dbscan_pd.groupby('label').agg({'videoID':'count'})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c4d7b52-3c25-4d77-bf1a-8c23fe4bba41", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/SEBICops/Embeddings.ipynb b/SEBICops/Embeddings.ipynb new file mode 100644 index 0000000..303a0e2 --- /dev/null +++ b/SEBICops/Embeddings.ipynb @@ -0,0 +1,5667 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "IQL_LAYI4zLE" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import pickle" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "-dKxu5KP5H4p" + }, + "outputs": [], + "source": [ + "df_video_info = pd.read_csv('videoInfo.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "jMkYEC3k5I5s", + "outputId": "72c395fa-747d-476c-e595-582d197adafc" + }, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 5813965 entries, 0 to 5813964\n", + "Data columns (total 4 columns):\n", + " # Column Dtype \n", + "--- ------ ----- \n", + " 0 videoID object \n", + " 1 channelID object \n", + " 2 title object \n", + " 3 published float64\n", + "dtypes: float64(1), object(3)\n", + "memory usage: 177.4+ MB\n" + ] + } + ], + "source": [ + "df_video_info.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "P1-m9_iS5J0K" + }, + "outputs": [], + "source": [ + "CHANNEL_MAP = {\n", + " 'UCqW8jxh4tH1Z1sWPbkGWL4g': 'AkshatZayn',\n", + " 'UCsNxHPbaCWL1tKw2hxGQD6g': 'AssetYogi',\n", + " 'UCe3qdG0A_gr-sEdat5y2twQ': 'CARachanaRanade',\n", + " 'UCtnItzU7q_bA1eoEBjqcVrw': 'shankarnath',\n", + " 'UCS2NdYUmv_PUyyKeDAo5zYA': 'PRSundar64',\n", + " 'UCPohbSYq4IXhv0yxiy-sT4g': 'InvestYadnya',\n", + " 'UCUWbS9qoyLKTdOlvgK1GBlA': 'ParimalAde',\n", + " 'UCUMccND2H_CVS0dMZKCPCXA': 'namaskarprasad',\n", + " 'UCwAdQUuPT6laN-AQR17fe1g': 'pranjalkamra',\n", + " 'UCEAAzv2OBqxsSczKJ2QZyGQ': 'PushkarRajThakurOfficial',\n", + " 'UCRzYN32xtBf3Yxsx5BvJWJw': 'warikoo'\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "IGg9bmxO5Kru", + "outputId": "5ae0cbd9-b83f-4591-e808-9b3d144f9178" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 2419 entries, 0 to 2418\n", + "Data columns (total 4 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 videoID 2419 non-null object \n", + " 1 channelID 2419 non-null object \n", + " 2 title 2419 non-null object \n", + " 3 published 2419 non-null float64\n", + "dtypes: float64(1), object(3)\n", + "memory usage: 75.7+ KB\n" + ] + } + ], + "source": [ + "df_video_info = df_video_info[df_video_info['channelID'].isin(CHANNEL_MAP.keys())\n", + " 
].reset_index(drop=True)\n", + "df_video_info.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "MB6eWOho5MLs", + "outputId": "c23a5d00-d159-4203-fdee-6f1a3da8ccfc" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
videoIDchannelIDtitlepublishedname
02USGSuPe8SQUCsNxHPbaCWL1tKw2hxGQD6gStock Market के Basics, Risks और Returns - Sha...1.568938e+09AssetYogi
13PGL5pkqwVMUCsNxHPbaCWL1tKw2hxGQD6gMutual Funds Investment Reality for Beginners ...1.578614e+09AssetYogi
2um42od-JW-MUCqW8jxh4tH1Z1sWPbkGWL4gMasterclass on communicating effectively | How...1.634342e+09AkshatZayn
31SyX64uQTgMUCqW8jxh4tH1Z1sWPbkGWL4gOne stock that I REGRET not buying.1.634515e+09AkshatZayn
4OLj9sgfQPhAUCUMccND2H_CVS0dMZKCPCXAThe Economics Of Drugs 😵 | The Untold Truth Of...1.634429e+09namaskarprasad
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ], + "text/plain": [ + " videoID channelID \\\n", + "0 2USGSuPe8SQ UCsNxHPbaCWL1tKw2hxGQD6g \n", + "1 3PGL5pkqwVM UCsNxHPbaCWL1tKw2hxGQD6g \n", + "2 um42od-JW-M UCqW8jxh4tH1Z1sWPbkGWL4g \n", + "3 1SyX64uQTgM UCqW8jxh4tH1Z1sWPbkGWL4g \n", + "4 OLj9sgfQPhA UCUMccND2H_CVS0dMZKCPCXA \n", + "\n", + " title published \\\n", + "0 Stock Market के Basics, Risks और Returns - Sha... 1.568938e+09 \n", + "1 Mutual Funds Investment Reality for Beginners ... 1.578614e+09 \n", + "2 Masterclass on communicating effectively | How... 1.634342e+09 \n", + "3 One stock that I REGRET not buying. 1.634515e+09 \n", + "4 The Economics Of Drugs 😵 | The Untold Truth Of... 1.634429e+09 \n", + "\n", + " name \n", + "0 AssetYogi \n", + "1 AssetYogi \n", + "2 AkshatZayn \n", + "3 AkshatZayn \n", + "4 namaskarprasad " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_video_info['name'] = df_video_info['channelID'].map(lambda x:CHANNEL_MAP[x])\n", + "df_video_info.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "45nFroyH5M8W", + "outputId": "69d5cdb7-91d5-42b8-d566-b4f0091f8bda" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting sentence-transformers\n", + " Downloading sentence-transformers-2.2.2.tar.gz (85 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.0/86.0 kB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)\n", + " Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.4/7.4 MB\u001b[0m \u001b[31m68.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (4.66.1)\n", + "Requirement already satisfied: torch>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (2.0.1+cu118)\n", + "Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.15.2+cu118)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.23.5)\n", + "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.2.2)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.10.1)\n", + "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (3.8.1)\n", + "Collecting sentencepiece (from sentence-transformers)\n", + " Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m55.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting huggingface-hub>=0.4.0 (from sentence-transformers)\n", + " Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m31.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: filelock in 
/usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (3.12.2)\n", + "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (2023.6.0)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (2.31.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (6.0.1)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (4.7.1)\n", + "Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (23.1)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (1.12)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (3.1)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (3.1.2)\n", + "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (2.0.0)\n", + "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.6.0->sentence-transformers) (3.27.2)\n", + "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.6.0->sentence-transformers) (16.0.6)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.6.0->sentence-transformers) (2023.6.3)\n", + "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from 
transformers<5.0.0,>=4.6.0->sentence-transformers)\n", + " Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m120.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting safetensors>=0.3.1 (from transformers<5.0.0,>=4.6.0->sentence-transformers)\n", + " Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m76.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->sentence-transformers) (8.1.6)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk->sentence-transformers) (1.3.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->sentence-transformers) (3.2.0)\n", + "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.10/dist-packages (from torchvision->sentence-transformers) (9.4.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.6.0->sentence-transformers) (2.1.3)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.4.0->sentence-transformers) (3.2.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.4.0->sentence-transformers) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.4.0->sentence-transformers) (2.0.4)\n", + "Requirement already satisfied: 
certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.4.0->sentence-transformers) (2023.7.22)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.6.0->sentence-transformers) (1.3.0)\n", + "Building wheels for collected packages: sentence-transformers\n", + " Building wheel for sentence-transformers (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125924 sha256=4b2efbd35f217fea22c4941978bf6c329ba51dd08afbaf02e54689ac70a983f9\n", + " Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f\n", + "Successfully built sentence-transformers\n", + "Installing collected packages: tokenizers, sentencepiece, safetensors, huggingface-hub, transformers, sentence-transformers\n", + "Successfully installed huggingface-hub-0.16.4 safetensors-0.3.2 sentence-transformers-2.2.2 sentencepiece-0.1.99 tokenizers-0.13.3 transformers-4.31.0\n" + ] + } + ], + "source": [ + "!pip install -U sentence-transformers" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 465, + "referenced_widgets": [ + "b775a1e98aff42d28cb623ecdb0545f6", + "24f9a4564ef748069c0251a87938f037", + "9c767767aa044cb4817cc363d1a6f469", + "622778078ab04960b2e9f836f9a319a8", + "7d5a0e39df7f4ba0b5b63fcc1e2b6f97", + "50574669cea0457a83bef8c893ac6ab5", + "150f48c401624fa4b7129f6ce5f3a4b5", + "8ec5114318ac45fd9ff8b0d426455e3e", + "606de47a36d44141ae24d37f15247924", + "3f6664fb069343fcaff00c32a64ae0ea", + "ef28f967dece446da99a1c66f38ba8d2", + "575cd2a00990417a9dbec85c66471b71", + "86a36781585e44cca6985a2f12e5f07e", + "3066696d713f4a1ca887fd23e58b9a36", + "d115144b4475495fac7971b43d89631a", + "031c6220db884046ac0a79d230a555ed", + "457269e55c6c41708ba4dc44c0901b10", + 
"34827149a20644e888f8e7b082fa12d3", + "812fc0d346614df99bc04a9e7c59a1e4", + "bbf4670ab0d0463fb7099df5fd6b5acf", + "e4709cab923a4856b8577eae51d359fc", + "04fd8851ea1f4cc0a258fb8f229a7172", + "5a93c3c77ee542cda7982f6b3f98c8b7", + "5cb0e29d6a7148eeb17e081a496c9e96", + "ef2163c90b5c447d969b4a85fbec747c", + "0728c62a5c634687956e349f1ea92d3f", + "7e3e472894e342cba862169aa47036de", + "c4df155cf7cf40fd88ff1be327eb4816", + "1178ed4504684bbcb741e6884fae4437", + "97dc6641cbc3439f9b46bd4d2d205239", + "f17311a54b2546458fe818ef4527013a", + "fe6584ded4d94ec3939f52ad2e17fc23", + "e611b6b3132f407d95e097d943a6a4eb", + "dc4547aba9aa4c5988959b686f8be4c7", + "d520632a422f474bba7be092bd79ee71", + "d5c235f8d030444199158f57ae93d0d3", + "d37b72db553044a98a4382a4695cb95a", + "1ef2bbf916404cdfa592acb98af03e7e", + "ea94b13ae0134eadb39d3c21a7d6e9e4", + "0755daf8f23a4495bcf65031b3cde706", + "3c30de5a42c54877b79375910122dd40", + "62aa747a52ff4dca965d377edca927d0", + "c6262517670b46909ce998ba161a2654", + "a85d81148b3b42a29a8fb3fb8ef45ddc", + "6f8d4baa7ba34d22b75a676989aa7d97", + "f984e37c027547479a9a1b4063aabc93", + "f95c137e491541e2b592fa2882ae6040", + "515b8358381e4276b8c27e6441978062", + "c7c15bdfab4f42068ec633d888e9a87f", + "42ada80daaac4545aeef8679a481a3fa", + "0926c5f7691d43df9a8c5ff0f9cc6ddc", + "5d060286f2c34b2b9c01b812fcdc2b7f", + "6402d0d4c6f44528be6d7d4737a9d3d3", + "09bbbd9c728c434e90155eab7278a2a7", + "f117846c7c724eb7bd4b1b1fcbe2cfff", + "bf31de785680449396b32d96635fe552", + "c7e3367af6714772b22617a52d2b8a66", + "c6f2a546b33746bc9508ca32c7298b60", + "d194b97640e6473283c51b02f263eef8", + "af301487ffb64a89bed3982f3c737121", + "563d0d49df1c41918e1286e465bcb191", + "6f1f5ab408ce4d25ac033a8dcb309324", + "19d59926565b4994be8036040387518e", + "193385c37754463493e28dd445c8fb1c", + "455730a2be904d2eb0849b045625954d", + "278da682f514465c937f150aa17ef437", + "a62a1a031bf04e5ba9a4d7644b8ea538", + "8f3eb2274db74f768064cd3a55742963", + "8769c709aff04075b04bf725921a1d99", + 
"c815e28ab26f47d0b29d628c8aca90d4", + "6362be89f5034f98ad9b89cde5783545", + "b1a6c5dc13d644f1a7f4ba11c377afcf", + "4f6c5ae7df1548b1b5c94d58f41137bb", + "fd65338a5f8b42b095de22d7e392a4a0", + "7a7285effd154a70b28e02b904a3c9f1", + "9ddab790729744e491c48f2f3e3fdc53", + "d02a58fa54ac43ad9c7791e953d3d766", + "5785a7670a0647f3aae8b147c6a5926c", + "a18b915710334dd1b33749448b9e97f0", + "10ca323018564a979c99442e38b37b94", + "a6b90677601e450a8c8dc48a59e5ff73", + "6b148c266bdc4127973aca5c5036a0a9", + "e9adbb4796ab4b97a5b29ca8f708054d", + "1a582b6d8c6f41f9a503d6e316918512", + "bb383dc8c20b406abc8c297a3882aa52", + "c637b278166c43e4ab74c1d44b54fa0a", + "9efb4b4e77ee4e87a9b99b5e28d9f9ea", + "e1853ba777e14d37b4be59e5d6882a2f", + "dc6bb5d2486048feb8e669045fd41be0", + "aaf7733529ee4f31bd4a3831f9bca38b", + "cd1b7710a57041bb9109872e9742f8ec", + "dcd50f62a30449de8e129f083ace9fe1", + "87ed6d6474dd4f0b980164b3c5926b8b", + "103201a9c7954cfdb8acd43966d06b9f", + "21f037f68ff34a5892387f19e359cd7a", + "627b1ab39af04dde9bacb034dd30d52e", + "b623c9adf31243e8b96c3f2a3cb7e677", + "2753569937bd47b2af36e59091302566", + "ead3f1b98c6643569f20ffb58182554c", + "c72fec41edbe4b96b67b48a8ec20fed8", + "1ae33be0d10248158028aeacb0a5753a", + "b720713b380b4158bff330788c78668b", + "5e0192d6200746d2b53dfb3643cd4f8c", + "07a058f3b8a14d62955f7a0ece4cd3d6", + "cb6ceb3f5e8d46098d599125660e91a5", + "379176606965420793dabde89c229099", + "3b261c97d1c44ea9acfcff1ae9ac8d03", + "9a9a73e415f342cca82642224a826ea6", + "919a306bad604df4b095c3f3a4d30f74", + "05cb13f6ad9b4aa7a049db323006fcbf", + "60db48c1cb9e49f79d433923299a3776", + "669e79bfec8a43acbe4ecb508eca175a", + "48f411078d6540eea99d9a3c52a58b55", + "b2b4d3441d114a7eb5aded3f6f0c0059", + "ced39db459904387a54fa21ed6eeafa1", + "4ed20a537348499c94b92eb68e278cf8", + "c4dc8672138242e3a4be150bd0b82c8a", + "217ef36817c442b7b768cbc109ece79b", + "c49c4933275c40fd97ca1151f02aa4e6", + "eca55a5158c646079dbe6032c5d23349", + "076f4907a98c40e1be3b0d500ebcbfda", + 
"4b46c9ed3f5f4debafc805c1ac6262dc", + "310dbe8c0f1e42989573788eecb24ad4", + "fcfaa23e576b4c659d857bd77441cdf3", + "925e7a3f45f14bb5831942b4240dd369", + "e63794d99420439a86f7c30495ff4134", + "9b321fba1ed54684839d887087183f27", + "0c664da9fa2d452d86e25bbbecf4a777", + "fb84e130b71f4bcbab57fec94f6b512e", + "4faff4bb2fbe48d8bb27bc3246c0ec71", + "f324b17ad5c44810b0163e4f35ba4642", + "322862eef24c4f8589820fa0da6a26a3", + "7d7dd5997e2b4914834e6ccd8742971e", + "b821d5d445694819b2830510c6d952ac", + "f59944d01ce84939a22fb14e8a1f9fa1", + "aab77f60c480489988fbdf0071c4894d", + "7f33a8cd92744b69b83d60379752547f", + "cf53283d946b4ba490f0fb2377806ee7", + "eeed7f93d57b4596b8571c5648564414", + "0697ca701f864d8094edb2f588987eca", + "03296a99beac49f2964f02ebf3747b93", + "63c373012e3d414cb8d38d43150d3adb", + "1303e75480bd493a8688099b92038592", + "3a4fae9b4fe94f44b878458de79dce39", + "1a4f36bb59524cf7b43826697abdb6d5", + "be61e162da4a4c2aa21765887eb104be", + "be580a4babd248eea7be39f46bb6efb4", + "20882a09dcc14180925a3f3c315903e3", + "2e5ba34d9bbc4c18af924695df5282ff", + "056393d3c15c4c4d868e41c7366b73d9", + "91e60365f3cd4be3b73fcf5b317f8066", + "e584b0edeae64cd4b88b69dc0cbb6e00", + "b8d12ac4e58b4d36b3799b8fe935d9b1", + "def425aa63bd4edfa5147b57f8e1316b" + ] + }, + "id": "sK3roh1Q5Nrj", + "outputId": "59dff244-94ea-4057-d5cd-6c26d5f26ee8" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b775a1e98aff42d28cb623ecdb0545f6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading (…)0c79f/.gitattributes: 0%| | 0.00/737 [00:00 convert the title to embeddings (KNN) etc (glove / fastText embeddings etc/)\n", + "- Clusters will be for (sponsor, self_promo, outro + intro + interaction + filler = interaction)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "212d6d74", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-24T15:02:00.359051Z", + "start_time": "2023-08-24T15:01:43.377130Z" + } + }, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AkshatZayn ['UCqW8jxh4tH1Z1sWPbkGWL4g']\n", + "AssetYogi ['UCsNxHPbaCWL1tKw2hxGQD6g']\n", + "CARachanaRanade ['UCe3qdG0A_gr-sEdat5y2twQ']\n", + "shankarnath ['UCtnItzU7q_bA1eoEBjqcVrw']\n", + "PRSundar64 ['UCS2NdYUmv_PUyyKeDAo5zYA']\n", + "InvestYadnya ['UCPohbSYq4IXhv0yxiy-sT4g']\n", + "ParimalAde ['UCUWbS9qoyLKTdOlvgK1GBlA']\n", + "namaskarprasad ['UCUMccND2H_CVS0dMZKCPCXA']\n", + "pranjalkamra ['UCwAdQUuPT6laN-AQR17fe1g']\n", + "PushkarRajThakurOfficial ['UCEAAzv2OBqxsSczKJ2QZyGQ']\n", + "warikoo ['UCRzYN32xtBf3Yxsx5BvJWJw']\n" + ] + } + ], + "source": [ + "BASE_PATH = Path(\"./tmp\")\n", + "DATA_PATH = BASE_PATH / \"data\"\n", + "\n", + "channel_name_to_id = {}\n", + "REGEX_CHANNEL_ID = re.compile(r\"channel_id=(\\w+-?\\w+)\\\"\\>\")\n", + "\n", + "CHANNELS_TO_FILTER = [\n", + " \"AkshatZayn\",\n", + " \"AssetYogi\",\n", + " \"CARachanaRanade\",\n", + " \"shankarnath\",\n", + " \"PRSundar64\",\n", + " \"InvestYadnya\",\n", + " \"ParimalAde\",\n", + " \"namaskarprasad\",\n", + " \"pranjalkamra\",\n", + " \"PushkarRajThakurOfficial\",\n", + " \"warikoo\"\n", + "]\n", + "\n", + "\n", + "# we need channel_ids to filter the underlying dataset, fetching that information from YT\n", + "for channel_name in CHANNELS_TO_FILTER:\n", + " try:\n", + " page_source = requests.get(f\"https://www.youtube.com/@{channel_name}\").text\n", + " channel_id = REGEX_CHANNEL_ID.findall(page_source)\n", + " print(channel_name, channel_id)\n", + " channel_name_to_id[channel_name] = channel_id[0]\n", + " except Exception as error:\n", + " print(f\"Failed to find the channel_id for the channel_name {channel_name}\")\n", + "\n", + "# We are starting out with 3 clusters\n", + "# 1st and 2nd ones are \"sponsor\" and \"selfpromo\"\n", + "# #rd cluster we are combining rest of the categories mentioned in the below list as one category\n", + "\n", + "\n", + "CHANNEL_IDS_FILTER = list(\n", + " 
channel_name_to_id.values()\n", + ")\n", + "\n", + "VIDEO_CATEGORY_FILTER = [\n", + " #########\n", + " \"outro\",\n", + " \"intro\",\n", + " \"filler\",\n", + " \"interaction\",\n", + " \n", + " #########\n", + " \"sponsor\",\n", + " \n", + " #########,\n", + " \"selfpromo\"\n", + "]\n", + "\n", + "\n", + "VIDEO_CATEGORY_FILTER_MAP = {\n", + " \"outro\": 0,\n", + " \"intro\": 0,\n", + " \"filler\": 0,\n", + " \"interaction\": 0,\n", + " \"sponsor\": 1,\n", + " \"selfpromo\": 2,\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "223607da", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-24T15:02:00.377346Z", + "start_time": "2023-08-24T15:02:00.366982Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'AkshatZayn': 'UCqW8jxh4tH1Z1sWPbkGWL4g',\n", + " 'AssetYogi': 'UCsNxHPbaCWL1tKw2hxGQD6g',\n", + " 'CARachanaRanade': 'UCe3qdG0A_gr-sEdat5y2twQ',\n", + " 'shankarnath': 'UCtnItzU7q_bA1eoEBjqcVrw',\n", + " 'PRSundar64': 'UCS2NdYUmv_PUyyKeDAo5zYA',\n", + " 'InvestYadnya': 'UCPohbSYq4IXhv0yxiy-sT4g',\n", + " 'ParimalAde': 'UCUWbS9qoyLKTdOlvgK1GBlA',\n", + " 'namaskarprasad': 'UCUMccND2H_CVS0dMZKCPCXA',\n", + " 'pranjalkamra': 'UCwAdQUuPT6laN-AQR17fe1g',\n", + " 'PushkarRajThakurOfficial': 'UCEAAzv2OBqxsSczKJ2QZyGQ',\n", + " 'warikoo': 'UCRzYN32xtBf3Yxsx5BvJWJw'}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "channel_name_to_id" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3e21f039", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-24T15:02:00.381159Z", + "start_time": "2023-08-24T15:02:00.378710Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['UCqW8jxh4tH1Z1sWPbkGWL4g',\n", + " 'UCsNxHPbaCWL1tKw2hxGQD6g',\n", + " 'UCe3qdG0A_gr-sEdat5y2twQ',\n", + " 'UCtnItzU7q_bA1eoEBjqcVrw',\n", + " 'UCS2NdYUmv_PUyyKeDAo5zYA',\n", + " 'UCPohbSYq4IXhv0yxiy-sT4g',\n", + " 'UCUWbS9qoyLKTdOlvgK1GBlA',\n", + " 
'UCUMccND2H_CVS0dMZKCPCXA',\n", + " 'UCwAdQUuPT6laN-AQR17fe1g',\n", + " 'UCEAAzv2OBqxsSczKJ2QZyGQ',\n", + " 'UCRzYN32xtBf3Yxsx5BvJWJw']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "CHANNEL_IDS_FILTER" + ] + }, + { + "cell_type": "markdown", + "id": "9948d57b", + "metadata": {}, + "source": [ + "# Reading the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "5d48d43b", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-20T13:45:44.332126Z", + "start_time": "2023-08-20T13:45:44.327497Z" + } + }, + "outputs": [], + "source": [ + "# Below is the dataset schema\n", + "# We're currently using only 3 files in the interest of time\n", + "# sponsorTimes.csv, titles.csv and videoInfo.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6e6b5a11", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-20T13:46:36.845939Z", + "start_time": "2023-08-20T13:46:08.317537Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((5475, 12), (2419, 4), (30306, 8))" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# SAMPLE : Simple flag to control how many rows we're reading\n", + "\n", + "SAMPLE = False\n", + "N_ROWS = 2**10 if SAMPLE else None\n", + "\n", + "FE_PREFIX = \"fe_\"\n", + "\n", + "\n", + "SPONSOR_FILTER_COLUMNS = [\n", + " \"actionType\",\n", + " \"videoID\",\n", + " \"startTime\",\n", + " \"endTime\",\n", + " \"videoDuration\",\n", + " \"votes\",\n", + " \"timeSubmitted\",\n", + " \"views\",\n", + " \"category\",\n", + " \"hidden\",\n", + " \"locked\",\n", + " \"reputation\",\n", + "]\n", + "\n", + "FINAL_COLS = []\n", + "\n", + "\n", + "df_video_info = pd.read_csv(\n", + " DATA_PATH / \"videoInfo.csv\", \n", + " nrows = N_ROWS\n", + ")\n", + "\n", + "# Filtering the data wrt the channel_ids we're interested in only\n", + "df_video_info = 
df_video_info[df_video_info[\"channelID\"].isin(CHANNEL_IDS_FILTER)]\n", + "df_video_info.reset_index(drop=True, inplace=True)\n", + "VIDEO_IDS = set(df_video_info[\"videoID\"].unique())\n", + "\n", + "\n", + "# reading datasets in memory using pandas\n", + "df_sponsor_times = pd.read_csv(\n", + " DATA_PATH / \"sponsorTimes.csv\", \n", + " nrows = N_ROWS, \n", + " usecols=SPONSOR_FILTER_COLUMNS\n", + ")\n", + "df_sponsor_times = df_sponsor_times[df_sponsor_times[\"videoID\"].isin(VIDEO_IDS)]\n", + "df_sponsor_times.sort_values(by=[\"videoID\", \"startTime\"], inplace=True)\n", + "df_sponsor_times.reset_index(drop=True, inplace=True)\n", + "\n", + "# Correcting data wrt videoDuration (swapping with maxEndTime available)\n", + "df_temp = (\n", + " df_sponsor_times[df_sponsor_times[\"videoDuration\"] <= 0][[\"videoID\", \"endTime\"]]\n", + " .groupby(\"videoID\")\n", + " .agg(max)\n", + " .reset_index(drop=False)\n", + ")\n", + "video_id_max_endtime_map = dict(zip(df_temp[\"videoID\"], df_temp[\"endTime\"]))\n", + "\n", + "df_sponsor_times.loc[\n", + " df_sponsor_times.loc[\n", + " (df_sponsor_times[\"videoID\"].isin(video_id_max_endtime_map)) \n", + " & (df_sponsor_times[\"videoDuration\"] <= 0)\n", + " ].index, \"videoDuration\"] = df_sponsor_times[\"videoID\"].map(video_id_max_endtime_map)\n", + "\n", + "df_titles = pd.read_csv(\n", + " DATA_PATH / \"titles.csv\", \n", + " nrows = N_ROWS\n", + ")\n", + "\n", + "del df_temp, video_id_max_endtime_map\n", + "\n", + "gc.collect()\n", + "df_sponsor_times.shape, df_video_info.shape, df_titles.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4a311582", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-20T13:46:41.763481Z", + "start_time": "2023-08-20T13:46:41.744767Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
videoIDtitleoriginaluserIDservicehashedVideoIDtimeSubmittedUUID
0U0wTDK0VOeYHistory of Antarctica's Flag01ff2cdd11ed952d0c13d678413113860ff279a3d90d31c...YouTube93646c719490256e8cb43cfaa41e39534525389b0b28f5...168004407989507f32a99-3e2d-48a3-a247-b2b03f7cd4bc
\n", + "
" + ], + "text/plain": [ + " videoID title original \\\n", + "0 U0wTDK0VOeY History of Antarctica's Flag  0 \n", + "\n", + " userID service \\\n", + "0 1ff2cdd11ed952d0c13d678413113860ff279a3d90d31c... YouTube \n", + "\n", + " hashedVideoID timeSubmitted \\\n", + "0 93646c719490256e8cb43cfaa41e39534525389b0b28f5... 1680044079895 \n", + "\n", + " UUID \n", + "0 07f32a99-3e2d-48a3-a247-b2b03f7cd4bc " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_titles.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4e1401e0", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-20T13:46:42.052146Z", + "start_time": "2023-08-20T13:46:42.036912Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
videoIDchannelIDtitlepublished
02USGSuPe8SQUCsNxHPbaCWL1tKw2hxGQD6gStock Market के Basics, Risks और Returns - Sha...1.568938e+09
\n", + "
" + ], + "text/plain": [ + " videoID channelID \\\n", + "0 2USGSuPe8SQ UCsNxHPbaCWL1tKw2hxGQD6g \n", + "\n", + " title published \n", + "0 Stock Market के Basics, Risks और Returns - Sha... 1.568938e+09 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_video_info.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "908bef9b", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-20T13:46:42.691129Z", + "start_time": "2023-08-20T13:46:42.667274Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
videoIDstartTimeendTimevoteslockedtimeSubmittedviewscategoryactionTypevideoDurationhiddenreputation
0--kZomtrtIQ459.290465.0460016664246333360outroskip465.04600.000000
1-2MyBawvlts26.72632.7130016707240503941fillerskip636.06400.000000
2-3AfFa0rV6Q1405.8901447.1590016593712423410selfpromoskip2477.00001.110345
3-3AfFa0rV6Q2088.8302110.9000016593712423410sponsorskip2477.00001.110345
4-3AfFa0rV6Q2366.2692390.4710016593712423410interactionskip2477.00001.110345
\n", + "
" + ], + "text/plain": [ + " videoID startTime endTime votes locked timeSubmitted views \\\n", + "0 --kZomtrtIQ 459.290 465.046 0 0 1666424633336 0 \n", + "1 -2MyBawvlts 26.726 32.713 0 0 1670724050394 1 \n", + "2 -3AfFa0rV6Q 1405.890 1447.159 0 0 1659371242341 0 \n", + "3 -3AfFa0rV6Q 2088.830 2110.900 0 0 1659371242341 0 \n", + "4 -3AfFa0rV6Q 2366.269 2390.471 0 0 1659371242341 0 \n", + "\n", + " category actionType videoDuration hidden reputation \n", + "0 outro skip 465.046 0 0.000000 \n", + "1 filler skip 636.064 0 0.000000 \n", + "2 selfpromo skip 2477.000 0 1.110345 \n", + "3 sponsor skip 2477.000 0 1.110345 \n", + "4 interaction skip 2477.000 0 1.110345 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_sponsor_times.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "412b6262", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-20T13:46:43.032669Z", + "start_time": "2023-08-20T13:46:43.014322Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
videoIDstartTimeendTimevoteslockedtimeSubmittedviewscategoryactionTypevideoDurationhiddenreputation
2-3AfFa0rV6Q1405.8901447.1590016593712423410selfpromoskip2477.001.110345
3-3AfFa0rV6Q2088.8302110.9000016593712423410sponsorskip2477.001.110345
4-3AfFa0rV6Q2366.2692390.4710016593712423410interactionskip2477.001.110345
\n", + "
" + ], + "text/plain": [ + " videoID startTime endTime votes locked timeSubmitted views \\\n", + "2 -3AfFa0rV6Q 1405.890 1447.159 0 0 1659371242341 0 \n", + "3 -3AfFa0rV6Q 2088.830 2110.900 0 0 1659371242341 0 \n", + "4 -3AfFa0rV6Q 2366.269 2390.471 0 0 1659371242341 0 \n", + "\n", + " category actionType videoDuration hidden reputation \n", + "2 selfpromo skip 2477.0 0 1.110345 \n", + "3 sponsor skip 2477.0 0 1.110345 \n", + "4 interaction skip 2477.0 0 1.110345 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_sponsor_times[df_sponsor_times[\"videoID\"] == \"-3AfFa0rV6Q\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4801b1ad", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-20T13:47:19.849120Z", + "start_time": "2023-08-20T13:47:18.312443Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "# some filtering\n", + "# removing the videos that have full-video label and category we want...\n", + "\n", + "df_sponsor_times = df_sponsor_times[df_sponsor_times[\"actionType\"] != \"full\"]\n", + "df_sponsor_times[\"locked\"] = (df_sponsor_times[\"locked\"] == \"1\") * 1\n", + "df_sponsor_times = df_sponsor_times[df_sponsor_times[\"category\"].isin(VIDEO_CATEGORY_FILTER)]\n", + "\n", + "# Mapping category to numeric values\n", + "print(VIDEO_CATEGORY_FILTER_MAP)\n", + "df_sponsor_times[\"category\"] = df_sponsor_times[\"category\"].map(VIDEO_CATEGORY_FILTER_MAP)\n", + "\n", + "# Time-Delta-features\n", + "df_sponsor_times[f\"{FE_PREFIX}segment_duration\"] = df_sponsor_times[\"endTime\"] - df_sponsor_times[\"startTime\"]\n", + "df_sponsor_times[f\"{FE_PREFIX}submission_time\"] = df_sponsor_times[\"timeSubmitted\"] / 1e3\n", + "\n", + "df_temp = df_sponsor_times.pivot_table(\n", + " index=[\"videoID\"],\n", + " values=[\"fe_segment_duration\", \"views\", \"votes\", \"videoDuration\"],\n", + " aggfunc={\n", + " \"fe_segment_duration\": sum,\n", + " \"views\": 
sum,\n", + " \"votes\": sum,\n", + " \"videoDuration\": max,\n", + " },\n", + " columns=[\"category\"],\n", + " fill_value=0\n", + " )\n", + "\n", + "df_sponsor_times_video_id_level = pd.DataFrame(df_temp.to_records())\n", + "\n", + "column_names = [\"videoID\"]\n", + "for col in df_sponsor_times_video_id_level.columns.values[1:]:\n", + " col_1, category = eval(col)\n", + " column_names.append(col_1 + \"_\" + str(category))\n", + "\n", + "df_sponsor_times_video_id_level.columns = column_names\n", + "df_sponsor_times_video_id_level.drop(\n", + " [\n", + " \"videoDuration_0\",\n", + " \"videoDuration_1\",\n", + " \"videoDuration_2\"\n", + " ],\n", + " axis=1,\n", + " inplace=True\n", + ")\n", + "\n", + "video_duration_map = dict(zip(df_sponsor_times[\"videoID\"], df_sponsor_times[\"videoDuration\"]))\n", + "df_sponsor_times_video_id_level[\"videoDuration\"] = df_sponsor_times_video_id_level[\"videoID\"].map(video_duration_map)\n", + "\n", + "\n", + "for category in [0, 1, 2]:\n", + " df_sponsor_times_video_id_level[\n", + " f\"{FE_PREFIX}ratio_segment_duration_with_total_video_duration_{category}\"\n", + " ] = df_sponsor_times_video_id_level[f\"{FE_PREFIX}segment_duration_{category}\"] / df_sponsor_times_video_id_level[\"videoDuration\"]\n", + " \n", + " \n", + "# more than 35% is sponsor?\n", + "df_sponsor_times_video_id_level[f\"{FE_PREFIX}more_than_35_perc_sponsorship_content\"] = 0\n", + "df_sponsor_times_video_id_level[\"temp_1\"] = (\n", + " df_sponsor_times_video_id_level[\"fe_ratio_segment_duration_with_total_video_duration_0\"]\n", + " + df_sponsor_times_video_id_level[\"fe_ratio_segment_duration_with_total_video_duration_1\"]\n", + " + df_sponsor_times_video_id_level[\"fe_ratio_segment_duration_with_total_video_duration_2\"]\n", + ")\n", + "\n", + "df_sponsor_times_video_id_level.loc[\n", + " df_sponsor_times_video_id_level[\"temp_1\"] >= 0.344444449,\n", + " f\"{FE_PREFIX}more_than_35_perc_sponsorship_content\"\n", + "] = 1\n", + "\n", + 
"df_sponsor_times_video_id_level.drop([\"temp_1\"], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "17b1b489", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-20T13:47:38.567070Z", + "start_time": "2023-08-20T13:47:38.547962Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
videoIDfe_segment_duration_0fe_segment_duration_1fe_segment_duration_2views_0views_1views_2votes_0votes_1votes_2videoDurationfe_ratio_segment_duration_with_total_video_duration_0fe_ratio_segment_duration_with_total_video_duration_1fe_ratio_segment_duration_with_total_video_duration_2fe_more_than_35_perc_sponsorship_content
0--kZomtrtIQ5.7560.0000.000000000465.0460.0123770.0000000.0000000
1-2MyBawvlts5.9870.0000.000100000636.0640.0094130.0000000.0000000
2-3AfFa0rV6Q24.20222.07041.2690000002477.0000.0097710.0089100.0166610
3-3Q-k4WQTDI7.3840.0000.000000000242.0240.0305090.0000000.0000000
4-6WBAaHqT8g5.3340.0000.000000000242.0240.0220390.0000000.0000000
................................................
2358zwLHJlNMlf40.00024.5990.00007600001092.0000.0000000.0225270.0000000
2359zxKURXHy6es46.0730.0000.000000000733.0730.0628490.0000000.0000000
2360zxdOcHOrAdE0.00020.9800.00000000061.0600.0000000.3435960.0000000
2361zxi7Rm-lWTg43.5200.00043.33736011100555.0000.0784140.0000000.0780850
2362zzJ0iHJ7_nk24.830117.1000.0008731500-10605.2010.0410280.1934890.0000000
\n", + "

2363 rows × 15 columns

\n", + "
" + ], + "text/plain": [ + " videoID fe_segment_duration_0 fe_segment_duration_1 \\\n", + "0 --kZomtrtIQ 5.756 0.000 \n", + "1 -2MyBawvlts 5.987 0.000 \n", + "2 -3AfFa0rV6Q 24.202 22.070 \n", + "3 -3Q-k4WQTDI 7.384 0.000 \n", + "4 -6WBAaHqT8g 5.334 0.000 \n", + "... ... ... ... \n", + "2358 zwLHJlNMlf4 0.000 24.599 \n", + "2359 zxKURXHy6es 46.073 0.000 \n", + "2360 zxdOcHOrAdE 0.000 20.980 \n", + "2361 zxi7Rm-lWTg 43.520 0.000 \n", + "2362 zzJ0iHJ7_nk 24.830 117.100 \n", + "\n", + " fe_segment_duration_2 views_0 views_1 views_2 votes_0 votes_1 \\\n", + "0 0.000 0 0 0 0 0 \n", + "1 0.000 1 0 0 0 0 \n", + "2 41.269 0 0 0 0 0 \n", + "3 0.000 0 0 0 0 0 \n", + "4 0.000 0 0 0 0 0 \n", + "... ... ... ... ... ... ... \n", + "2358 0.000 0 76 0 0 0 \n", + "2359 0.000 0 0 0 0 0 \n", + "2360 0.000 0 0 0 0 0 \n", + "2361 43.337 36 0 11 1 0 \n", + "2362 0.000 87 315 0 0 -1 \n", + "\n", + " votes_2 videoDuration \\\n", + "0 0 465.046 \n", + "1 0 636.064 \n", + "2 0 2477.000 \n", + "3 0 242.024 \n", + "4 0 242.024 \n", + "... ... ... \n", + "2358 0 1092.000 \n", + "2359 0 733.073 \n", + "2360 0 61.060 \n", + "2361 0 555.000 \n", + "2362 0 605.201 \n", + "\n", + " fe_ratio_segment_duration_with_total_video_duration_0 \\\n", + "0 0.012377 \n", + "1 0.009413 \n", + "2 0.009771 \n", + "3 0.030509 \n", + "4 0.022039 \n", + "... ... \n", + "2358 0.000000 \n", + "2359 0.062849 \n", + "2360 0.000000 \n", + "2361 0.078414 \n", + "2362 0.041028 \n", + "\n", + " fe_ratio_segment_duration_with_total_video_duration_1 \\\n", + "0 0.000000 \n", + "1 0.000000 \n", + "2 0.008910 \n", + "3 0.000000 \n", + "4 0.000000 \n", + "... ... \n", + "2358 0.022527 \n", + "2359 0.000000 \n", + "2360 0.343596 \n", + "2361 0.000000 \n", + "2362 0.193489 \n", + "\n", + " fe_ratio_segment_duration_with_total_video_duration_2 \\\n", + "0 0.000000 \n", + "1 0.000000 \n", + "2 0.016661 \n", + "3 0.000000 \n", + "4 0.000000 \n", + "... ... 
\n", + "2358 0.000000 \n", + "2359 0.000000 \n", + "2360 0.000000 \n", + "2361 0.078085 \n", + "2362 0.000000 \n", + "\n", + " fe_more_than_35_perc_sponsorship_content \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "... ... \n", + "2358 0 \n", + "2359 0 \n", + "2360 0 \n", + "2361 0 \n", + "2362 0 \n", + "\n", + "[2363 rows x 15 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_sponsor_times_video_id_level" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "6288dab9", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-20T13:48:07.770850Z", + "start_time": "2023-08-20T13:48:07.720338Z" + } + }, + "outputs": [], + "source": [ + "df_sponsor_times_video_id_level.to_feather(\n", + " DATA_PATH / \"exploring_dataset_1_artifacts\" / \"sponsorTimes_filtered_videoID_level.feather\"\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/SEBICops/README.md b/SEBICops/README.md new file mode 100644 index 0000000..3a53e6d --- /dev/null +++ b/SEBICops/README.md @@ -0,0 +1,40 @@ +# Empowering-Investors-Hackathon + +#### Team Name - SEBICops +#### Problem Statement - Identifying Misleading Claims +#### Team Leader Email - mayur_madnani@intuit.com + +## A Brief of the Prototype: +The proposal is to run an unsupervised clustering algorithm using existing and crafted features to create different clusters for influencers who : +- Run for sponsorships +- Self-promote to sell their courses and merchandise +- Increase YouTube engagement by reiterating on like, 
subscribe, comment +- Genuine informative videos + +## Tech Stack: +- Python 3.11+ (Programming Language) +- Data Processing Module (e.g. Pandas, Spark) +- Clustering Module (e.g. K-Means, DBSCAN) +- Visualization Module (Plotly, Matplotlib, Seaborn) +- Dashboard interface (Streamlit) + +## Step-by-Step Code Execution Instructions: +Pre-requisite: +Set up a Python environment with the below dependencies: +- numpy +- pandas +- plotly +- feather +- scikit-learn +- streamlit + +## Raw Data +- Raw Data can be downloaded from [this](https://sponsor.ajay.app/database) link. Please be patient: the download takes around 10-15 minutes, depending on your internet bandwidth. + +Run the app from the `SEBICops/app` directory: +`streamlit run sebi_cops_streamlit_app.py` + +## What I Learned: +Not all content from influencers can be trusted blindly. An individual's sound judgement is necessary, even for content from selected influencers. +Technically, we learned a lot about data integration, feature engineering, and applying ML concepts to real-world scenarios. + diff --git a/SEBICops/app/data_with_knnlabels.feather b/SEBICops/app/data_with_knnlabels.feather new file mode 100644 index 0000000..8772f2b Binary files /dev/null and b/SEBICops/app/data_with_knnlabels.feather differ diff --git a/SEBICops/app/sebi_cops_streamlit_app.py b/SEBICops/app/sebi_cops_streamlit_app.py new file mode 100644 index 0000000..2206dce --- /dev/null +++ b/SEBICops/app/sebi_cops_streamlit_app.py @@ -0,0 +1,72 @@ +""" +Main application. 
+pip install -i https://pypi.python.org/simple plotly pandas streamlit pyarrow
+To Execute -: streamlit run sebi_cops_streamlit_app.py
+"""
+
+from pathlib import Path
+
+import pandas as pd
+import plotly.express as px
+import streamlit as st
+
+# Load the dataset from next to this script so the app works from any working directory.
+DATA_FILE = Path(__file__).with_name("data_with_knnlabels.feather")
+
+st.set_page_config(page_title="Clustering on Videos", layout="wide")
+sidebar = st.sidebar
+
+
+@st.cache_data
+def get_data():
+    """Load the clustered video table and add the summed per-category segment duration."""
+    data = pd.read_feather(DATA_FILE)
+    data["fe_segment_duration"] = (
+        data["fe_segment_duration_0"] + data["fe_segment_duration_1"] + data["fe_segment_duration_2"]
+    )
+    return data
+
+
+df_raw = get_data()
+# Cluster ids become strings so Plotly colours them as discrete categories.
+df_raw["label_knn"] = df_raw["label_knn"].astype(str)
+video_id_title_map = dict(zip(df_raw["videoID"], df_raw["title"]))
+video_id_total_duration_map = dict(zip(df_raw["videoID"], df_raw["videoDuration"]))
+video_id_fe_segment_duration_map = dict(zip(df_raw["videoID"], df_raw["fe_segment_duration"]))
+
+# A sorted list gives the selectbox a stable, ordered sequence of options (a set has no order).
+video_ids = sorted(df_raw["videoID"].dropna().unique())
+video_id_selector = sidebar.selectbox("Select a VideoID", video_ids)
+show_data = sidebar.checkbox("Show Data", True)
+subplots = sidebar.checkbox("Show Cluster Plots", True)
+
+st.markdown(f"# Currently Selected Video ID -: {video_id_selector}")
+
+# Boolean mask instead of a string-built query(), so IDs never need quoting or escaping.
+df_filtered_video_id = df_raw[df_raw["videoID"] == video_id_selector]
+columns_to_show = ["videoID", "title", "name", "channelID", "videoDuration", "label_knn"]
+if show_data:
+    st.dataframe(df_filtered_video_id[columns_to_show], use_container_width=True)  # flag is a bool
+
+# NOTE(review): fe_segment_duration sums categories 0-2; confirm it matches the label below.
+st.text(
+    f"""
+    Title => {video_id_title_map.get(video_id_selector)}
+    Sponsor & Self-Promotion duration => {video_id_fe_segment_duration_map.get(video_id_selector):.2f} secs
+    Total Duration => {video_id_total_duration_map.get(video_id_selector):.2f} secs
+    """
+)
+
+if subplots:  # honour the "Show Cluster Plots" toggle, which was previously read but ignored
+    fig = px.scatter_3d(
+        df_raw,
+        x="fe_segment_duration_1",
+        y="videoDuration",
+        z="fe_segment_duration_2",
+        color="label_knn",
+        hover_data=["name", "title"],
+        width=1024,
+        height=768,
+        title="Plot -: Clustering on video segments",
+    )
+    st.plotly_chart(fig, theme="streamlit", use_container_width=True)