From 236e3bae11f22df3f941b16ffb1fcd9c724cbaf6 Mon Sep 17 00:00:00 2001 From: Sride Shankar Date: Fri, 11 Oct 2024 15:14:47 +0530 Subject: [PATCH 1/2] added files --- .../Music Recommendation/README.md | 23 + .../Music Recommendation/clustering.ipynb | 396 ++++++++++++++++++ 2 files changed, 419 insertions(+) create mode 100644 Recommendation Models/Music Recommendation/README.md create mode 100644 Recommendation Models/Music Recommendation/clustering.ipynb diff --git a/Recommendation Models/Music Recommendation/README.md b/Recommendation Models/Music Recommendation/README.md new file mode 100644 index 000000000..44e00880b --- /dev/null +++ b/Recommendation Models/Music Recommendation/README.md @@ -0,0 +1,23 @@ +## Mock Spotify Music Recommendation System ## +**Project Overview:** +In this project, we will build a song recommendation system based on your personal Spotify data that divides the songs that you liked into 'k' number of playlists using the K-Means Clustering algorithm based on similarity in audio features such as energy, tempo, danceability, etc. Taking a look at the clustering, you will get an idea about what each playlist represents, e.g. you may notice that playlist #1 contains slow and melancholic songs, etc. Further, you get to test the recommendation ability of the system by getting new songs by a particular artist/any other way to get a bunch of unseen random songs to test whether it makes sense for the new songs to be classified under the category they have been assigned to. +K-Means is a popular unsupervised machine learning algorithm used for clustering data into groups based on feature similarity. The goal is to partition a dataset into 'k' distinct clusters, where each data point belongs to the cluster with the nearest mean. +**Steps of the algorithm:** +Initialization: Choose 'k' initial centroids randomly from the data points. +Assignment Step: Assign each data point to the nearest centroid, forming kk clusters. +Update Step: Recalculate the centroids as the mean of the points in each cluster. +Convergence Check: Repeat the assignment and update steps until centroids no longer change significantly or a maximum number of iterations is reached. +We use the library function from sklearn to achieve our purpose here. +**Pros and Cons of the algorithm:** +Pros: + +* Simple to implement and understand. +* Efficient for large datasets. +* Works well with spherical clusters. + +Cons: + +* Requires the number of clusters 'k' to be specified in advance. +* Sensitive to initial centroid placement. +* Can converge to local minima. + diff --git a/Recommendation Models/Music Recommendation/clustering.ipynb b/Recommendation Models/Music Recommendation/clustering.ipynb new file mode 100644 index 000000000..8a8648d96 --- /dev/null +++ b/Recommendation Models/Music Recommendation/clustering.ipynb @@ -0,0 +1,396 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "be744dc8-5cd6-4365-8d3e-45d4b0a11f3c", + "metadata": {}, + "outputs": [], + "source": [ + "# login to your Spotify account and create a new app.\n", + "# set a redirect url: IMPORTANT: this will be used to get our access code. e.g. https://google.com\n", + "# get your client_id and client_secret credentials\n", + "# we need these to authenticate ourselves to query the Spotify API." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24d3a5c1-ea62-4e06-ab10-eefb5fde9de8", + "metadata": {}, + "outputs": [], + "source": [ + "CLIENT_ID = # type in your client_id here\n", + "CLIENT_SECRET = # type in your client_secret here\n", + "REDIRECT_URI = # type in the url you set here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69bbab88-3f28-4507-970b-fcfcbffec974", + "metadata": {}, + "outputs": [], + "source": [ + "CODE = # go to f\"https://accounts.spotify.com/authorize?redirect_uri={REDIRECT_URI}&response_type=code&client_id={CLIENT_ID}&scope=user-library-read\"\n", + "# and get the code from the params of the url you've been redirected to. you can visit this endpoint either through your browser or Postman.\n", + "# e.g. of the redirected url: https://www.google.com/?code={code}\n", + "# this is necessary because the endpoints we will be using require the user-library-read scope to deal with our personal Spotify data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bf455de-0de8-4876-ba2e-aa161ad723e6", + "metadata": {}, + "outputs": [], + "source": [ + "import base64\n", + "import requests\n", + "\n", + "# all of this to get an access token with which we can query and retrieve our liked songs and our personal data\n", + "data = {\n", + " 'grant_type': 'authorization_code',\n", + " 'code': CODE,\n", + " 'redirect_uri': REDIRECT_URI\n", + "}\n", + "credentials = f'{CLIENT_ID}:{CLIENT_SECRET}'\n", + "encoded_credentials = base64.b64encode(credentials.encode('utf-8')).decode('utf-8')\n", + "headers = {\n", + " 'content-type': 'application/x-www-form-urlencoded',\n", + " 'Authorization': 'Basic ' + encoded_credentials\n", + "}\n", + "response = requests.post('https://accounts.spotify.com/api/token', headers=headers,data=data)\n", + "print(response.json())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7832f1bb-f989-46d5-b0bb-4b83022f2fb7", + "metadata": {}, + "outputs": [], + "source": [ + "ACCESS_TOKEN = response.json()['access_token'] # acquire the access token from the response" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64adeb90-cd5f-4118-ada5-f59becfd945a", + "metadata": {}, + "outputs": [], + "source": [ + "headers = {\n", + " 'Authorization': 'Bearer ' + ACCESS_TOKEN\n", + "}\n", + "response = requests.get('https://api.spotify.com/v1/me/tracks', headers=headers) # get all the songs you liked. set a limit if you want to control the \n", + "# response received\n", + "res_dict = response.json()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a19f6d2-2e28-4515-9a28-1a3ac8001e34", + "metadata": {}, + "outputs": [], + "source": [ + "print(res_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbc49a04-82c6-4718-bc0b-e0d07fd7b591", + "metadata": {}, + "outputs": [], + "source": [ + "print(res_dict['items'][0]['track']['id'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47b20e04-8a6d-4278-8d7a-9819c2aa8d30", + "metadata": {}, + "outputs": [], + "source": [ + "# collecting song ids and song names\n", + "list_of_ids = []\n", + "list_of_names = []\n", + "for i in range(len(res_dict['items'])):\n", + " list_of_ids.append(res_dict['items'][i]['track']['id'])\n", + " list_of_names.append(res_dict['items'][i]['track']['name'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb6d5cf7-e857-4c94-917c-cbae84a98f6d", + "metadata": {}, + "outputs": [], + "source": [ + "# GET request: https://api.spotify.com/v1/audio-features\n", + "# building a list of ids as a string type for the param to be passed to the audio-features endpoint.\n", + "request_str = \"\"\n", + "for i in range(len(list_of_ids)):\n", + " request_str += list_of_ids[i] + \",\"\n", + "request_str = request_str[:len(request_str)-1] + \",\" + list_of_ids[len(list_of_ids)-1]\n", + "print(request_str)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef150aff-ab96-41de-adc4-50f0ad968b3a", + "metadata": {}, + "outputs": [], + "source": [ + "# getting a list of important features and their values for each song in your liked songs.\n", + "# some of these features include: tempo, danceability, energy, liveness, etc.\n", + "# we can use these audio features to cluster songs into playlists and further classify new ones.\n", + "headers = {\n", + " 'Authorization': 'Bearer ' + ACCESS_TOKEN\n", + "}\n", + "\n", + "params = {\n", + " 'ids': request_str,\n", + "}\n", + "\n", + "response = requests.get('https://api.spotify.com/v1/audio-features', params=params, headers=headers)\n", + "print(response.json())\n", + "audio_features = response.json()['audio_features']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b0211a6-8721-4d49-9608-b815da891849", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a6fd9b5-ac8d-44ab-ade0-9d6da3a37eec", + "metadata": {}, + "outputs": [], + "source": [ + "# get all the data into a Pandas DataFrame. drop irrelevant and categorical features so that we can work with numerical data while running the\n", + "# k-means clustering algorithm.\n", + "df = pd.DataFrame(audio_features)\n", + "df = df.drop('type', axis=1)\n", + "df = df.drop('uri', axis=1)\n", + "df = df.drop('track_href', axis=1)\n", + "df = df.drop('analysis_url', axis=1)\n", + "df = df.drop('time_signature', axis=1)\n", + "df['name'] = pd.DataFrame(list_of_names)\n", + "print(df.head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f1bfe18-e928-417b-b65a-57d47677620f", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.cluster import KMeans\n", + "from sklearn.preprocessing import MinMaxScaler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e62f52e-f9ac-405d-ba9d-4f91c83c8618", + "metadata": {}, + "outputs": [], + "source": [ + "# segregating similar songs into three playlists\n", + "new_df = df.drop('id', axis=1)\n", + "new_df = new_df.drop('name', axis=1)\n", + "# normalize the features, this is important for clustering to ensure that a feature doesn't unintentionally dominate over others owing to its \n", + "# higher range of values.\n", + "X = MinMaxScaler().fit_transform(new_df)\n", + "kmeans = KMeans(init=\"k-means++\", n_clusters=3, random_state=42).fit(X)\n", + "df['kmeans'] = kmeans.labels_\n", + "print(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0889587e-7a64-4573-9fbe-d0b47bc7f9c0", + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "# getting a list of songs from an artist e.g. Billie Eilish and categorizing each song under a playlist\n", + "headers = {\n", + " 'Authorization': 'Bearer ' + ACCESS_TOKEN\n", + "}\n", + "# you can give the id of any artist whose tracks you wish to explore\n", + "artist_id = \"6qqNVTkY8uBg9cP3Jd7DAH\"\n", + "response = requests.get(f'https://api.spotify.com/v1/artists/{artist_id}/top-tracks', headers=headers)\n", + "print(response.json())\n", + "new_songs = response.json()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5221b533-8ddc-4ec5-a27e-a02829bb953c", + "metadata": {}, + "outputs": [], + "source": [ + "# segregating clusters of songs into 3 playlists\n", + "# taking a look at each playlist, you can understand what mood each of these playlists represents.\n", + "# for e.g. you may notice that energetic songs fall under a certain playlist, soft and melancholic ones fall under another, etc.\n", + "cluster_1 = df[df['kmeans']==0]\n", + "cluster_2 = df[df['kmeans']==1]\n", + "cluster_3 = df[df['kmeans']==2]\n", + "print(\"Playlist #1:\")\n", + "print(cluster_1)\n", + "print(\"Playlist #2:\")\n", + "print(cluster_2)\n", + "print(\"Playlist #3:\")\n", + "print(cluster_3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0bfe406b-bab4-49c5-a84d-a9956e25ecb9", + "metadata": {}, + "outputs": [], + "source": [ + "# accumulating song names and ids for reference\n", + "list_of_ids = []\n", + "list_of_names = []\n", + "for i in range(len(new_songs['tracks'])):\n", + " list_of_ids.append(new_songs['tracks'][i]['id'])\n", + " list_of_names.append(new_songs['tracks'][i]['name'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebe3de6a-b56a-487a-8d50-d93974cbab2c", + "metadata": {}, + "outputs": [], + "source": [ + "print(list_of_names)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1452ad0f-448a-406a-8a09-657deb9ce0c9", + "metadata": {}, + "outputs": [], + "source": [ + "# preparing query parameter\n", + "request_str = \"\"\n", + "for i in range(len(list_of_ids)):\n", + " request_str += list_of_ids[i] + \",\"\n", + "request_str = request_str[:len(request_str)-1] + \",\" + list_of_ids[len(list_of_ids)-1]\n", + "print(request_str)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "795f7dec-9e77-4d74-ab6f-419fd65e04c4", + "metadata": {}, + "outputs": [], + "source": [ + "headers = {\n", + " 'Authorization': 'Bearer ' + ACCESS_TOKEN\n", + "}\n", + "\n", + "params = {\n", + " 'ids': request_str,\n", + "}\n", + "# getting audio analysis features\n", + "response = requests.get('https://api.spotify.com/v1/audio-features', params=params, headers=headers)\n", + "print(response.json())\n", + "audio_features = response.json()['audio_features']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1168c97c-f1f5-4d16-b888-a0830ba7e9ca", + "metadata": {}, + "outputs": [], + "source": [ + "# removing features which weren't used while fitting our dataset: categorical and irrelevant ones.\n", + "test_df = pd.DataFrame(audio_features)\n", + "test_df['name'] = pd.DataFrame(list_of_names)\n", + "test_df['id'] = pd.DataFrame(list_of_ids)\n", + "cpy_df = test_df.drop('name', axis=1)\n", + "cpy_df = cpy_df.drop('id', axis=1)\n", + "cpy_df = cpy_df.drop('uri', axis=1)\n", + "cpy_df = cpy_df.drop('track_href', axis=1)\n", + "cpy_df = cpy_df.drop('type', axis=1)\n", + "cpy_df = cpy_df.drop('analysis_url', axis=1)\n", + "cpy_df = cpy_df.drop('time_signature', axis=1)\n", + "# scaling the testing dataset and using the previously trained model to predict the clusters for the new dataset.\n", + "cpy_df = MinMaxScaler().fit_transform(cpy_df)\n", + "test_df['kmeans'] = kmeans.predict(cpy_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a22f9721-0dd9-4997-8188-c066d5c97756", + "metadata": {}, + "outputs": [], + "source": [ + "# take a look at the categories the new songs fall under\n", + "print(\"New additions to Playlist #1: \")\n", + "print(test_df[test_df['kmeans']==0])\n", + "print(\"New additions to Playlist #2: \")\n", + "print(test_df[test_df['kmeans']==1])\n", + "print(\"New additions to Playlist #3: \")\n", + "print(test_df[test_df['kmeans']==2])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81c301ba-3b26-40b2-8392-29398b1c5c30", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From f709c3ee5cc1d1dd81a53d01b577485d40fe5cf7 Mon Sep 17 00:00:00 2001 From: Sride Shankar Date: Fri, 11 Oct 2024 18:46:55 +0530 Subject: [PATCH 2/2] add visualization --- Recommendation Models/Music Recommendation/clustering.ipynb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Recommendation Models/Music Recommendation/clustering.ipynb b/Recommendation Models/Music Recommendation/clustering.ipynb index 8a8648d96..d53306707 100644 --- a/Recommendation Models/Music Recommendation/clustering.ipynb +++ b/Recommendation Models/Music Recommendation/clustering.ipynb @@ -369,7 +369,10 @@ "id": "81c301ba-3b26-40b2-8392-29398b1c5c30", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# visualizing the dataset in clusters\n", + "sns.pairplot(data = df, hue = 'kmeans')" + ] } ], "metadata": {