From 236e3bae11f22df3f941b16ffb1fcd9c724cbaf6 Mon Sep 17 00:00:00 2001
From: Sride Shankar <sridevishankar1161@gmail.com>
Date: Fri, 11 Oct 2024 15:14:47 +0530
Subject: [PATCH 1/2] added files

---
 .../Music Recommendation/README.md            |  23 +
 .../Music Recommendation/clustering.ipynb     | 396 ++++++++++++++++++
 2 files changed, 419 insertions(+)
 create mode 100644 Recommendation Models/Music Recommendation/README.md
 create mode 100644 Recommendation Models/Music Recommendation/clustering.ipynb

diff --git a/Recommendation Models/Music Recommendation/README.md b/Recommendation Models/Music Recommendation/README.md
new file mode 100644
index 000000000..44e00880b
--- /dev/null
+++ b/Recommendation Models/Music Recommendation/README.md	
@@ -0,0 +1,23 @@
+## Mock Spotify Music Recommendation System ##
+**Project Overview:**
+In this project, we build a song recommendation system based on your personal Spotify data. It divides the songs you have liked into 'k' playlists using the K-Means Clustering algorithm, grouping them by similarity in audio features such as energy, tempo, danceability, etc. Looking at the clusters gives you an idea of what each playlist represents, e.g. you may notice that playlist #1 contains slow and melancholic songs. You can then test the recommendation ability of the system on unseen songs (for example, new songs by a particular artist, or any other batch of random songs) and check whether the playlist each new song is assigned to makes sense.
+K-Means is a popular unsupervised machine learning algorithm used for clustering data into groups based on feature similarity. The goal is to partition a dataset into 'k' distinct clusters, where each data point belongs to the cluster with the nearest mean.
+**Steps of the algorithm:**
+Initialization: Choose 'k' initial centroids randomly from the data points.
+Assignment Step: Assign each data point to the nearest centroid, forming 'k' clusters.
+Update Step: Recalculate the centroids as the mean of the points in each cluster.
+Convergence Check: Repeat the assignment and update steps until centroids no longer change significantly or a maximum number of iterations is reached.
+We use the library function from sklearn to achieve our purpose here.
+**Pros and Cons of the algorithm:**
+Pros:
+
+* Simple to implement and understand.
+* Efficient for large datasets.
+* Works well with spherical clusters.
+
+Cons:
+
+* Requires the number of clusters 'k' to be specified in advance.
+* Sensitive to initial centroid placement.
+* Can converge to local minima.
+
diff --git a/Recommendation Models/Music Recommendation/clustering.ipynb b/Recommendation Models/Music Recommendation/clustering.ipynb
new file mode 100644
index 000000000..8a8648d96
--- /dev/null
+++ b/Recommendation Models/Music Recommendation/clustering.ipynb	
@@ -0,0 +1,396 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "be744dc8-5cd6-4365-8d3e-45d4b0a11f3c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# login to your Spotify account and create a new app.\n",
+    "# set a redirect url: IMPORTANT: this will be used to get our access code. e.g. https://google.com\n",
+    "# get your client_id and client_secret credentials\n",
+    "# we need these to authenticate ourselves to query the Spotify API."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24d3a5c1-ea62-4e06-ab10-eefb5fde9de8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "CLIENT_ID = # type in your client_id here\n",
+    "CLIENT_SECRET = # type in your client_secret here\n",
+    "REDIRECT_URI = # type in the url you set here"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "69bbab88-3f28-4507-970b-fcfcbffec974",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "CODE = # go to f\"https://accounts.spotify.com/authorize?redirect_uri={REDIRECT_URI}&response_type=code&client_id={CLIENT_ID}&scope=user-library-read\"\n",
+    "# and get the code from the params of the url you've been redirected to. you can visit this endpoint either through your browser or Postman.\n",
+    "# e.g. of the redirected url: https://www.google.com/?code={code}\n",
+    "# this is necessary because the endpoints we will be using require the user-library-read scope to deal with our personal Spotify data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2bf455de-0de8-4876-ba2e-aa161ad723e6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import base64\n",
+    "import requests\n",
+    "\n",
+    "# all of this to get an access token with which we can query and retrieve our liked songs and our personal data\n",
+    "data = {\n",
+    "    'grant_type': 'authorization_code',\n",
+    "    'code': CODE,\n",
+    "    'redirect_uri': REDIRECT_URI\n",
+    "}\n",
+    "credentials = f'{CLIENT_ID}:{CLIENT_SECRET}'\n",
+    "encoded_credentials = base64.b64encode(credentials.encode('utf-8')).decode('utf-8')\n",
+    "headers = {\n",
+    "        'content-type': 'application/x-www-form-urlencoded',\n",
+    "        'Authorization': 'Basic ' + encoded_credentials\n",
+    "}\n",
+    "response = requests.post('https://accounts.spotify.com/api/token', headers=headers,data=data)\n",
+    "print(response.json())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7832f1bb-f989-46d5-b0bb-4b83022f2fb7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ACCESS_TOKEN = response.json()['access_token'] # acquire the access token from the response"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "64adeb90-cd5f-4118-ada5-f59becfd945a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "headers = {\n",
+    "    'Authorization': 'Bearer ' + ACCESS_TOKEN\n",
+    "}\n",
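+    "response = requests.get('https://api.spotify.com/v1/me/tracks', headers=headers) # get your liked songs. NOTE: the endpoint is paginated, so this returns only the first page \n",
+    "# (20 tracks by default; pass a 'limit' query parameter of up to 50 to control the size of the response received)\n",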
+    "res_dict = response.json()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4a19f6d2-2e28-4515-9a28-1a3ac8001e34",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(res_dict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fbc49a04-82c6-4718-bc0b-e0d07fd7b591",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(res_dict['items'][0]['track']['id'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "47b20e04-8a6d-4278-8d7a-9819c2aa8d30",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# collecting song ids and song names\n",
+    "list_of_ids = []\n",
+    "list_of_names = []\n",
+    "for i in range(len(res_dict['items'])):\n",
+    "    list_of_ids.append(res_dict['items'][i]['track']['id'])\n",
+    "    list_of_names.append(res_dict['items'][i]['track']['name'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bb6d5cf7-e857-4c94-917c-cbae84a98f6d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# GET request: https://api.spotify.com/v1/audio-features\n",
+    "# building a comma-separated string of ids for the 'ids' param of the audio-features endpoint. NOTE(review): the last line below re-appends the final id, so it is sent twice.\n",
+    "request_str = \"\"\n",
+    "for i in range(len(list_of_ids)):\n",
+    "    request_str += list_of_ids[i] + \",\"\n",
+    "request_str = request_str[:len(request_str)-1] + \",\" + list_of_ids[len(list_of_ids)-1]\n",
+    "print(request_str)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ef150aff-ab96-41de-adc4-50f0ad968b3a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# getting a list of important features and their values for each song in your liked songs.\n",
+    "# some of these features include: tempo, danceability, energy, liveness, etc.\n",
+    "# we can use these audio features to cluster songs into playlists and further classify new ones.\n",
+    "headers = {\n",
+    "    'Authorization': 'Bearer ' + ACCESS_TOKEN\n",
+    "}\n",
+    "\n",
+    "params = {\n",
+    "    'ids': request_str,\n",
+    "}\n",
+    "\n",
+    "response = requests.get('https://api.spotify.com/v1/audio-features', params=params, headers=headers)\n",
+    "print(response.json())\n",
+    "audio_features = response.json()['audio_features']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1b0211a6-8721-4d49-9608-b815da891849",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4a6fd9b5-ac8d-44ab-ade0-9d6da3a37eec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get all the data into a Pandas DataFrame. drop irrelevant and categorical features so that we can work with numerical data while running the\n",
+    "# k-means clustering algorithm.\n",
+    "df = pd.DataFrame(audio_features)\n",
+    "df = df.drop('type', axis=1)\n",
+    "df = df.drop('uri', axis=1)\n",
+    "df = df.drop('track_href', axis=1)\n",
+    "df = df.drop('analysis_url', axis=1)\n",
+    "df = df.drop('time_signature', axis=1)\n",
+    "df['name'] = pd.DataFrame(list_of_names)\n",
+    "print(df.head())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0f1bfe18-e928-417b-b65a-57d47677620f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.cluster import KMeans\n",
+    "from sklearn.preprocessing import MinMaxScaler"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2e62f52e-f9ac-405d-ba9d-4f91c83c8618",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# segregating similar songs into three playlists\n",
+    "new_df = df.drop('id', axis=1)\n",
+    "new_df = new_df.drop('name', axis=1)\n",
+    "# normalize the features, this is important for clustering to ensure that a feature doesn't unintentionally dominate over others owing to its \n",
+    "# higher range of values.\n",
+    "X = MinMaxScaler().fit_transform(new_df)\n",
+    "kmeans = KMeans(init=\"k-means++\", n_clusters=3, random_state=42).fit(X)\n",
+    "df['kmeans'] = kmeans.labels_\n",
+    "print(df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0889587e-7a64-4573-9fbe-d0b47bc7f9c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "# getting a list of songs from an artist e.g. Billie Eilish and categorizing each song under a playlist\n",
+    "headers = {\n",
+    "    'Authorization': 'Bearer ' + ACCESS_TOKEN\n",
+    "}\n",
+    "# you can give the id of any artist whose tracks you wish to explore\n",
+    "artist_id = \"6qqNVTkY8uBg9cP3Jd7DAH\"\n",
+    "response = requests.get(f'https://api.spotify.com/v1/artists/{artist_id}/top-tracks', headers=headers)\n",
+    "print(response.json())\n",
+    "new_songs = response.json()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5221b533-8ddc-4ec5-a27e-a02829bb953c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# segregating clusters of songs into 3 playlists\n",
+    "# taking a look at each playlist, you can understand what mood each of these playlists represents.\n",
+    "# e.g. you may notice that energetic songs fall under one playlist, soft and melancholic ones under another, etc.\n",
+    "cluster_1 = df[df['kmeans']==0]\n",
+    "cluster_2 = df[df['kmeans']==1]\n",
+    "cluster_3 = df[df['kmeans']==2]\n",
+    "print(\"Playlist #1:\")\n",
+    "print(cluster_1)\n",
+    "print(\"Playlist #2:\")\n",
+    "print(cluster_2)\n",
+    "print(\"Playlist #3:\")\n",
+    "print(cluster_3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0bfe406b-bab4-49c5-a84d-a9956e25ecb9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# accumulating song names and ids for reference\n",
+    "list_of_ids = []\n",
+    "list_of_names = []\n",
+    "for i in range(len(new_songs['tracks'])):\n",
+    "    list_of_ids.append(new_songs['tracks'][i]['id'])\n",
+    "    list_of_names.append(new_songs['tracks'][i]['name'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ebe3de6a-b56a-487a-8d50-d93974cbab2c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(list_of_names)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1452ad0f-448a-406a-8a09-657deb9ce0c9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
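+    "# preparing the comma-separated 'ids' query parameter. NOTE(review): the last line below re-appends the final id, so it is sent twice.\n",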
+    "request_str = \"\"\n",
+    "for i in range(len(list_of_ids)):\n",
+    "    request_str += list_of_ids[i] + \",\"\n",
+    "request_str = request_str[:len(request_str)-1] + \",\" + list_of_ids[len(list_of_ids)-1]\n",
+    "print(request_str)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "795f7dec-9e77-4d74-ab6f-419fd65e04c4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "headers = {\n",
+    "    'Authorization': 'Bearer ' + ACCESS_TOKEN\n",
+    "}\n",
+    "\n",
+    "params = {\n",
+    "    'ids': request_str,\n",
+    "}\n",
+    "# getting the audio features for the new songs\n",
+    "response = requests.get('https://api.spotify.com/v1/audio-features', params=params, headers=headers)\n",
+    "print(response.json())\n",
+    "audio_features = response.json()['audio_features']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1168c97c-f1f5-4d16-b888-a0830ba7e9ca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# removing features which weren't used while fitting our dataset: categorical and irrelevant ones.\n",
+    "test_df = pd.DataFrame(audio_features)\n",
+    "test_df['name'] = pd.DataFrame(list_of_names)\n",
+    "test_df['id'] = pd.DataFrame(list_of_ids)\n",
+    "cpy_df = test_df.drop('name', axis=1)\n",
+    "cpy_df = cpy_df.drop('id', axis=1)\n",
+    "cpy_df = cpy_df.drop('uri', axis=1)\n",
+    "cpy_df = cpy_df.drop('track_href', axis=1)\n",
+    "cpy_df = cpy_df.drop('type', axis=1)\n",
+    "cpy_df = cpy_df.drop('analysis_url', axis=1)\n",
+    "cpy_df = cpy_df.drop('time_signature', axis=1)\n",
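+    "# scaling the new songs and using the previously trained model to predict their clusters. NOTE(review): this fits a fresh MinMaxScaler on the new songs instead of reusing the one fitted on the liked songs, so the scaled values are not on the same basis as the training data.\n",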
+    "cpy_df = MinMaxScaler().fit_transform(cpy_df)\n",
+    "test_df['kmeans'] = kmeans.predict(cpy_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a22f9721-0dd9-4997-8188-c066d5c97756",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# take a look at the categories the new songs fall under\n",
+    "print(\"New additions to Playlist #1: \")\n",
+    "print(test_df[test_df['kmeans']==0])\n",
+    "print(\"New additions to Playlist #2: \")\n",
+    "print(test_df[test_df['kmeans']==1])\n",
+    "print(\"New additions to Playlist #3: \")\n",
+    "print(test_df[test_df['kmeans']==2])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "81c301ba-3b26-40b2-8392-29398b1c5c30",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From f709c3ee5cc1d1dd81a53d01b577485d40fe5cf7 Mon Sep 17 00:00:00 2001
From: Sride Shankar <sridevishankar1161@gmail.com>
Date: Fri, 11 Oct 2024 18:46:55 +0530
Subject: [PATCH 2/2] add visualization

---
 Recommendation Models/Music Recommendation/clustering.ipynb | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Recommendation Models/Music Recommendation/clustering.ipynb b/Recommendation Models/Music Recommendation/clustering.ipynb
index 8a8648d96..d53306707 100644
--- a/Recommendation Models/Music Recommendation/clustering.ipynb	
+++ b/Recommendation Models/Music Recommendation/clustering.ipynb	
@@ -369,7 +369,10 @@
    "id": "81c301ba-3b26-40b2-8392-29398b1c5c30",
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "# visualizing the dataset in clusters\n",
+    "sns.pairplot(data = df, hue = 'kmeans')"
+   ]
   }
  ],
  "metadata": {