-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathload_data.py
More file actions
117 lines (98 loc) · 3.69 KB
/
load_data.py
File metadata and controls
117 lines (98 loc) · 3.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import numpy as np
import pandas as pd
import wget
import os
import zipfile
def load_small():
    '''
    Load the MovieLens 'ml-latest-small' ratings and movies tables.

    When the 'ml-latest-small' directory is missing from the current working
    directory, the zip archive is first fetched with wget and extracted into
    the current directory.

    Returns
    -------
    data : dataframe read from ml-latest-small/ratings.csv
    movies : dataframe read from ml-latest-small/movies.csv
    '''
    dataset_present = os.path.isdir('ml-latest-small')
    if not dataset_present:
        # Fetch the archive once and unpack it next to the script.
        archive_path = wget.download(
            'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
        )
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(".")

    data = pd.read_csv('ml-latest-small/ratings.csv')
    movies = pd.read_csv('ml-latest-small/movies.csv')
    return data, movies
def load_medium():
    '''
    Load the MovieLens 10M ratings and movies tables.

    When the 'ml-10M100K' directory is missing from the current working
    directory, the zip archive is first fetched with wget and extracted into
    the current directory. Both .dat files are '::'-separated and have no
    header row, so column names are assigned after parsing.

    Returns
    -------
    data : ratings dataframe with columns userId, movieId, rating, timestamp
    movies : movies dataframe with columns movieId, title, genre
    '''
    if not os.path.isdir('ml-10M100K'):
        zip_path = wget.download('http://files.grouplens.org/datasets/movielens/ml-10m.zip')
        with zipfile.ZipFile(zip_path, 'r') as bundle:
            bundle.extractall(".")

    # The multi-character separator requires the python parsing engine.
    data = pd.read_csv('ml-10M100K/ratings.dat', sep='::', header=None, engine='python')
    data.columns = ['userId', 'movieId', 'rating', 'timestamp']

    movies = pd.read_csv('ml-10M100K/movies.dat', sep='::', header=None, engine='python')
    movies.columns = ['movieId', 'title', 'genre']

    return data, movies
def load_large():
    '''
    Load the MovieLens 25M ratings and movies tables.

    When the 'ml-25m' directory is missing from the current working
    directory, the zip archive is first fetched with wget and extracted into
    the current directory.

    Returns
    -------
    data : dataframe read from ml-25m/ratings.csv
    movies : dataframe read from ml-25m/movies.csv
    '''
    need_download = not os.path.isdir('ml-25m')
    if need_download:
        source_url = 'http://files.grouplens.org/datasets/movielens/ml-25m.zip'
        downloaded = wget.download(source_url)
        with zipfile.ZipFile(downloaded, 'r') as zf:
            zf.extractall(".")

    tables = [pd.read_csv(path) for path in ('ml-25m/ratings.csv', 'ml-25m/movies.csv')]
    data, movies = tables
    return data, movies
def load_dataset(size = 'small'):
    '''
    Load the ratings and movies tables for the requested dataset size.

    Parameters
    ----------
    size : 'large' or 'medium' pick the matching loader; any other value
        (including the default 'small') loads the small dataset

    Returns
    -------
    data : ratings dataframe
    movies : movies dataframe
    '''
    if size == 'large':
        return load_large()
    if size == 'medium':
        return load_medium()
    # Anything else falls back to the small dataset.
    return load_small()
def _to_object_array(lists):
    '''Pack a list of lists into a 1-D object array, one list per slot.'''
    packed = np.empty(len(lists), dtype=object)
    # Element-wise assignment: handing the nested list to numpy directly
    # would build a 2-D array whenever every inner list has the same length.
    for slot, item in enumerate(lists):
        packed[slot] = item
    return packed


def convert(data, num_movies):
    '''
    Convert the data from RAW csv to a single vector for every user
    This vector has ids of rated movies by a user

    Parameters
    ----------
    data : ratings dataframe with a 'userId' column; the second and third
        columns (by position) are read as the movie id and the rating
    num_movies : unused, kept so existing callers keep working

    Returns
    -------
    movie_arr : 1-D object array; entry k is the list of int movie ids rated
        by the k-th user, users ordered by first appearance in data
    rating_arr : 1-D object array; entry k is the matching list of ratings,
        each divided by 5
    '''
    user_keys = data['userId'].to_numpy()
    movie_ids = data.iloc[:, 1].astype(int)
    scaled_ratings = data.iloc[:, 2] / 5

    # sort=False yields users in order of first appearance (the order
    # unique() gives) and gathers every row of a user, even when that
    # user's rows are not contiguous in the input.
    movie_lists = [
        ids.tolist() for _, ids in movie_ids.groupby(user_keys, sort=False)
    ]
    rating_lists = [
        vals.tolist() for _, vals in scaled_ratings.groupby(user_keys, sort=False)
    ]

    return _to_object_array(movie_lists), _to_object_array(rating_lists)
def pre_process(data, movies):
    '''
    Re-code movie ids as category codes and build the per-user lists.

    The 'movieId' column of data is overwritten in place with the codes.

    Parameters
    ----------
    data : ratings dataframe
    movies : movies dataframe

    Returns
    -------
    movie_arr : per-user movie lists produced by convert
    ratings_arr : per-user rating lists produced by convert
    d : dict mapping each category code to its original movieId
    movies : movies dataframe indexed by movieId
    '''
    movie_categories = data['movieId'].astype('category')
    d = {code: movie_id for code, movie_id in enumerate(movie_categories.cat.categories)}
    data['movieId'] = movie_categories.cat.codes

    movies = movies.set_index('movieId')
    num_movies = len(movies)
    print("Number of movies : ",num_movies)

    per_user = convert(data, num_movies)
    movie_arr, ratings_arr = per_user
    return movie_arr, ratings_arr, d, movies
if __name__ == '__main__':
    # Script entry point: load the default dataset and run the preprocessing
    # once; the returned arrays are not used here.
    data, movies = load_dataset()
    pre_process(data, movies)