-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathTopicSpecificRecommendation.py
More file actions
472 lines (337 loc) · 17.6 KB
/
TopicSpecificRecommendation.py
File metadata and controls
472 lines (337 loc) · 17.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
"""
Recommendation System.
"""
# Author: Federico Zarfati <fz234@cornell.edu>
import networkx as nx
import scipy.sparse
import numpy as np
import scipy.sparse
from networkx.exception import NetworkXError
from networkx.algorithms import bipartite
class TopicSpecificRecommendation():
    """Topic Specific Recommendation based on Topic-Specific PageRank.

    Performs a graph transformation and uses Topic-Specific PageRank on the
    transformed (item-item) graph to obtain recommendations for every user
    in the graph.

    Tested on the MovieLens 100K dataset: 100000 ratings of 943 users on
    1682 movies, each rating a natural number in [1, 5].
    For more info on the dataset:
    http://grouplens.org/datasets/movielens/100k/

    Parameters
    ----------
    graph_users_items : dict
        The graph represented with a dictionary where
        graph_users_items['graph'] is a NetworkX bipartite user-item graph,
        graph_users_items['users'] is a set of user IDs, and
        graph_users_items['items'] is a set of item IDs.

    Attributes
    ----------
    item_item_graph : networkx.Graph
        Bipartite weighted projected graph generated from the
        graph_users_items graph.
    M : scipy sparse matrix
        Sparse adjacency matrix of item_item_graph, precomputed once so
        that the per-user PageRank calls do not rebuild it each time.
    """

    def __init__(self, graph_users_items):
        self.graph_users_items = graph_users_items
        self.item_item_graph = self.create_item_item_graph()
        # Precompute the sparse matrix once; pagerank_scipy_mod reuses it
        # for every user instead of rebuilding it per call.
        # NOTE(review): nx.to_scipy_sparse_matrix was removed in NetworkX 3.0
        # (replaced by nx.to_scipy_sparse_array) -- confirm the pinned
        # NetworkX version before upgrading.
        self.M = nx.to_scipy_sparse_matrix(self.item_item_graph,
                                           nodelist=self.item_item_graph.nodes(),
                                           weight='weight', dtype=float)

    def create_item_item_graph(self):
        """Generate the item-item graph.

        Wrapper of the NetworkX function that generates the bipartite
        weighted projection of the user-item graph onto the item nodes.

        Returns
        -------
        networkx.Graph
            The weighted item-item projection.
        """
        # The original assigned a throwaway nx.Graph() first (dead store);
        # the projection below builds the result from scratch.
        return bipartite.weighted_projected_graph(
            self.graph_users_items["graph"].to_undirected(),
            self.graph_users_items["items"])

    def pagerank_scipy_mod(self, alpha=0.85, personalization=None,
                           max_iter=100, tol=1.0e-6, weight='weight',
                           dangling=None):
        """Return the PageRank of the nodes in the item-item graph.

        ADAPTED FROM THE NETWORKX LIBRARY.
        https://networkx.github.io/documentation/networkx-1.10/
        _modules/networkx/algorithms/link_analysis/pagerank_alg.html#pagerank

        Because this function is called once per user, the scipy matrix of
        the graph is computed outside (in ``__init__``) and reused here,
        which gives a significant performance increase over the stock
        implementation.

        Parameters
        ----------
        alpha : float, optional
            Damping parameter for PageRank, default=0.85.
        personalization : dict, optional
            The "personalization vector": a dictionary with a key for every
            graph node and a nonzero personalization value for each node.
            By default, a uniform distribution is used.
        max_iter : integer, optional
            Maximum number of iterations in the power method solver.
        tol : float, optional
            Error tolerance used to check convergence in the power method.
        weight : key, optional
            Edge data key to use as weight. If None weights are set to 1.
            (Unused here: the precomputed matrix already carries weights.)
        dangling : dict, optional
            The outedges to be assigned to any "dangling" nodes (nodes
            without outedges). Keys are the nodes the outedge points to,
            values are the outedge weights. By default, dangling nodes get
            outedges according to the personalization vector.

        Returns
        -------
        dict
            Dictionary of nodes with PageRank as value.

        Raises
        ------
        NetworkXError
            If the personalization/dangling dicts miss nodes, or the power
            iteration fails to converge within ``max_iter`` iterations.

        References
        ----------
        .. [1] A. Langville and C. Meyer,
           "A survey of eigenvector methods of web information retrieval."
           http://citeseer.ist.psu.edu/713792.html
        .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd,
           Terry, The PageRank citation ranking: Bringing order to the Web.
        """
        G = self.item_item_graph
        N = len(G)
        if N == 0:
            return {}
        nodelist = G.nodes()
        # The scipy matrix computed in the initialization of the object.
        M = self.M
        # BUG FIX: scipy.array / scipy.repeat / scipy.where / scipy.absolute
        # were deprecated NumPy re-exports, removed from the scipy namespace;
        # use the NumPy equivalents directly.
        # Row-normalize M into a stochastic transition matrix.
        S = np.asarray(M.sum(axis=1)).flatten()
        S[S != 0] = 1.0 / S[S != 0]
        Q = scipy.sparse.spdiags(S.T, 0, *M.shape, format='csr')
        M = Q * M
        # Initial vector: uniform distribution over the N nodes.
        x = np.repeat(1.0 / N, N)
        # Personalization (teleport) vector.
        if personalization is None:
            p = np.repeat(1.0 / N, N)
        else:
            missing = set(nodelist) - set(personalization)
            if missing:
                raise NetworkXError('Personalization vector dictionary '
                                    'must have a value for every node. '
                                    'Missing nodes %s' % missing)
            p = np.array([personalization[n] for n in nodelist],
                         dtype=float)
            p = p / p.sum()
        # Dangling nodes: mass they would lose is redistributed by
        # dangling_weights (defaults to the personalization vector).
        if dangling is None:
            dangling_weights = p
        else:
            missing = set(nodelist) - set(dangling)
            if missing:
                raise NetworkXError('Dangling node dictionary '
                                    'must have a value for every node. '
                                    'Missing nodes %s' % missing)
            # Convert the dangling dictionary into an array in nodelist order.
            dangling_weights = np.array([dangling[n] for n in nodelist],
                                        dtype=float)
            dangling_weights /= dangling_weights.sum()
        is_dangling = np.where(S == 0)[0]
        # Power iteration: make up to max_iter iterations.
        for _ in range(max_iter):
            xlast = x
            x = alpha * (x * M + sum(x[is_dangling]) * dangling_weights) + \
                (1 - alpha) * p
            # Check convergence via the L1 norm of the update.
            err = np.absolute(x - xlast).sum()
            if err < N * tol:
                return dict(zip(nodelist, map(float, x)))
        raise NetworkXError('pagerank_scipy: power iteration failed to converge '
                            'in %d iterations.' % max_iter)

    def create_preference_vector_for_teleporting(self, user_id):
        """Compute the preference vector for a given user.

        To increase the personalization of the recommendation, the
        teleporting probability distribution among all nodes is biased
        using the rating values the input user associated to each rated
        item.

        Args:
            user_id: The id of the user.

        Returns:
            A dictionary mapping each item id to its (normalized)
            teleporting probability for this user.
        """
        graph = self.graph_users_items['graph']

        def rating_or_zero(item):
            # Rating of `item` by `user_id`; 0 when no such edge exists.
            # BUG FIX: the original bare `except:` also hid unrelated errors.
            try:
                return graph[user_id][item]['weight']
            except KeyError:
                return 0

        preference_vector = {item: rating_or_zero(item)
                             for item in self.graph_users_items['items']}
        if not preference_vector:
            return {}
        # Normalize by the sum of the user's ratings.
        den = sum(preference_vector.values())
        if den == 0:
            # Robustness: a user with no ratings gets a uniform teleporting
            # distribution instead of a ZeroDivisionError.
            uniform = 1.0 / len(preference_vector)
            return {item: uniform for item in preference_vector}
        return {item: value * 1.0 / den
                for item, value in preference_vector.items()}

    def create_ranked_list_of_recommended_items(self, page_rank_vector_of_items,
                                                user_id):
        """Ranked list from the resulting PageRank vector for a given user.

        Produces the sorted list of items in descending order of
        Topic-Specific PageRank score. The list must not contain the items
        already rated by the input user. This sorted list is the output of
        the recommendation process: most recommended first.

        Args:
            page_rank_vector_of_items: Resulting vector from PageRank
                (dict item -> score). Not modified by this call.
            user_id: The id of the user.

        Returns:
            List of item ids sorted in descending order of (score, item);
            most recommended first.
        """
        # BUG FIX: the original docstring claimed "True if successful" but a
        # ranked list is returned; the original also mutated the caller's
        # dict via `del` -- filtering avoids that side effect.
        seen = set(self.graph_users_items["graph"].neighbors(user_id))
        candidates = [(item, score)
                      for item, score in page_rank_vector_of_items.items()
                      if item not in seen]
        # Descending by score, ties broken by item id (as in the original).
        candidates.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
        return [item for item, _ in candidates]

    def recommended_for_user(self, user_id):
        """Recommendation for a given user.

        Given a user id we compute:
        - The preference vector of the given user.
        - The PageRank vector personalized for the given user.
        - The list of items recommended for the user.

        Args:
            user_id: The id of the user.

        Returns:
            The list of items sorted in descending order. Most recommended
            first.
        """
        preference_vector = self.create_preference_vector_for_teleporting(user_id)
        pagerank_vector = self.pagerank_scipy_mod(
            personalization=preference_vector)
        return self.create_ranked_list_of_recommended_items(pagerank_vector,
                                                            user_id)

    @staticmethod
    def _dcg_from_ratings(ratings):
        """DCG of an already-ordered list of integer ratings.

        DCG = r[0] + sum_{i >= 2} r[i] / log2(i) with 1-based positions,
        matching https://en.wikipedia.org/wiki/Discounted_cumulative_gain

        Args:
            ratings: Ratings in the order the items were ranked.

        Returns:
            The DCG value; 0.0 for an empty list (robustness guard).
        """
        if not ratings:
            return 0.0
        return ratings[0] + np.sum(
            np.asarray(ratings[1:]) / np.log2(np.arange(2, len(ratings) + 1)))

    @staticmethod
    def discounted_cumulative_gain(user_id, sorted_list_of_recommended_items,
                                   test_graph_users_items):
        """Compute the Discounted Cumulative Gain for a given user.

        https://en.wikipedia.org/wiki/Discounted_cumulative_gain

        Args:
            user_id: The id of the user.
            sorted_list_of_recommended_items: Output of the recommendation
                system (most recommended first).
            test_graph_users_items: The test graph.

        Returns:
            The DCG metric.
        """
        recommended = sorted_list_of_recommended_items
        # Item -> rating for the user's test edges, restricted to items
        # that were actually recommended.
        item_rating = {item: data["weight"]
                       for _, item, data in test_graph_users_items["graph"]
                       .edges(user_id, data=True)
                       if item in recommended}
        # PERF FIX: precompute each item's rank once instead of calling
        # list.index() inside the sort key (accidental O(n^2)).
        rank = {item: pos for pos, item in enumerate(recommended)}
        ordered = sorted(item_rating.items(), key=lambda kv: rank[kv[0]])
        ratings = [int(rating) for _, rating in ordered]
        return TopicSpecificRecommendation._dcg_from_ratings(ratings)

    @staticmethod
    def minimum_discounted_cumulative_gain(user_id, test_graph_users_items):
        """Minimum Discounted Cumulative Gain.

        It is also important to have a lower bound on the nDCG value;
        for this reason we compute the minimum DCG, i.e. the DCG of the
        worst possible ordering of the user's test ratings.

        Args:
            user_id: The id of the user.
            test_graph_users_items: The test graph.

        Returns:
            The Minimum DCG.
        """
        item_rating = [(item, data["weight"])
                       for _, item, data in test_graph_users_items["graph"]
                       .edges(user_id, data=True)]
        # Ascending order (worst ranking) gives the minimum DCG; ties are
        # broken by item id, as in the maximum computation.
        item_rating.sort(key=lambda pair: (pair[1], pair[0]))
        ratings = [int(rating) for _, rating in item_rating]
        return TopicSpecificRecommendation._dcg_from_ratings(ratings)

    @staticmethod
    def maximum_discounted_cumulative_gain(user_id, test_graph_users_items):
        """Maximum Discounted Cumulative Gain.

        To obtain a normalized version of DCG you divide the DCG of a user
        by the maximum DCG for that user, i.e. the DCG of the best possible
        ordering of the user's test ratings.

        Args:
            user_id: The id of the user.
            test_graph_users_items: The test graph.

        Returns:
            The Maximum DCG.
        """
        item_rating = [(item, data["weight"])
                       for _, item, data in test_graph_users_items["graph"]
                       .edges(user_id, data=True)]
        # Descending order (best ranking) gives the maximum DCG.
        item_rating.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
        ratings = [int(rating) for _, rating in item_rating]
        return TopicSpecificRecommendation._dcg_from_ratings(ratings)

    def calculate_dcg_metrics(self, test_graph_users_items):
        """Perform the recommendation for every user and calculate the DCG.

        For every user in the test graph the recommendation is computed,
        then tested against the test graph and the normalized metrics
        accumulated.

        Args:
            test_graph_users_items: The test graph.

        Returns:
            Tuple: (average normalized DCG lower bound,
                    average normalized DCG of the personal recommendation).
        """
        users = test_graph_users_items['users']
        sum_ndcg_personal = 0.0
        sum_ndcg_lower_bound = 0.0
        for user_id in users:
            # For every user we calculate the preference vector and PageRank.
            recommended_items = self.recommended_for_user(user_id)
            dcg_personal = self.discounted_cumulative_gain(
                user_id, recommended_items, test_graph_users_items)
            min_dcg = self.minimum_discounted_cumulative_gain(
                user_id, test_graph_users_items)
            max_dcg = self.maximum_discounted_cumulative_gain(
                user_id, test_graph_users_items)
            # Normalize both metrics by the maximum achievable DCG.
            sum_ndcg_lower_bound += min_dcg / float(max_dcg)
            sum_ndcg_personal += dcg_personal / float(max_dcg)
        n_users = float(len(users))
        return (sum_ndcg_lower_bound / n_users,
                sum_ndcg_personal / n_users)