-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathTopicSpecificRecommendation.py
More file actions
472 lines (337 loc) · 17.6 KB
/
TopicSpecificRecommendation.py
File metadata and controls
472 lines (337 loc) · 17.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
"""
Recommendation System.
"""
# Author: Federico Zarfati <fz234@cornell.edu>
import networkx as nx
import scipy.sparse
import numpy as np
import scipy.sparse
from networkx.exception import NetworkXError
from networkx.algorithms import bipartite
class TopicSpecificRecommendation():
    """Topic Specific Recommendation based on Topic-Specific PageRank.

    Performs a graph transformation and uses Topic-Specific PageRank on the
    transformed (item-item) graph to obtain recommendations for every user
    in the graph.

    Tested on the MovieLens 100K dataset: 100000 ratings of 943 users on
    1682 movies, each rating a natural number in [1, 5].
    For more info on the dataset:
    http://grouplens.org/datasets/movielens/100k/

    Parameters
    ----------
    graph_users_items : dict
        The graph represented with a dictionary where
        graph_users_items['graph'] is a NetworkX bipartite user-item graph,
        graph_users_items['users'] is a set of user IDs, and
        graph_users_items['items'] is a set of item IDs.

    Attributes
    ----------
    item_item_graph : networkx.Graph
        Bipartite weighted projected graph generated from the
        graph_users_items graph.
    M : scipy sparse matrix
        Sparse adjacency matrix of item_item_graph, precomputed once so
        that the per-user PageRank calls do not rebuild it each time.
    """

    def __init__(self, graph_users_items):
        self.graph_users_items = graph_users_items
        self.item_item_graph = self.create_item_item_graph()
        # Precompute the sparse matrix once; pagerank_scipy_mod reuses it
        # for every user instead of rebuilding it per call.
        # NOTE(review): nx.to_scipy_sparse_matrix was removed in NetworkX 3.0
        # (replaced by nx.to_scipy_sparse_array) -- confirm the pinned
        # NetworkX version before upgrading.
        self.M = nx.to_scipy_sparse_matrix(self.item_item_graph,
                                           nodelist=self.item_item_graph.nodes(),
                                           weight='weight', dtype=float)

    def create_item_item_graph(self):
        """Generate the item-item graph.

        Wrapper of the NetworkX function that generates the bipartite
        weighted projection of the user-item graph onto the item nodes.

        Returns
        -------
        networkx.Graph
            The weighted item-item projection.
        """
        # The original assigned a throwaway nx.Graph() first (dead store);
        # the projection below builds the result from scratch.
        return bipartite.weighted_projected_graph(
            self.graph_users_items["graph"].to_undirected(),
            self.graph_users_items["items"])

    def pagerank_scipy_mod(self, alpha=0.85, personalization=None,
                           max_iter=100, tol=1.0e-6, weight='weight',
                           dangling=None):
        """Return the PageRank of the nodes in the item-item graph.

        ADAPTED FROM THE NETWORKX LIBRARY.
        https://networkx.github.io/documentation/networkx-1.10/
        _modules/networkx/algorithms/link_analysis/pagerank_alg.html#pagerank

        Because this function is called once per user, the scipy matrix of
        the graph is computed outside (in ``__init__``) and reused here,
        which gives a significant performance increase over the stock
        implementation.

        Parameters
        ----------
        alpha : float, optional
            Damping parameter for PageRank, default=0.85.
        personalization : dict, optional
            The "personalization vector": a dictionary with a key for every
            graph node and a nonzero personalization value for each node.
            By default, a uniform distribution is used.
        max_iter : integer, optional
            Maximum number of iterations in the power method solver.
        tol : float, optional
            Error tolerance used to check convergence in the power method.
        weight : key, optional
            Edge data key to use as weight. If None weights are set to 1.
            (Unused here: the precomputed matrix already carries weights.)
        dangling : dict, optional
            The outedges to be assigned to any "dangling" nodes (nodes
            without outedges). Keys are the nodes the outedge points to,
            values are the outedge weights. By default, dangling nodes get
            outedges according to the personalization vector.

        Returns
        -------
        dict
            Dictionary of nodes with PageRank as value.

        Raises
        ------
        NetworkXError
            If the personalization/dangling dicts miss nodes, or the power
            iteration fails to converge within ``max_iter`` iterations.

        References
        ----------
        .. [1] A. Langville and C. Meyer,
           "A survey of eigenvector methods of web information retrieval."
           http://citeseer.ist.psu.edu/713792.html
        .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd,
           Terry, The PageRank citation ranking: Bringing order to the Web.
        """
        G = self.item_item_graph
        N = len(G)
        if N == 0:
            return {}
        nodelist = G.nodes()
        # The scipy matrix computed in the initialization of the object.
        M = self.M
        # BUG FIX: scipy.array / scipy.repeat / scipy.where / scipy.absolute
        # were deprecated NumPy re-exports, removed from the scipy namespace;
        # use the NumPy equivalents directly.
        # Row-normalize M into a stochastic transition matrix.
        S = np.asarray(M.sum(axis=1)).flatten()
        S[S != 0] = 1.0 / S[S != 0]
        Q = scipy.sparse.spdiags(S.T, 0, *M.shape, format='csr')
        M = Q * M
        # Initial vector: uniform distribution over the N nodes.
        x = np.repeat(1.0 / N, N)
        # Personalization (teleport) vector.
        if personalization is None:
            p = np.repeat(1.0 / N, N)
        else:
            missing = set(nodelist) - set(personalization)
            if missing:
                raise NetworkXError('Personalization vector dictionary '
                                    'must have a value for every node. '
                                    'Missing nodes %s' % missing)
            p = np.array([personalization[n] for n in nodelist],
                         dtype=float)
            p = p / p.sum()
        # Dangling nodes: mass they would lose is redistributed by
        # dangling_weights (defaults to the personalization vector).
        if dangling is None:
            dangling_weights = p
        else:
            missing = set(nodelist) - set(dangling)
            if missing:
                raise NetworkXError('Dangling node dictionary '
                                    'must have a value for every node. '
                                    'Missing nodes %s' % missing)
            # Convert the dangling dictionary into an array in nodelist order.
            dangling_weights = np.array([dangling[n] for n in nodelist],
                                        dtype=float)
            dangling_weights /= dangling_weights.sum()
        is_dangling = np.where(S == 0)[0]
        # Power iteration: make up to max_iter iterations.
        for _ in range(max_iter):
            xlast = x
            x = alpha * (x * M + sum(x[is_dangling]) * dangling_weights) + \
                (1 - alpha) * p
            # Check convergence via the L1 norm of the update.
            err = np.absolute(x - xlast).sum()
            if err < N * tol:
                return dict(zip(nodelist, map(float, x)))
        raise NetworkXError('pagerank_scipy: power iteration failed to converge '
                            'in %d iterations.' % max_iter)

    def create_preference_vector_for_teleporting(self, user_id):
        """Compute the preference vector for a given user.

        To increase the personalization of the recommendation, the
        teleporting probability distribution among all nodes is biased
        using the rating values the input user associated to each rated
        item.

        Args:
            user_id: The id of the user.

        Returns:
            A dictionary mapping each item id to its (normalized)
            teleporting probability for this user.
        """
        graph = self.graph_users_items['graph']

        def rating_or_zero(item):
            # Rating of `item` by `user_id`; 0 when no such edge exists.
            # BUG FIX: the original bare `except:` also hid unrelated errors.
            try:
                return graph[user_id][item]['weight']
            except KeyError:
                return 0

        preference_vector = {item: rating_or_zero(item)
                             for item in self.graph_users_items['items']}
        if not preference_vector:
            return {}
        # Normalize by the sum of the user's ratings.
        den = sum(preference_vector.values())
        if den == 0:
            # Robustness: a user with no ratings gets a uniform teleporting
            # distribution instead of a ZeroDivisionError.
            uniform = 1.0 / len(preference_vector)
            return {item: uniform for item in preference_vector}
        return {item: value * 1.0 / den
                for item, value in preference_vector.items()}

    def create_ranked_list_of_recommended_items(self, page_rank_vector_of_items,
                                                user_id):
        """Ranked list from the resulting PageRank vector for a given user.

        Produces the sorted list of items in descending order of
        Topic-Specific PageRank score. The list must not contain the items
        already rated by the input user. This sorted list is the output of
        the recommendation process: most recommended first.

        Args:
            page_rank_vector_of_items: Resulting vector from PageRank
                (dict item -> score). Not modified by this call.
            user_id: The id of the user.

        Returns:
            List of item ids sorted in descending order of (score, item);
            most recommended first.
        """
        # BUG FIX: the original docstring claimed "True if successful" but a
        # ranked list is returned; the original also mutated the caller's
        # dict via `del` -- filtering avoids that side effect.
        seen = set(self.graph_users_items["graph"].neighbors(user_id))
        candidates = [(item, score)
                      for item, score in page_rank_vector_of_items.items()
                      if item not in seen]
        # Descending by score, ties broken by item id (as in the original).
        candidates.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
        return [item for item, _ in candidates]

    def recommended_for_user(self, user_id):
        """Recommendation for a given user.

        Given a user id we compute:
        - The preference vector of the given user.
        - The PageRank vector personalized for the given user.
        - The list of items recommended for the user.

        Args:
            user_id: The id of the user.

        Returns:
            The list of items sorted in descending order. Most recommended
            first.
        """
        preference_vector = self.create_preference_vector_for_teleporting(user_id)
        pagerank_vector = self.pagerank_scipy_mod(
            personalization=preference_vector)
        return self.create_ranked_list_of_recommended_items(pagerank_vector,
                                                            user_id)

    @staticmethod
    def _dcg_from_ratings(ratings):
        """DCG of an already-ordered list of integer ratings.

        DCG = r[0] + sum_{i >= 2} r[i] / log2(i) with 1-based positions,
        matching https://en.wikipedia.org/wiki/Discounted_cumulative_gain

        Args:
            ratings: Ratings in the order the items were ranked.

        Returns:
            The DCG value; 0.0 for an empty list (robustness guard).
        """
        if not ratings:
            return 0.0
        return ratings[0] + np.sum(
            np.asarray(ratings[1:]) / np.log2(np.arange(2, len(ratings) + 1)))

    @staticmethod
    def discounted_cumulative_gain(user_id, sorted_list_of_recommended_items,
                                   test_graph_users_items):
        """Compute the Discounted Cumulative Gain for a given user.

        https://en.wikipedia.org/wiki/Discounted_cumulative_gain

        Args:
            user_id: The id of the user.
            sorted_list_of_recommended_items: Output of the recommendation
                system (most recommended first).
            test_graph_users_items: The test graph.

        Returns:
            The DCG metric.
        """
        recommended = sorted_list_of_recommended_items
        # Item -> rating for the user's test edges, restricted to items
        # that were actually recommended.
        item_rating = {item: data["weight"]
                       for _, item, data in test_graph_users_items["graph"]
                       .edges(user_id, data=True)
                       if item in recommended}
        # PERF FIX: precompute each item's rank once instead of calling
        # list.index() inside the sort key (accidental O(n^2)).
        rank = {item: pos for pos, item in enumerate(recommended)}
        ordered = sorted(item_rating.items(), key=lambda kv: rank[kv[0]])
        ratings = [int(rating) for _, rating in ordered]
        return TopicSpecificRecommendation._dcg_from_ratings(ratings)

    @staticmethod
    def minimum_discounted_cumulative_gain(user_id, test_graph_users_items):
        """Minimum Discounted Cumulative Gain.

        It is also important to have a lower bound on the nDCG value;
        for this reason we compute the minimum DCG, i.e. the DCG of the
        worst possible ordering of the user's test ratings.

        Args:
            user_id: The id of the user.
            test_graph_users_items: The test graph.

        Returns:
            The Minimum DCG.
        """
        item_rating = [(item, data["weight"])
                       for _, item, data in test_graph_users_items["graph"]
                       .edges(user_id, data=True)]
        # Ascending order (worst ranking) gives the minimum DCG; ties are
        # broken by item id, as in the maximum computation.
        item_rating.sort(key=lambda pair: (pair[1], pair[0]))
        ratings = [int(rating) for _, rating in item_rating]
        return TopicSpecificRecommendation._dcg_from_ratings(ratings)

    @staticmethod
    def maximum_discounted_cumulative_gain(user_id, test_graph_users_items):
        """Maximum Discounted Cumulative Gain.

        To obtain a normalized version of DCG you divide the DCG of a user
        by the maximum DCG for that user, i.e. the DCG of the best possible
        ordering of the user's test ratings.

        Args:
            user_id: The id of the user.
            test_graph_users_items: The test graph.

        Returns:
            The Maximum DCG.
        """
        item_rating = [(item, data["weight"])
                       for _, item, data in test_graph_users_items["graph"]
                       .edges(user_id, data=True)]
        # Descending order (best ranking) gives the maximum DCG.
        item_rating.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
        ratings = [int(rating) for _, rating in item_rating]
        return TopicSpecificRecommendation._dcg_from_ratings(ratings)

    def calculate_dcg_metrics(self, test_graph_users_items):
        """Perform the recommendation for every user and calculate the DCG.

        For every user in the test graph the recommendation is computed,
        then tested against the test graph and the normalized metrics
        accumulated.

        Args:
            test_graph_users_items: The test graph.

        Returns:
            Tuple: (average normalized DCG lower bound,
                    average normalized DCG of the personal recommendation).
        """
        users = test_graph_users_items['users']
        sum_ndcg_personal = 0.0
        sum_ndcg_lower_bound = 0.0
        for user_id in users:
            # For every user we calculate the preference vector and PageRank.
            recommended_items = self.recommended_for_user(user_id)
            dcg_personal = self.discounted_cumulative_gain(
                user_id, recommended_items, test_graph_users_items)
            min_dcg = self.minimum_discounted_cumulative_gain(
                user_id, test_graph_users_items)
            max_dcg = self.maximum_discounted_cumulative_gain(
                user_id, test_graph_users_items)
            # Normalize both metrics by the maximum achievable DCG.
            sum_ndcg_lower_bound += min_dcg / float(max_dcg)
            sum_ndcg_personal += dcg_personal / float(max_dcg)
        n_users = float(len(users))
        return (sum_ndcg_lower_bound / n_users,
                sum_ndcg_personal / n_users)