Skip to content

Commit 2fbbb90

Browse files
committed
some methods for dataset analysis added
1 parent 9aebb91 commit 2fbbb90

File tree

3 files changed

+93
-6
lines changed

3 files changed

+93
-6
lines changed

src/common/Bookmark.java

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,38 @@ public List<Integer> getTags() {
113113

114114
// Statics ----------------------------------------------------------------------------------
115115

116+
public static double getAvgNumberOfTopics(List<Bookmark> lines) {
117+
double sum = 0.0;
118+
for (Bookmark line : lines) {
119+
sum += line.getCategories().size();
120+
}
121+
return sum / lines.size();
122+
}
123+
124+
public static double getBookmarkDiversity(List<Bookmark> lines) {
125+
if (lines.size() == 0) {
126+
return 0.0;
127+
}
128+
if (lines.size() == 1) {
129+
return 1.0;
130+
}
131+
132+
double diversity = 0.0;
133+
int size = 0;
134+
for (int i = 0; i < lines.size() - 1; i++) {
135+
for (int j = i + 1; j < lines.size(); j++) {
136+
Bookmark srcBookmark = lines.get(i);
137+
Bookmark destBookmark = lines.get(j);
138+
diversity += Utilities.getJaccardSimLists(srcBookmark.getCategories(), destBookmark.getCategories());
139+
size++;
140+
}
141+
}
142+
if (size == 0) {
143+
return 0.0;
144+
}
145+
return diversity / size;
146+
}
147+
116148
public static Bookmark getUserData(List<Bookmark> lines, int userID, int resID) {
117149
Bookmark returnData = null;
118150
for (Bookmark data : lines) {

src/common/Utilities.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,8 @@ public static double getJaccardSimLists(List<Integer> targetMap, List<Integer> n
424424
Set<Integer> intersectSet = new HashSet<Integer>(targetMap);
425425
unionSet.addAll(nMap);
426426
intersectSet.retainAll(nMap);
427+
if (intersectSet.size() == 0 || unionSet.size() == 0)
428+
return 0.0;
427429
return (double)intersectSet.size() / (double)unionSet.size();
428430
}
429431

@@ -432,6 +434,8 @@ public static double getJaccardFloatSim(Map<Integer, Double> targetMap, Map<Inte
432434
Set<Integer> intersectSet = new HashSet<Integer>(targetMap.keySet());
433435
unionSet.addAll(nMap.keySet());
434436
intersectSet.retainAll(nMap.keySet());
437+
if (intersectSet.size() == 0 || unionSet.size() == 0)
438+
return 0.0;
435439
return (double)intersectSet.size() / (double)unionSet.size();
436440
}
437441

@@ -442,6 +446,8 @@ public static double getCosineSim(Map<Integer, Integer> targetMap, Map<Integer,
442446
for (int k : both) scalar += (targetMap.get(k) * nMap.get(k));
443447
for (int k : targetMap.keySet()) norm1 += (targetMap.get(k) * targetMap.get(k));
444448
for (int k : nMap.keySet()) norm2 += (nMap.get(k) * nMap.get(k));
449+
if (Math.sqrt(norm1 * norm2) == 0.0)
450+
return 0.0;
445451
return scalar / Math.sqrt(norm1 * norm2);
446452
}
447453

@@ -454,6 +460,8 @@ public static double getCosineSimList(List<Integer> targetList, List<Integer> nL
454460
for (int k : both) scalar += (targetMap.get(k) * nMap.get(k));
455461
for (int k : targetMap.keySet()) norm1 += (targetMap.get(k) * targetMap.get(k));
456462
for (int k : nMap.keySet()) norm2 += (nMap.get(k) * nMap.get(k));
463+
if (Math.sqrt(norm1 * norm2) == 0.0)
464+
return 0.0;
457465
return scalar / Math.sqrt(norm1 * norm2);
458466
}
459467

@@ -464,6 +472,8 @@ public static double getCosineFloatSim(Map<Integer, Double> targetMap, Map<Integ
464472
for (int k : both) scalar += (targetMap.get(k) * nMap.get(k));
465473
for (int k : targetMap.keySet()) norm1 += (targetMap.get(k) * targetMap.get(k));
466474
for (int k : nMap.keySet()) norm2 += (nMap.get(k) * nMap.get(k));
475+
if (Math.sqrt(norm1 * norm2) == 0.0)
476+
return 0.0;
467477
return scalar / Math.sqrt(norm1 * norm2);
468478
}
469479

src/test/Pipeline.java

Lines changed: 51 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,19 @@ License, or (at your option) any later version.
2424
import itemrecommendations.Resource3LTCalculator;
2525
import itemrecommendations.ZhengCalculator;
2626

27+
import java.io.BufferedWriter;
28+
import java.io.File;
29+
import java.io.FileWriter;
30+
import java.io.IOException;
2731
import java.util.ArrayList;
2832
import java.util.Arrays;
2933
import java.util.List;
34+
import java.util.Map;
3035

36+
import common.Bookmark;
3137
import common.CalculationType;
3238
import common.Features;
39+
import common.Utilities;
3340
import processing.BLLCalculator;
3441
import processing.BM25Calculator;
3542
import processing.BaselineCalculator;
@@ -65,9 +72,9 @@ public class Pipeline {
6572
// set for categorizer/describer split (true is describer, false is categorizer - null for nothing)
6673
private final static Boolean DESCRIBER = null;
6774
// placeholder for the topic postfix
68-
private static String TOPIC_NAME = null;
75+
private static String TOPIC_NAME = "lda_500";
6976
// placeholder for the used dataset
70-
private final static String DATASET = "cul";
77+
private final static String DATASET = "lastfm";
7178

7279
public static void main(String[] args) {
7380
System.out.println("TagRecommender:\n" + "" +
@@ -88,7 +95,7 @@ public static void main(String[] args) {
8895
// Resource-Recommender testing
8996
String dir = DATASET + "_core";
9097
String path = dir + "/" + DATASET + "_sample";
91-
//getStatistics(path);
98+
//try { getStatistics(path, true); } catch (IOException e) { e.printStackTrace(); }
9299
//writeTensorFiles(path, false);
93100
//evaluate(dir, path, "wrmf_500_mml", TOPIC_NAME, false, true);
94101
//createLdaSamples(path, 1, 500, false);
@@ -477,19 +484,57 @@ private static List<Integer> getBetaValues(int betaUpperBound) {
477484
return betaValues;
478485
}
479486

480-
private static void getStatistics(String dataset) {
487+
private static void getStatistics(String dataset, boolean writeAll) throws IOException {
488+
if (TOPIC_NAME != null) {
489+
dataset += ("_" + TOPIC_NAME);
490+
}
481491
BookmarkReader reader = new BookmarkReader(0, false);
482492
reader.readFile(dataset);
493+
483494
int bookmarks = reader.getBookmarks().size();
484-
System.out.println("Bookmarks: " + bookmarks);
495+
System.out.println("Posts: " + bookmarks);
485496
int users = reader.getUsers().size();
486497
System.out.println("Users: " + users);
487498
int resources = reader.getResources().size();
488499
System.out.println("Resources: " + resources);
489500
int tags = reader.getTags().size();
490501
System.out.println("Tags: " + tags);
491502
int tagAssignments = reader.getTagAssignmentsCount();
492-
System.out.println("Tag-Assignments: " + tagAssignments);
503+
System.out.println("Tag-Assignments: " + tagAssignments);
504+
int categories = reader.getCategories().size();
505+
System.out.println("Topics: " + categories);
506+
double avgBookmarksPerUser = (double)bookmarks / users;
507+
System.out.println("Avg. resources/posts per user: " + avgBookmarksPerUser);
508+
double avgBookmarksPerResource = (double)bookmarks / resources;
509+
System.out.println("Avg. users/posts per resource: " + avgBookmarksPerResource);
510+
511+
if (writeAll) {
512+
getTrainTestSize(dataset);
513+
FileWriter userWriter = new FileWriter(new File("./data/metrics/" + dataset + "_userStats.txt"));
514+
BufferedWriter userBW = new BufferedWriter(userWriter);
515+
userBW.write("UserID| NoOfResources| NoOfTopics| Topic-Similarity\n");
516+
List<Bookmark> trainList = reader.getBookmarks().subList(0, TRAIN_SIZE);
517+
List<Integer> testUsers = reader.getUniqueUserListFromTestSet(TRAIN_SIZE);
518+
System.out.println();
519+
520+
double avgTopicsPerUser = 0.0;
521+
double avgTopicDiversityPerUser = 0.0;
522+
List<Map<Integer, Double>> userTopics = Utilities.getRelativeTopicMaps(trainList, false);
523+
List<List<Bookmark>> userBookmarks = Utilities.getBookmarks(trainList, false);
524+
for (int userID : testUsers) {
525+
Map<Integer, Double> topicsOfUser = userTopics.get(userID);
526+
double topicDiversityOfUser = Bookmark.getBookmarkDiversity(userBookmarks.get(userID));
527+
userBW.write(userID + "| " + reader.getUserCounts().get(userID) + "| " + topicsOfUser.keySet().size() + "| " + topicDiversityOfUser + "\n");
528+
avgTopicsPerUser += topicsOfUser.keySet().size();
529+
avgTopicDiversityPerUser += topicDiversityOfUser;
530+
}
531+
System.out.println("Avg. topics per user: " + avgTopicsPerUser / testUsers.size());
532+
System.out.println("Avg. topic-similarity per user: " + avgTopicDiversityPerUser / testUsers.size());
533+
double avgTopicsPerResource = Bookmark.getAvgNumberOfTopics(trainList);
534+
System.out.println("Avg. topics per resource: " + avgTopicsPerResource);
535+
userBW.flush();
536+
userBW.close();
537+
}
493538
}
494539

495540
private static void getTrainTestSize(String sample) {

0 commit comments

Comments
 (0)