Skip to content

Commit 15beba4

Browse files
committed
nothing much
2 parents 8790428 + d07c5b7 commit 15beba4

11 files changed

+363
-1
lines changed

run.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ export HADOOP_HEAPSIZE=6144
1818
export HADOOP_CLIENT_OPTS="-Xmx6g $HADOOP_CLIENT_OPTS"
1919

2020
mvn clean
21-
mvn assembly:assembly
21+
mvn -DskipTests assembly:assembly
2222

2323
cp $LDBC_SNB_DATAGEN_HOME/target/ldbc_snb_datagen.jar $LDBC_SNB_DATAGEN_HOME/
2424
rm $LDBC_SNB_DATAGEN_HOME/target/ldbc_snb_datagen.jar

test.sh

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
#!/bin/bash
# Builds the datagen jar, generates the test dataset and runs every
# validation script against the generated files.

DEFAULT_HADOOP_HOME=/home/user/hadoop-2.6.0.stand #change to your hadoop folder
DEFAULT_LDBC_SNB_DATAGEN_HOME=/home/user/ldbc_snb_datagen_0.2 #change to your ldbc_socialnet_dbgen folder
PARAM_GENERATION=1 #param generation

# allow overriding configuration from outside via environment variables
# i.e. you can do
# HADOOP_HOME=/foo/bar LDBC_SNB_DATAGEN_HOME=/baz/quux ./test.sh
# instead of changing the contents of this file
HADOOP_HOME=${HADOOP_HOME:-$DEFAULT_HADOOP_HOME}
LDBC_SNB_DATAGEN_HOME=${LDBC_SNB_DATAGEN_HOME:-$DEFAULT_LDBC_SNB_DATAGEN_HOME}

export HADOOP_HOME
export LDBC_SNB_DATAGEN_HOME

mvn clean
mvn -DskipTests assembly:assembly

cp "$LDBC_SNB_DATAGEN_HOME/target/ldbc_snb_datagen.jar" "$LDBC_SNB_DATAGEN_HOME/"

# run the generator through hadoop's RunJar with the test parameter file
java -cp "$LDBC_SNB_DATAGEN_HOME/ldbc_snb_datagen.jar" org.apache.hadoop.util.RunJar "$LDBC_SNB_DATAGEN_HOME/ldbc_snb_datagen.jar" "$LDBC_SNB_DATAGEN_HOME/test_params.ini"


### TEST SCRIPTS ####

ENTITIES="person post comment forum place tag tagclass organisation "
RELATIONS="person_knows_person organisation_isLocatedIn_place place_isPartOf_place tagclass_isSubclassOf_tagclass tag_hasType_tagclass person_studyAt_organisation person_hasInterest_tag person_workAt_organisation person_isLocatedIn_place forum_hasTag_tag forum_hasModerator_person forum_hasMember_person forum_containerOf_post comment_hasCreator_person comment_hasTag_tag comment_isLocatedIn_place comment_replyOf_comment comment_replyOf_post post_hasCreator_person post_hasTag_tag post_isLocatedIn_place person_likes_comment person_likes_post"
ATTRIBUTES="person_speaks_language person_email_emailaddress " # NOTE(review): currently unused

FILES="$ENTITIES $RELATIONS " # NOTE(review): currently unused

# entity files must have globally unique ids in their first column
for file in $ENTITIES
do
  echo "TESTING FILE: $file"
  python2 ./test/validateIdUniqueness.py ./test_data/social_network/${file}_?_?.csv
done

# relation files must not contain a repeated (column 0, column 1) pair
for file in $RELATIONS
do
  echo "TESTING FILE: $file"
  python2 ./test/validatePairUniqueness.py 0 1 ./test_data/social_network/${file}_?_?.csv
done

echo "TESTING KNOWS SUBGRAPH INTEGRITY"
python2 ./test/validateKnowsGraph.py ./test_data/social_network

echo "TESTING STUDYAT SUBGRAPH INTEGRITY"
python2 ./test/validateStudyAt.py ./test_data/social_network/

echo "TESTING UPDATE STREAMS"
# glob directly instead of parsing `ls` output
for file in ./test_data/social_network/updateStream_*
do
  python2 ./test/validateUpdateStream.py "$file"
done

test/compareDatasets.sh

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# Compares two generated datasets file by file: strips the header of every
# csv chunk, sorts the remaining rows (so chunking and row order do not
# matter) and compares md5 checksums.

if [ $# -ne 2 ]
then
  echo "Arguments not correctly supplied"
  # the old usage text described four parameters and the wrong script
  # name; the script accepts and uses exactly two directories
  echo "Usage: sh compareDatasets.sh <dir1> <dir2>"
  exit 1
fi

FILES="person person_email_emailaddress person_studyAt_organisation person_hasInterest_tag person_workAt_organisation person_isLocatedIn_place person_knows_person person_speaks_language tag tagclass_isSubclassOf_tagclass tag_hasType_tagclass tagclass place place_isPartOf_place organisation organisation_isLocatedIn_place"
FILES2="post comment forum forum_hasTag_tag forum_hasModerator_person forum_hasMember_person forum_containerOf_post comment_hasCreator_person comment_hasTag_tag comment_isLocatedIn_place comment_replyOf_comment comment_replyOf_post post_hasCreator_person post_hasTag_tag post_isLocatedIn_place person_likes_comment person_likes_post"

FILES="$FILES $FILES2"

DIR_1=$1
DIR_2=$2

for file in $FILES
do
  echo "CHECKING FILE $file"
  # drop the header line of every chunk, then sort for a canonical order
  # (no useless $( ) wrapper: these commands only redirect, they print nothing)
  tail -q -n +2 "$DIR_1"/${file}_?_?.csv | sort > .auxFile1
  tail -q -n +2 "$DIR_2"/${file}_?_?.csv | sort > .auxFile2

  # computing checksums
  a=$(md5sum .auxFile1 | awk '{print $1}')
  b=$(md5sum .auxFile2 | awk '{print $1}')

  if [ "$a" == "$b" ]
  then
    echo ${file} are equal
    echo ${a}
    echo ${b}
  else
    echo ERROR!!!!! ${file} are different
    echo ${a}
    echo ${b}
    # clean up the scratch files on the failure path too, and signal failure
    rm -f .auxFile1 .auxFile2
    exit 1
  fi
  echo "---------------------"
done
echo GREAT!!!!! the two datasets are the same!
rm -f .auxFile1
rm -f .auxFile2

test/testDataset.sh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
#!/bin/bash
# Validates the knows graph of a single generated dataset directory.

if [ $# -ne 1 ]
then
  echo "Arguments not correctly supplied"
  echo "Usage: sh testDataset <dir>"
  exit 1
fi

# quote the directory so paths with spaces survive word-splitting
python2 ./validateKnowsGraph.py "$1"

test/testDatasets.sh

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
#!/bin/bash
# Compares two generated datasets file by file: strips the header of every
# csv chunk, sorts the remaining rows (so chunking and row order do not
# matter) and compares md5 checksums.

if [ $# -ne 2 ]
then
  echo "Arguments not correctly supplied"
  # the old usage text described four parameters; the script accepts and
  # uses exactly two directories
  echo "Usage: sh testDatasets <dir1> <dir2>"
  exit 1
fi

FILES="person person_email_emailaddress person_studyAt_organisation person_hasInterest_tag person_workAt_organisation person_isLocatedIn_place person_knows_person person_speaks_language tag tagclass_isSubclassOf_tagclass tag_hasType_tagclass tagclass place place_isPartOf_place organisation organisation_isLocatedIn_place"

FILES2="post comment forum forum_hasTag_tag forum_hasModerator_person forum_hasMember_person forum_containerOf_post comment_hasCreator_person comment_hasTag_tag comment_isLocatedIn_place comment_replyOf_comment comment_replyOf_post post_hasCreator_person post_hasTag_tag post_isLocatedIn_place person_likes_comment person_likes_post"

FILES="$FILES $FILES2"

DIR_1=$1
DIR_2=$2

for file in $FILES
do
  echo "CHECKING FILE $file"
  # drop the header line of every chunk, then sort for a canonical order
  # (no useless $( ) wrapper: these commands only redirect, they print nothing)
  tail -q -n +2 "$DIR_1"/${file}_?_?.csv | sort > .auxFile1
  tail -q -n +2 "$DIR_2"/${file}_?_?.csv | sort > .auxFile2

  # computing checksums
  a=$(md5sum .auxFile1 | awk '{print $1}')
  b=$(md5sum .auxFile2 | awk '{print $1}')

  if [ "$a" == "$b" ]
  then
    echo ${file} are equal
    echo ${a}
    echo ${b}
  else
    echo ERROR!!!!! ${file} are different
    echo ${a}
    echo ${b}
    # clean up the scratch files on the failure path too, and signal failure
    rm -f .auxFile1 .auxFile2
    exit 1
  fi
  echo "---------------------"
done
echo GREAT!!!!! the two datasets are the same!
rm -f .auxFile1
rm -f .auxFile2

test/validateIdUniqueness.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
2+
import sys, os
3+
from sets import Set
4+
5+
6+
if( len(sys.argv) == 1):
7+
print("Validates if the ids in the union of the first column of the input files are unique.")
8+
print("Usage: validateIdUniqueness <file0> <file1> ... <filen>")
9+
10+
ids = Set()
11+
for i in range(1,len(sys.argv)):
12+
print("Reading "+sys.argv[i])
13+
inputFile = open(sys.argv[i],'r')
14+
index = 0
15+
for line in inputFile.readlines():
16+
if index > 0:
17+
newId = int((line.split('|'))[0])
18+
if newId in ids:
19+
print("ERROR: Id "+str(newId)+" already exists")
20+
print("Line "+str(index+1)+" of "+sys.argv[i])
21+
exit()
22+
ids.add(newId)
23+
index+=1
24+
inputFile.close()
25+
print("GREAT! All ids are different.")

test/validateKnowsGraph.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
2+
import sys, os
3+
import glob
4+
5+
6+
if( len(sys.argv) == 1):
7+
print("Validates the correcness of the knows graph.")
8+
print("Usage: validateKnowsGraph <dir>")
9+
10+
person_files = glob.glob(sys.argv[1]+'/person_?_?.csv')
11+
knows_files = glob.glob(sys.argv[1]+'/person_knows_person_?_?.csv')
12+
13+
persons = set()
14+
15+
for filename in person_files:
16+
file = open(filename,"r")
17+
print("reading "+filename)
18+
count = 0
19+
for line in file.readlines():
20+
if count != 0:
21+
fields = line.split('|')
22+
persons.add(int(fields[0]))
23+
count+=1
24+
file.close()
25+
26+
27+
for filename in knows_files:
28+
file = open(filename,"r")
29+
print("reading "+filename)
30+
count = 0
31+
for line in file.readlines():
32+
if count != 0:
33+
fields = line.split('|')
34+
if (int(fields[0]) not in persons):
35+
print("ERROR: missing person "+fields[0])
36+
exit()
37+
if (int(fields[1]) not in persons):
38+
print("ERROR: missing person "+fields[1])
39+
exit()
40+
count+=1
41+
42+
43+
file.close()
44+
45+
print("GREAT: Knows graph is correct!")
46+
47+
48+

test/validatePairUniqueness.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
2+
import sys, os
3+
4+
5+
if( len(sys.argv) < 4):
6+
print("Validates that a pair of columns never appear repeated.")
7+
print("Usage: validateIdUniqueness <coulmn1> <column2> <file0> <file1> ... <filen>")
8+
9+
column1=int(sys.argv[1])
10+
column2=int(sys.argv[2])
11+
12+
ids = {}
13+
14+
for i in range(3,len(sys.argv)):
15+
print("Reading "+sys.argv[i])
16+
inputFile = open(sys.argv[i],'r')
17+
index = 0
18+
for line in inputFile.readlines():
19+
if index > 0:
20+
firstId = int((line.split('|'))[column1])
21+
secondId = int((line.split('|'))[column2])
22+
if firstId not in ids:
23+
ids[firstId] = set([])
24+
s = ids[firstId]
25+
if secondId in s:
26+
print("ERROR, Id pair not unique")
27+
print(str(firstId)+" "+str(secondId))
28+
exit(1)
29+
s.add(secondId)
30+
index+=1
31+
inputFile.close()
32+
print("GREAT! All ids are different.")

test/validateStudyAt.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
2+
import sys, os
3+
import glob
4+
5+
6+
if( len(sys.argv) == 1):
7+
print("Validates the correctness of the studyAt graph.")
8+
print("Usage: validateStudyAt <dir>")
9+
10+
study_at_files = glob.glob(sys.argv[1]+'/person_studyAt_organization_?_?.csv')
11+
organization_files = glob.glob(sys.argv[1]+'/organisation_?_?.csv')
12+
update_stream_files = glob.glob(sys.argv[1]+'/updateStream_?_?_person.csv')
13+
14+
universities = set()
15+
16+
for filename in organization_files:
17+
file = open(filename,"r")
18+
print("reading "+filename)
19+
count = 0
20+
for line in file.readlines():
21+
if count != 0:
22+
fields = line.split('|')
23+
if fields[1] == "university":
24+
universities.add(int(fields[0]))
25+
count+=1
26+
file.close()
27+
28+
print("Number of universities read "+str(len(universities)))
29+
30+
31+
for filename in study_at_files:
32+
file = open(filename,"r")
33+
print("reading "+filename)
34+
count = 0
35+
for line in file.readlines():
36+
if count != 0:
37+
fields = line.split('|')
38+
if (int(fields[1]) not in universities):
39+
print("ERROR: missing university "+fields[1])
40+
exit()
41+
count+=1
42+
43+
44+
file.close()
45+
46+
for filename in update_stream_files:
47+
file = open(filename,"r")
48+
print("reading "+filename)
49+
count = 0
50+
for line in file.readlines():
51+
study_ats = line.split('|')[15]
52+
if( study_ats != ''):
53+
for study_at in study_ats.split(';'):
54+
uni_id = study_at.split(',')[0]
55+
if (int(uni_id) not in universities):
56+
print("ERROR: missing university "+uni_id)
57+
print(line)
58+
exit()
59+
file.close()
60+
61+
print("GREAT: studyAt graph is correct!")
62+
63+
64+

test/validateUpdateStream.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
2+
import sys, os
3+
4+
5+
if( len(sys.argv) == 1):
6+
print("Validates the correcness of an update stream regarding the dates of the events.")
7+
print("Usage: validateIdUniqueness <file>")
8+
9+
fileName = sys.argv[1]
10+
11+
file = open(fileName,"r")
12+
13+
previous_entry = -1
14+
for line in file.readlines():
15+
fields = line.split("|")
16+
if previous_entry > int(fields[0]):
17+
print("ERROR: date is smaller than previous one")
18+
exit()
19+
if (int(fields[1])+10000) > int(fields[0]):
20+
print("ERROR: dependant event is later than the current one")
21+
print(line)
22+
exit()
23+
previous_entry = int(fields[0])
24+
25+
print("GREAT! UPDATE STREAM "+sys.argv[1]+" SEEMS CORRECT")
26+
27+
file.close()

0 commit comments

Comments
 (0)