Skip to content

Commit 15beba4

Browse files
committed
nothing much
2 parents 8790428 + d07c5b7 commit 15beba4

11 files changed

+363
-1
lines changed

run.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ export HADOOP_HEAPSIZE=6144
1818
export HADOOP_CLIENT_OPTS="-Xmx6g $HADOOP_CLIENT_OPTS"
1919

2020
mvn clean
21-
mvn assembly:assembly
21+
mvn -DskipTests assembly:assembly
2222

2323
cp $LDBC_SNB_DATAGEN_HOME/target/ldbc_snb_datagen.jar $LDBC_SNB_DATAGEN_HOME/
2424
rm $LDBC_SNB_DATAGEN_HOME/target/ldbc_snb_datagen.jar

test.sh

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
#!/bin/bash
# Builds the datagen jar, generates the test dataset and runs every
# validation script against the generated files.

DEFAULT_HADOOP_HOME=/home/user/hadoop-2.6.0.stand #change to your hadoop folder
DEFAULT_LDBC_SNB_DATAGEN_HOME=/home/user/ldbc_snb_datagen_0.2 #change to your ldbc_socialnet_dbgen folder
PARAM_GENERATION=1 #param generation

# allow overriding configuration from outside via environment variables
# i.e. you can do
# HADOOP_HOME=/foo/bar LDBC_SNB_DATAGEN_HOME=/baz/quux ./test.sh
# instead of changing the contents of this file
HADOOP_HOME=${HADOOP_HOME:-$DEFAULT_HADOOP_HOME}
LDBC_SNB_DATAGEN_HOME=${LDBC_SNB_DATAGEN_HOME:-$DEFAULT_LDBC_SNB_DATAGEN_HOME}

export HADOOP_HOME
export LDBC_SNB_DATAGEN_HOME

mvn clean
mvn -DskipTests assembly:assembly

cp "$LDBC_SNB_DATAGEN_HOME/target/ldbc_snb_datagen.jar" "$LDBC_SNB_DATAGEN_HOME/"

# run the generator through hadoop's RunJar with the test parameter file
java -cp "$LDBC_SNB_DATAGEN_HOME/ldbc_snb_datagen.jar" org.apache.hadoop.util.RunJar "$LDBC_SNB_DATAGEN_HOME/ldbc_snb_datagen.jar" "$LDBC_SNB_DATAGEN_HOME/test_params.ini"


### TEST SCRIPTS ####

ENTITIES="person post comment forum place tag tagclass organisation "
RELATIONS="person_knows_person organisation_isLocatedIn_place place_isPartOf_place tagclass_isSubclassOf_tagclass tag_hasType_tagclass person_studyAt_organisation person_hasInterest_tag person_workAt_organisation person_isLocatedIn_place forum_hasTag_tag forum_hasModerator_person forum_hasMember_person forum_containerOf_post comment_hasCreator_person comment_hasTag_tag comment_isLocatedIn_place comment_replyOf_comment comment_replyOf_post post_hasCreator_person post_hasTag_tag post_isLocatedIn_place person_likes_comment person_likes_post"
ATTRIBUTES="person_speaks_language person_email_emailaddress " # NOTE(review): currently unused

FILES="$ENTITIES $RELATIONS " # NOTE(review): currently unused

# entity files must have globally unique ids in their first column
for file in $ENTITIES
do
  echo "TESTING FILE: $file"
  python2 ./test/validateIdUniqueness.py ./test_data/social_network/${file}_?_?.csv
done

# relation files must not contain a repeated (column 0, column 1) pair
for file in $RELATIONS
do
  echo "TESTING FILE: $file"
  python2 ./test/validatePairUniqueness.py 0 1 ./test_data/social_network/${file}_?_?.csv
done

echo "TESTING KNOWS SUBGRAPH INTEGRITY"
python2 ./test/validateKnowsGraph.py ./test_data/social_network

echo "TESTING STUDYAT SUBGRAPH INTEGRITY"
python2 ./test/validateStudyAt.py ./test_data/social_network/

echo "TESTING UPDATE STREAMS"
# glob directly instead of parsing `ls` output
for file in ./test_data/social_network/updateStream_*
do
  python2 ./test/validateUpdateStream.py "$file"
done

test/compareDatasets.sh

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# Compares two generated datasets file by file: strips the header of every
# csv chunk, sorts the remaining rows (so chunking and row order do not
# matter) and compares md5 checksums.

if [ $# -ne 2 ]
then
  echo "Arguments not correctly supplied"
  # the old usage text described four parameters and the wrong script
  # name; the script accepts and uses exactly two directories
  echo "Usage: sh compareDatasets.sh <dir1> <dir2>"
  exit 1
fi

FILES="person person_email_emailaddress person_studyAt_organisation person_hasInterest_tag person_workAt_organisation person_isLocatedIn_place person_knows_person person_speaks_language tag tagclass_isSubclassOf_tagclass tag_hasType_tagclass tagclass place place_isPartOf_place organisation organisation_isLocatedIn_place"
FILES2="post comment forum forum_hasTag_tag forum_hasModerator_person forum_hasMember_person forum_containerOf_post comment_hasCreator_person comment_hasTag_tag comment_isLocatedIn_place comment_replyOf_comment comment_replyOf_post post_hasCreator_person post_hasTag_tag post_isLocatedIn_place person_likes_comment person_likes_post"

FILES="$FILES $FILES2"

DIR_1=$1
DIR_2=$2

for file in $FILES
do
  echo "CHECKING FILE $file"
  # drop the header line of every chunk, then sort for a canonical order
  # (no useless $( ) wrapper: these commands only redirect, they print nothing)
  tail -q -n +2 "$DIR_1"/${file}_?_?.csv | sort > .auxFile1
  tail -q -n +2 "$DIR_2"/${file}_?_?.csv | sort > .auxFile2

  # computing checksums
  a=$(md5sum .auxFile1 | awk '{print $1}')
  b=$(md5sum .auxFile2 | awk '{print $1}')

  if [ "$a" == "$b" ]
  then
    echo ${file} are equal
    echo ${a}
    echo ${b}
  else
    echo ERROR!!!!! ${file} are different
    echo ${a}
    echo ${b}
    # clean up the scratch files on the failure path too, and signal failure
    rm -f .auxFile1 .auxFile2
    exit 1
  fi
  echo "---------------------"
done
echo GREAT!!!!! the two datasets are the same!
rm -f .auxFile1
rm -f .auxFile2

test/testDataset.sh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
#!/bin/bash
# Validates the knows graph of a single generated dataset directory.

if [ $# -ne 1 ]
then
  echo "Arguments not correctly supplied"
  echo "Usage: sh testDataset <dir>"
  exit 1
fi

# quote the directory so paths with spaces survive word-splitting
python2 ./validateKnowsGraph.py "$1"

test/testDatasets.sh

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
#!/bin/bash
# Compares two generated datasets file by file: strips the header of every
# csv chunk, sorts the remaining rows (so chunking and row order do not
# matter) and compares md5 checksums.

if [ $# -ne 2 ]
then
  echo "Arguments not correctly supplied"
  # the old usage text described four parameters; the script accepts and
  # uses exactly two directories
  echo "Usage: sh testDatasets <dir1> <dir2>"
  exit 1
fi

FILES="person person_email_emailaddress person_studyAt_organisation person_hasInterest_tag person_workAt_organisation person_isLocatedIn_place person_knows_person person_speaks_language tag tagclass_isSubclassOf_tagclass tag_hasType_tagclass tagclass place place_isPartOf_place organisation organisation_isLocatedIn_place"

FILES2="post comment forum forum_hasTag_tag forum_hasModerator_person forum_hasMember_person forum_containerOf_post comment_hasCreator_person comment_hasTag_tag comment_isLocatedIn_place comment_replyOf_comment comment_replyOf_post post_hasCreator_person post_hasTag_tag post_isLocatedIn_place person_likes_comment person_likes_post"

FILES="$FILES $FILES2"

DIR_1=$1
DIR_2=$2

for file in $FILES
do
  echo "CHECKING FILE $file"
  # drop the header line of every chunk, then sort for a canonical order
  # (no useless $( ) wrapper: these commands only redirect, they print nothing)
  tail -q -n +2 "$DIR_1"/${file}_?_?.csv | sort > .auxFile1
  tail -q -n +2 "$DIR_2"/${file}_?_?.csv | sort > .auxFile2

  # computing checksums
  a=$(md5sum .auxFile1 | awk '{print $1}')
  b=$(md5sum .auxFile2 | awk '{print $1}')

  if [ "$a" == "$b" ]
  then
    echo ${file} are equal
    echo ${a}
    echo ${b}
  else
    echo ERROR!!!!! ${file} are different
    echo ${a}
    echo ${b}
    # clean up the scratch files on the failure path too, and signal failure
    rm -f .auxFile1 .auxFile2
    exit 1
  fi
  echo "---------------------"
done
echo GREAT!!!!! the two datasets are the same!
rm -f .auxFile1
rm -f .auxFile2

test/validateIdUniqueness.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
2+
import sys, os
3+
from sets import Set
4+
5+
6+
if( len(sys.argv) == 1):
7+
print("Validates if the ids in the union of the first column of the input files are unique.")
8+
print("Usage: validateIdUniqueness <file0> <file1> ... <filen>")
9+
10+
ids = Set()
11+
for i in range(1,len(sys.argv)):
12+
print("Reading "+sys.argv[i])
13+
inputFile = open(sys.argv[i],'r')
14+
index = 0
15+
for line in inputFile.readlines():
16+
if index > 0:
17+
newId = int((line.split('|'))[0])
18+
if newId in ids:
19+
print("ERROR: Id "+str(newId)+" already exists")
20+
print("Line "+str(index+1)+" of "+sys.argv[i])
21+
exit()
22+
ids.add(newId)
23+
index+=1
24+
inputFile.close()
25+
print("GREAT! All ids are different.")

test/validateKnowsGraph.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
2+
import sys, os
3+
import glob
4+
5+
6+
if( len(sys.argv) == 1):
7+
print("Validates the correcness of the knows graph.")
8+
print("Usage: validateKnowsGraph <dir>")
9+
10+
person_files = glob.glob(sys.argv[1]+'/person_?_?.csv')
11+
knows_files = glob.glob(sys.argv[1]+'/person_knows_person_?_?.csv')
12+
13+
persons = set()
14+
15+
for filename in person_files:
16+
file = open(filename,"r")
17+
print("reading "+filename)
18+
count = 0
19+
for line in file.readlines():
20+
if count != 0:
21+
fields = line.split('|')
22+
persons.add(int(fields[0]))
23+
count+=1
24+
file.close()
25+
26+
27+
for filename in knows_files:
28+
file = open(filename,"r")
29+
print("reading "+filename)
30+
count = 0
31+
for line in file.readlines():
32+
if count != 0:
33+
fields = line.split('|')
34+
if (int(fields[0]) not in persons):
35+
print("ERROR: missing person "+fields[0])
36+
exit()
37+
if (int(fields[1]) not in persons):
38+
print("ERROR: missing person "+fields[1])
39+
exit()
40+
count+=1
41+
42+
43+
file.close()
44+
45+
print("GREAT: Knows graph is correct!")
46+
47+
48+

test/validatePairUniqueness.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
2+
import sys, os
3+
4+
5+
if( len(sys.argv) < 4):
6+
print("Validates that a pair of columns never appear repeated.")
7+
print("Usage: validateIdUniqueness <coulmn1> <column2> <file0> <file1> ... <filen>")
8+
9+
column1=int(sys.argv[1])
10+
column2=int(sys.argv[2])
11+
12+
ids = {}
13+
14+
for i in range(3,len(sys.argv)):
15+
print("Reading "+sys.argv[i])
16+
inputFile = open(sys.argv[i],'r')
17+
index = 0
18+
for line in inputFile.readlines():
19+
if index > 0:
20+
firstId = int((line.split('|'))[column1])
21+
secondId = int((line.split('|'))[column2])
22+
if firstId not in ids:
23+
ids[firstId] = set([])
24+
s = ids[firstId]
25+
if secondId in s:
26+
print("ERROR, Id pair not unique")
27+
print(str(firstId)+" "+str(secondId))
28+
exit(1)
29+
s.add(secondId)
30+
index+=1
31+
inputFile.close()
32+
print("GREAT! All ids are different.")

test/validateStudyAt.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
2+
import sys, os
3+
import glob
4+
5+
6+
if( len(sys.argv) == 1):
7+
print("Validates the correctness of the studyAt graph.")
8+
print("Usage: validateStudyAt <dir>")
9+
10+
study_at_files = glob.glob(sys.argv[1]+'/person_studyAt_organization_?_?.csv')
11+
organization_files = glob.glob(sys.argv[1]+'/organisation_?_?.csv')
12+
update_stream_files = glob.glob(sys.argv[1]+'/updateStream_?_?_person.csv')
13+
14+
universities = set()
15+
16+
for filename in organization_files:
17+
file = open(filename,"r")
18+
print("reading "+filename)
19+
count = 0
20+
for line in file.readlines():
21+
if count != 0:
22+
fields = line.split('|')
23+
if fields[1] == "university":
24+
universities.add(int(fields[0]))
25+
count+=1
26+
file.close()
27+
28+
print("Number of universities read "+str(len(universities)))
29+
30+
31+
for filename in study_at_files:
32+
file = open(filename,"r")
33+
print("reading "+filename)
34+
count = 0
35+
for line in file.readlines():
36+
if count != 0:
37+
fields = line.split('|')
38+
if (int(fields[1]) not in universities):
39+
print("ERROR: missing university "+fields[1])
40+
exit()
41+
count+=1
42+
43+
44+
file.close()
45+
46+
for filename in update_stream_files:
47+
file = open(filename,"r")
48+
print("reading "+filename)
49+
count = 0
50+
for line in file.readlines():
51+
study_ats = line.split('|')[15]
52+
if( study_ats != ''):
53+
for study_at in study_ats.split(';'):
54+
uni_id = study_at.split(',')[0]
55+
if (int(uni_id) not in universities):
56+
print("ERROR: missing university "+uni_id)
57+
print(line)
58+
exit()
59+
file.close()
60+
61+
print("GREAT: studyAt graph is correct!")
62+
63+
64+

test/validateUpdateStream.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
2+
import sys, os
3+
4+
5+
if( len(sys.argv) == 1):
6+
print("Validates the correcness of an update stream regarding the dates of the events.")
7+
print("Usage: validateIdUniqueness <file>")
8+
9+
fileName = sys.argv[1]
10+
11+
file = open(fileName,"r")
12+
13+
previous_entry = -1
14+
for line in file.readlines():
15+
fields = line.split("|")
16+
if previous_entry > int(fields[0]):
17+
print("ERROR: date is smaller than previous one")
18+
exit()
19+
if (int(fields[1])+10000) > int(fields[0]):
20+
print("ERROR: dependant event is later than the current one")
21+
print(line)
22+
exit()
23+
previous_entry = int(fields[0])
24+
25+
print("GREAT! UPDATE STREAM "+sys.argv[1]+" SEEMS CORRECT")
26+
27+
file.close()

0 commit comments

Comments
 (0)