Skip to content

Commit 4c9b240

Browse files
committed
generate analysis
1 parent 20ae8b5 commit 4c9b240

File tree

8 files changed

+147
-20
lines changed

8 files changed

+147
-20
lines changed

docona/analyseresults.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#!/usr/bin/env python
# # Document Content and Citation Analysis (DoConA)
# # Module: analyses "results.csv" raw output from pipeline and computes the similarity overlap between the citation network of the documents and the document texts
# # Output: generates "analysis.csv" file with summary of the results
# GNU AGPLv3 - https://choosealicense.com/licenses/agpl-3.0/

import pandas as pd
import os

def analyse(inputdir="../inputdata", outputdir="../outputdata"):
    """Summarise the pipeline's raw results into "analysis.csv".

    Reads "results.csv" from *outputdir*, counts per-method overlaps between
    the similarity results and the citation network ("citations.csv" in
    *inputdir*, restricted to the sample documents in "sample.csv"), and
    writes a per-method summary with percentage overlap to
    "<outputdir>/analysis.csv".

    Parameters
    ----------
    inputdir : str
        Directory holding "citations.csv" (columns: source, target) and
        "sample.csv" (headerless, first column = sample document IDs).
        Defaults to the pipeline's conventional "../inputdata".
    outputdir : str
        Directory holding "results.csv" and receiving "analysis.csv".
        Defaults to the pipeline's conventional "../outputdata".
    """
    # Import results and remove duplicate rows
    results_df = pd.read_csv(os.path.join(outputdir, "results.csv"))
    results_df.drop_duplicates(keep='first', inplace=True)

    # Filter for the overlaps and count them per method, sorted (descending)
    # by the number of citation overlaps.
    # NOTE: "== True" is kept deliberately — if the column fails to parse as
    # bool, silently treating truthy strings as overlaps would inflate counts.
    overlaps_df = results_df[results_df["citation_link"] == True]
    overlaps_df = (overlaps_df.groupby(['method'])
                   .citation_link.agg('count')
                   .to_frame('num_cite_and_similar_links')
                   .reset_index()
                   .sort_values('num_cite_and_similar_links', ascending=False))

    # Import full citation network of documents and the sample documents
    citations = pd.read_csv(os.path.join(inputdir, "citations.csv"))
    unique_samples = pd.read_csv(os.path.join(inputdir, "sample.csv"), header=None)

    # Total number of citation links whose source is a sample document.
    # Duplicates are intentionally counted, matching the raw link total.
    sample_ids = unique_samples.iloc[:, 0]
    num_cite_links = int(citations[citations['source'].isin(sample_ids)].shape[0])

    # Aggregate the counts and compute each method's percentage overlap
    # (vectorized — no per-row Python loop). Guard against a sample set
    # with no outgoing citations, which would otherwise divide by zero.
    overlaps_df['num_cite_links'] = num_cite_links
    if num_cite_links > 0:
        overlaps_df['percentage_overlap'] = (
            overlaps_df['num_cite_and_similar_links'] / num_cite_links * 100
        )
    else:
        overlaps_df['percentage_overlap'] = 0.0

    # Write the output to file
    overlaps_df.to_csv(os.path.join(outputdir, "analysis.csv"), index=False)
53+

docona/docona.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,28 +9,30 @@
99
print()
1010
print("1. Preprocessing documents...", end = '', flush=True)
1111
# Preprocessing full texts
12-
from preprocessdocs import preprocess
13-
preprocess()
12+
from preprocessdocs import preprocess, preprocessingdone
13+
if not preprocessingdone():
14+
preprocess()
1415
print("Done!")
1516
print()
1617
print("2. Executing semantic measures")
1718
print("a) Training corpus models...", end = '', flush=True)
1819
# Train Doc2Vec and Word2Vec models on full texts of documents
1920
from trainedmodels import getdoc2vecmodel,getword2vecmodel
2021
doc2vecmodel = getdoc2vecmodel()
21-
word2vecmodel = getword2vecmodel()
22+
# word2vecmodel = getword2vecmodel()
2223
print("Done!")
2324
print("b) Similarity checks: corpus models...", end = '', flush=True)
25+
print()
2426
# # Trained models: check document similarity
2527
from trainedmodelssimilarity import dosimilaritychecks
2628
dosimilaritychecks("doc2vec",doc2vecmodel,"cosine")
27-
dosimilaritychecks("word2vec",word2vecmodel,"wmd")
29+
# dosimilaritychecks("word2vec",word2vecmodel,"wmd")
30+
print()
2831
print("Done!")
2932

30-
# --------------------------------------------- #
31-
# --- ADD CUSTOM PRETRAINED MODEL CODE HERE --- #
32-
# --------------------------------------------- #
33-
33+
# # --------------------------------------------- #
34+
# # --- ADD CUSTOM PRETRAINED MODEL CODE HERE --- #
35+
# # --------------------------------------------- #
3436
# print("c) Adapting GoogleNews pretrained model...", end = '', flush=True)
3537
# # # GoogleNews pretrained load / train
3638
# from pretrainedmodels import getdoc2vecmodel,getword2vecmodel
@@ -53,7 +55,8 @@
5355
# dosimilaritychecks("doc2vec", "law2vec", law2vecdoc2vecmodel, "cosine")
5456
# dosimilaritychecks("word2vec", "law2vec", law2vecword2vecmodel, "wmd")
5557
# print("Done!")
56-
print()
58+
# print()
59+
5760
print("3. Executing syntactic measures")
5861
print("a) Training TFIDF and Ngram models...", end = '', flush=True)
5962
# # TFIDF, Ngram models load / train
@@ -68,4 +71,9 @@
6871
dosyntacticsimilaritychecks("jaccard",model=None)
6972
print("Done!")
7073
print()
74+
print("4. Analysing results")
75+
from analyseresults import analyse
76+
analyse()
77+
print("Done!")
78+
print()
7179
print("-- FINISHED --")

docona/preprocessdocs.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,20 @@
2121
import pandas as pd
2222
from utility import cleantoken, stemSentence
2323

24+
def preprocessingdone():
25+
fulltextpath = "../inputdata/fulltexts/"
26+
processedtextpath = "../inputdata/processedtexts/"
27+
resourcespath = "../inputdata/resources/"
28+
29+
if path.exists(processedtextpath) and path.exists(fulltextpath):
30+
numfulltexts = len([name for name in os.listdir(fulltextpath) if os.path.isfile(os.path.join(fulltextpath, name))])
31+
numprocessedtexts = len([name for name in os.listdir(processedtextpath) if os.path.isfile(os.path.join(processedtextpath, name))])
32+
if numfulltexts == numprocessedtexts:
33+
if os.path.isfile(resourcespath + 'documentID_to_tokenized.pickle') and os.path.isfile(resourcespath + 'data_to_tokenized.pickle') and os.path.isfile(resourcespath + 'datafortraining.pickle') and os.path.isfile(resourcespath + 'documents.pickle') and os.path.isfile(resourcespath + 'documentID_to_data.pickle') and os.path.isfile(resourcespath + 'index_to_documentID.pickle') and os.path.isfile(resourcespath + 'documentID_to_index.pickle') and os.path.isfile(resourcespath + 'word2vec.model') and os.path.isfile(resourcespath + 'doc2vec.model') and os.path.isfile(resourcespath + 'word2vecsimilaritymatrix.pickle'):
34+
return True
35+
36+
return False
37+
2438
def preprocess():
2539
textpath = "../inputdata/fulltexts/"
2640
index_to_documentID = {}

docona/pretrainedmodels.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def getdoc2vecmodel(modelinputfilename,modeloutputfilename):
3131
min_count = 2
3232
sampling_threshold = 1e-5
3333
negative_size = 5
34-
training_epochs = 5
34+
training_epochs = 20
3535
dm = 0
3636
hs = 0
3737
worker_count = 4
@@ -41,6 +41,8 @@ def getdoc2vecmodel(modelinputfilename,modeloutputfilename):
4141
fname = get_tmpfile(os.path.join(os.path.join(os.path.join(os.path.realpath('..'), "inputdata"), "resources"), modeloutputfilename))
4242
# Train doc2vec model
4343
model = g.Doc2Vec(documents, vector_size=vector_size, window=window_size, min_count=min_count, sample=sampling_threshold, workers=worker_count, hs=hs, dm=dm, negative=negative_size, dbow_words=1, dm_concat=1, pretrained_emb=pretrained_emb, epochs=training_epochs)
44+
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
45+
model.init_sims(replace=True)
4446
# Save model
4547
model.save(fname)
4648

docona/pretrainedmodelssimilarity.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import os
1111
import os.path
1212
from os import path
13+
import time
1314
import csv
1415
import pandas as pd
1516
from helper import convert_to_document_references, exists_citation_link_between
@@ -19,27 +20,41 @@
1920
def lookup_similar_documents_docvec_cosine(sample_documents, n, model, pretrained_embedding_name):
2021
results = []
2122

23+
num_samples = len(sample_documents)
24+
count = 1
2225
for item in sample_documents:
26+
start = time.time()
27+
print(str(count) + "/" + str(num_samples) + "...", end='', flush=True)
28+
count = count + 1
2329
similar_documents = model.docvecs.most_similar(documentID_to_index[item], topn=n)
2430
similar_documents_references = convert_to_document_references(similar_documents)
2531
for reference in similar_documents_references:
2632
method = pretrained_embedding_name + "_doc2vec_wmd"
2733
results.append([item,reference[0].replace(".txt",""),reference[1],method,exists_citation_link_between(item,reference[0])])
28-
34+
end = time.time()
35+
timetaken = end-start
36+
print(str(timetaken) + "s")
2937
return results
3038

3139
# ### Look up top n similar documents per sample document (googlenews word2vec embeddings + word mover's distance)
3240
def lookup_similar_documents_word2vec_wmd(sample_documents, pretrained_embedding_name):
3341
results = []
3442

43+
num_samples = len(sample_documents)
44+
count = 1
3545
for item in sample_documents:
46+
start = time.time()
47+
print(str(count) + "/" + str(num_samples) + "...", end='', flush=True)
48+
count = count + 1
3649
similar_documents = sim[documentID_to_data[item]]
3750
similar_documents_references = convert_to_document_references(similar_documents)
3851
for reference in similar_documents_references:
3952
method = pretrained_embedding_name + "_word2vec_wmd"
4053
if (str(item) != str(reference[0])):
4154
results.append([item,reference[0],reference[1],method,exists_citation_link_between(item,reference[0])])
42-
55+
end = time.time()
56+
timetaken = end-start
57+
print(str(timetaken) + "s")
4358
return results
4459

4560
# ### Main function

docona/syntacticmodels.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import os.path
1414
from os import path
1515
import csv
16+
import time
1617
import math
1718
import operator
1819

@@ -57,20 +58,31 @@ def find_similar(tfidf_matrix, index, top_n):
5758

5859
def lookup_similar_documents_tfidf_based(sample_documents, n, model,methodname):
5960
results = []
61+
num_samples = len(sample_documents)
62+
count = 1
6063
for item in sample_documents:
64+
start = time.time()
65+
print(str(count) + "/" + str(num_samples) + "...", end='', flush=True)
66+
count = count + 1
6167
index = documentID_to_index[item] # Look up this documents index in the TFIDF matrix
6268
similar_documents = find_similar(model, index, n) # Look up top n similar documents for this document
6369
similar_documents_references = convert_to_document_references(similar_documents)
6470
for reference in similar_documents_references:
6571
results.append([item,reference[0],reference[1],methodname,exists_citation_link_between(item,reference[0])])
66-
72+
end = time.time()
73+
timetaken = end-start
74+
print(str(timetaken) + "s")
6775
return results
6876

6977
def lookup_similar_documents_jaccard(sample_documents, n, methodname):
7078
results = []
7179

72-
num = len(sample_documents)
80+
num_samples = len(sample_documents)
81+
count = 1
7382
for item in sample_documents:
83+
start = time.time()
84+
print(str(count) + "/" + str(num_samples) + "...", end='', flush=True)
85+
count = count + 1
7486
current_dict = {}
7587
for k,v in documentID_to_data.items():
7688
if k != item:
@@ -80,7 +92,9 @@ def lookup_similar_documents_jaccard(sample_documents, n, methodname):
8092
topn = sorted_dict[-n:]
8193
for reference in topn:
8294
results.append([item,reference[0],reference[1], methodname, exists_citation_link_between(item,reference[0])])
83-
95+
end = time.time()
96+
timetaken = end-start
97+
print(str(timetaken) + "s")
8498
return results
8599

86100
# ### Main function

docona/trainedmodels.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,10 @@ def getdoc2vecmodel():
4040
if not model_exists:
4141
# No existing model
4242
documents = pickle.load( open( "../inputdata/resources/documents.pickle", "rb" ) )
43-
model = Doc2Vec(vector_size=32, min_count=2, epochs=5)
43+
model = Doc2Vec(vector_size=256, min_count=2, epochs=30)
4444
model.build_vocab(documents)
4545
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
46+
model.init_sims(replace=True)
4647
model_file = get_tmpfile(os.path.join(os.path.join(os.path.join(os.path.realpath('..'), "inputdata"), "resources"), "doc2vec.model"))
4748
model.save(model_file)
4849

@@ -66,8 +67,8 @@ def getword2vecmodel():
6667
texts = []
6768
for doc in documents:
6869
texts.append(doc.words)
69-
model = Word2Vec(texts, size=32, window=7, min_count=2, workers=2)
70-
model.train(texts, total_examples=model.corpus_count,epochs=5)
70+
model = Word2Vec(texts, size=256, window=5, min_count=2, workers=4)
71+
model.train(texts, total_examples=model.corpus_count,epochs=30)
7172
model.init_sims(replace=True)
7273
model_file = get_tmpfile(os.path.join(os.path.join(os.path.join(os.path.realpath('..'), "inputdata"), "resources"), "word2vec.model"))
7374
model.save(model_file)

docona/trainedmodelssimilarity.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,36 +24,56 @@
2424
def lookup_similar_documents_docvec_cosine(sample_documents, n, model):
2525
results = []
2626

27+
num_samples = len(sample_documents)
28+
count = 1
2729
for item in sample_documents:
30+
start = time.time()
31+
print(str(count) + "/" + str(num_samples) + "...", end='', flush=True)
32+
count = count + 1
2833
similar_documents = model.docvecs.most_similar(documentID_to_index[item], topn=n)
2934
similar_documents_references = convert_to_document_references(similar_documents)
3035
for reference in similar_documents_references:
3136
method = "doc2vec_cosine"
3237
results.append([item,reference[0].replace(".txt",""),reference[1],method,exists_citation_link_between(item,reference[0])])
33-
38+
end = time.time()
39+
timetaken = end-start
40+
print(str(timetaken) + "s")
3441
return results
3542

3643
# ### Look up top n similar documents per sample document (word2vec embeddings + word mover's distance)
3744
def lookup_similar_documents_word2vec_wmd(sample_documents):
3845
results = []
3946

47+
num_samples = len(sample_documents)
48+
count = 1
4049
for item in sample_documents:
50+
start = time.time()
51+
print(str(count) + "/" + str(num_samples) + "...", end='', flush=True)
52+
count = count + 1
4153
similar_documents = sim[documentID_to_data[item]]
4254
similar_documents_references = convert_to_document_references(similar_documents)
4355
for reference in similar_documents_references:
4456
method = "word2vec_wmd"
4557
if (str(item) != str(reference[0])):
4658
results.append([item,reference[0],reference[1],method,exists_citation_link_between(item,reference[0])])
47-
59+
end = time.time()
60+
timetaken = end-start
61+
print(str(timetaken) + "s")
4862
return results
4963

5064
# ### Main function
5165
def dosimilaritychecks(modeltype,model,distancemeasure):
5266
results = []
5367
if (modeltype == "doc2vec" and distancemeasure == "cosine"):
68+
print("doc2vec + cosine distance")
69+
print()
5470
results = lookup_similar_documents_docvec_cosine(sampledocuments,20, model)
71+
print()
5572
elif (modeltype == "word2vec" and distancemeasure == "wmd"):
73+
print("word2vec + word movers distance")
74+
print()
5675
results = lookup_similar_documents_word2vec_wmd(sampledocuments)
76+
print()
5777
if os.path.exists('../outputdata/results.csv') == False:
5878
results.insert(0,['source_document','similar_document','similarity_score','method','citation_link'])
5979

0 commit comments

Comments
 (0)