mynlp
diff --git a/‎en/emnlp2017exp.sh‎
Lines changed: 179 additions & 0 deletions b/‎en/emnlp2017exp.sh‎
Lines changed: 179 additions & 0 deletions
diff --git a/‎en/emnlp2017exp_msr.sh‎
Lines changed: 150 additions & 0 deletions b/‎en/emnlp2017exp_msr.sh‎
Lines changed: 150 additions & 0 deletions
diff --git a/‎en/separate_msr.py‎
Lines changed: 37 additions & 0 deletions b/‎en/separate_msr.py‎
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,179 @@
+#!/bin/bash
+# for text similarity task
+
+word2vec=$4
+if [ "$word2vec" == "word2vec" ]; then
+  ./word2vec.sh
+fi
+# Working directories: problem texts, gold answers and system outputs.
+plain_dir="plain"
+plain_dir2="plain2"
+results_dir="results"
+
+# SICK corpus file (only referenced by the commented-out extraction step).
+sick="en/SICK.semeval.txt"
+
+# $1: how many processes to run in parallel (default: 3). Keep it below the
+#     number of cores in your machine.
+cores="${1:-3}"
+# $2: semantic templates file, handed to the per-problem pipeline.
+#     The data comes in three splits:
+#       train (4439 problems),
+#       test (4906 problems),
+#       trial (495 problems).
+templates="$2"
+
+# Usage: 
+#
+# ./en/emnlp2017exp.sh 3 en/semantic_templates_en_event_sts.yaml (word2vec)
+#
+
+# Extract training and test data from SICK dataset, removing the header line.
+# Make sure the directory holding the problem files exists.
+[ -d "${plain_dir}" ] || mkdir -p "${plain_dir}"
+
+#echo "Extracting problems from the SICK file."
+#tail -n +2 $sick | \
+#tr -d '\r' | \
+#awk -F'\t' \
+#  '{pair_id=$1;
+#    sub(/\.$/,"",$2);
+#    sub(/\.$/,"",$3);
+#    premise=$2;
+#    conclusion=$3;
+#    if($4 == "CONTRADICTION"){
+#      judgement="no";
+#    } else if ($4 == "ENTAILMENT") {
+#      judgement="yes";
+#    } else if ($4 == "NEUTRAL") {
+#      judgement="unknown";
+#    }
+#    set=$12;
+#    printf "%s.\n%s.\n", premise, conclusion > "en_plain/sick_"tolower(set)"_"pair_id".txt";
+#    printf "%s\n", judgement > "en_plain/sick_"tolower(set)"_"pair_id".answer";
+#   }'
+
+# Create files that list all filenames of training, testing and trial.
+for dset in {train,test,trial}; do
+  ls -v ${plain_dir}/sick_${dset}_*.txt > ${plain_dir}/sick_${dset}.files
+done
+# Split filename entries into several files, for parallel processing:
+ntrain=`cat ${plain_dir}/sick_train.files | wc -l`
+ntest=`cat ${plain_dir}/sick_test.files | wc -l`
+ntrial=`cat ${plain_dir}/sick_trial.files | wc -l`
+train_lines_per_split=`python -c "from math import ceil; print(int(ceil(float(${ntrain})/${cores})))"`
+test_lines_per_split=`python -c "from math import ceil; print(int(ceil(float(${ntest})/${cores})))"`
+trial_lines_per_split=`python -c "from math import ceil; print(int(ceil(float(${ntrial})/${cores})))"`
+
+rm ${plain_dir}/sick_{train,test,trial}.files_??
+split -l $train_lines_per_split ${plain_dir}/sick_train.files ${plain_dir}/sick_train.files_
+split -l $test_lines_per_split ${plain_dir}/sick_test.files ${plain_dir}/sick_test.files_
+split -l $trial_lines_per_split ${plain_dir}/sick_trial.files ${plain_dir}/sick_trial.files_
+
+# Copy the SICK Coq library to ./coqlib.v and compile it, then copy the SICK
+# tactics file to ./tactics_coq.txt.
+cp "en/coqlib_sick.v" "coqlib.v"
+coqc "coqlib.v"
+cp "en/tactics_coq_sick.txt" "tactics_coq.txt"
+
+# For every split: run the pipeline on all problems in parallel, print a
+# tab-separated gold/system listing, and write an HTML report to
+# ${results_dir}/main_<split>.html.
+for dataset in train test trial; do
+  # Each chunk file is processed sequentially inside its own background
+  # subshell, so the number of chunks is the degree of parallelism.
+  for ff in "${plain_dir}"/sick_"${dataset}".files_??; do
+    [ -e "${ff}" ] || continue  # unmatched glob: no chunks for this split
+    # $templates and $word2vec stay unquoted on purpose: an empty value must
+    # vanish instead of being passed along as an empty argument.
+    for f in $(cat "${ff}"); do
+      ./en/similarity_en_mp_any.sh $f $templates $word2vec
+    done &
+  done
+
+  # Wait for the parallel processes to finish.
+  wait
+
+  # Print "gold file<TAB>gold answer<TAB>system answer" for every problem;
+  # a problem without a result file is reported as "unknown".
+  for f in ./"${plain_dir2}"/sick_"${dataset}"_*.answer; do
+    [ -e "$f" ] || continue
+    base_filename=${f##*/}
+    sys_filename=./${results_dir}/${base_filename}
+    gold_answer=$(head -1 "$f")
+    if [ -e "${sys_filename}" ]; then
+      sys_answer=$(head -1 "${sys_filename}")
+    else
+      sys_answer="unknown"
+    fi
+    printf '%s\t%s\t%s\n' "$f" "$gold_answer" "$sys_answer"
+  done
+
+  # HTML report: header, one table row per problem, average proving time.
+  echo "Evaluating."
+  report=${results_dir}/main_${dataset}.html
+  echo "<!doctype html>
+  <html lang='en'>
+  <head>
+    <meta charset='UTF-8'>
+    <title>Evaluation results of ${templates}</title>
+    <style>
+      body {
+        font-size: 1.5em;
+      }
+    </style>
+  </head>
+  <body>
+  <table border='1'>
+  <tr>
+    <td>sick problem</td>
+    <td>gold answer</td>
+    <td>system answer</td>
+    <td>proving time</td>
+  </tr>" > "${report}"
+  # All counters are reset per split; total_number in particular must not
+  # carry over, or the average of later splits is divided by too many problems.
+  total_number=0
+  total_observations=0
+  correct_recognitions=0
+  attempts=0
+  total_proving_time=0
+  red_color="rgb(255,0,0)"
+  green_color="rgb(0,255,0)"
+  white_color="rgb(255,255,255)"
+  gray_color="rgb(136,136,136)"
+  # ls -v keeps the rows in natural order (problem 2 before problem 10).
+  for gold_filename in $(ls -v ${plain_dir2}/sick_${dataset}_*.answer); do
+    base_filename=${gold_filename##*/}  # filename without the directory path
+    system_filename=${results_dir}/${base_filename/.txt/.answer}
+    time_filename=${results_dir}/${base_filename/.answer/.time}
+    gold_answer=$(cat "$gold_filename")
+    # A problem without result files is shown as "unknown" with zero proving
+    # time, so that bc never receives an empty operand.
+    if [ -r "$system_filename" ]; then
+      system_answer=$(cat "$system_filename")
+    else
+      system_answer="unknown"
+    fi
+    proving_time=0
+    if [ -s "$time_filename" ]; then
+      proving_time=$(cat "$time_filename")
+    fi
+    total_proving_time=$(echo "$total_proving_time + $proving_time" | bc -l)
+    total_number=$((total_number + 1))
+    # White: gold answer is neither yes nor no. Green/red: system answer
+    # matches/differs. Gray overrides red when the system gave no yes/no.
+    color=$white_color
+    if [ "$gold_answer" == "yes" ] || [ "$gold_answer" == "no" ]; then
+      total_observations=$((total_observations + 1))
+      if [ "$gold_answer" == "$system_answer" ]; then
+        correct_recognitions=$((correct_recognitions + 1))
+        color=$green_color
+      else
+        color=$red_color
+      fi
+      if [ "$system_answer" == "yes" ] || [ "$system_answer" == "no" ]; then
+        attempts=$((attempts + 1))
+      else
+        color=$gray_color
+      fi
+    fi
+    echo "
+  <tr>
+    <td><a style=\"background-color:${color};\" href=\"${base_filename/.answer/.html}\">${base_filename/.answer/}</a></td>
+    <td>${gold_answer}</td>
+    <td>${system_answer}</td>
+    <td>${proving_time}s</td>
+  </tr>" >> "${report}"
+  done
+  # Avoid a division by zero when the split has no problems at all.
+  if [ "${total_number}" -gt 0 ]; then
+    average_proving_time=$(echo "scale=2; $total_proving_time / $total_number" | bc -l)
+  else
+    average_proving_time=0
+  fi
+  echo "
+  </table>
+  <h4><font color='red'>Average proving time: ${average_proving_time} </font></h4>
+  </body>
+  </html>
+  " >> "${report}"
+done
+
+if [ "$word2vec" == "word2vec" ]; then
+  processid=$(ps ax|grep "word2vec-api.py"|grep -v grep|awk '{print $1}')
+  kill $processid
+fi
+
+python scripts/randomforest_all.py
+
@@ -0,0 +1,150 @@
+#!/bin/bash
+# for text similarity task
+# for MSR-video(SemEval-2012) dataset
+
+# Optional trailing argument: pass the literal "word2vec" to run ./word2vec.sh
+# before the experiment. The usage line of this script documents it as the
+# third argument while the value used to be read from the fourth only, so both
+# positions are accepted ($4 wins when it is given).
+word2vec=${4:-$3}
+if [ "$word2vec" == "word2vec" ]; then
+  ./word2vec.sh
+fi
+
+#make directory
+plain_dir=plain
+plain_dir2=plain2
+results_dir=results
+if [ ! -d ${plain_dir} ]; then
+  mkdir -p ${plain_dir}
+fi
+if [ ! -d ${plain2_dir} ]; then
+  mkdir -p ${plain2_dir}
+fi
+
+#download MSR-video dataset
+#wget https://www.cs.york.ac.uk/semeval-2012/task6/data/uploads/datasets/train.tgz
+#wget https://www.cs.york.ac.uk/semeval-2012/task6/data/uploads/datasets/test-gold.tgz
+#tar xvfz train.tgz
+#tar xvfz test-gold.tgz
+#python en/separate_msr.py
+
+# $1: how many processes to run in parallel (default: 3). Keep it below the
+#     number of cores in your machine.
+cores="${1:-3}"
+# $2: semantic templates file, handed to the per-problem pipeline.
+#     The data comes in two splits:
+#       train (750 problems),
+#       test (750 problems).
+templates="$2"
+
+# Usage: 
+#
+# ./en/emnlp2017exp_msr.sh 3 en/semantic_templates_en_event_sts.yaml (word2vec)
+#
+
+# Copy the SICK Coq library to ./coqlib.v and compile it, then copy the SICK
+# tactics file to ./tactics_coq.txt.
+cp "en/coqlib_sick.v" "coqlib.v"
+coqc "coqlib.v"
+cp "en/tactics_coq_sick.txt" "tactics_coq.txt"
+
+# For every split: build the per-process work lists, run the pipeline on all
+# problems in parallel, print a tab-separated gold/system listing, and write an
+# HTML report to ${results_dir}/main_<split>.html.
+for dataset in train test; do
+  # List the problem files of this split and cut the list into at most
+  # ${cores} chunks. Nothing else in this script creates the .files_?? chunks
+  # consumed below, so without this step the run depended on leftovers.
+  list_file=${plain_dir}/sick_${dataset}.files
+  # ls -v sorts naturally, so sick_train_2.txt comes before sick_train_10.txt.
+  ls -v "${plain_dir}"/sick_"${dataset}"_*.txt > "${list_file}"
+  rm -f "${list_file}"_??
+  nfiles=$(wc -l < "${list_file}")
+  lines_per_split=$(( (nfiles + cores - 1) / cores ))  # ceil(nfiles / cores)
+  if [ "${lines_per_split}" -gt 0 ]; then
+    split -l "${lines_per_split}" "${list_file}" "${list_file}_"
+  else
+    echo "No ${dataset} problems found in ${plain_dir}; nothing to split." >&2
+  fi
+
+  # Each chunk file is processed sequentially inside its own background
+  # subshell, so the number of chunks is the degree of parallelism.
+  for ff in "${plain_dir}"/sick_"${dataset}".files_??; do
+    [ -e "${ff}" ] || continue  # unmatched glob: no chunks for this split
+    # $templates and $word2vec stay unquoted on purpose: an empty value must
+    # vanish instead of being passed along as an empty argument.
+    for f in $(cat "${ff}"); do
+      ./en/similarity_en_mp_any.sh $f $templates $word2vec
+    done &
+  done
+
+  # Wait for the parallel processes to finish.
+  wait
+
+  # Print "gold file<TAB>gold answer<TAB>system answer" for every problem;
+  # a problem without a result file is reported as "unknown".
+  for f in ./"${plain_dir2}"/sick_"${dataset}"_*.answer; do
+    [ -e "$f" ] || continue
+    base_filename=${f##*/}
+    sys_filename=./${results_dir}/${base_filename}
+    gold_answer=$(head -1 "$f")
+    if [ -e "${sys_filename}" ]; then
+      sys_answer=$(head -1 "${sys_filename}")
+    else
+      sys_answer="unknown"
+    fi
+    printf '%s\t%s\t%s\n' "$f" "$gold_answer" "$sys_answer"
+  done
+
+  # HTML report: header, one table row per problem, average proving time.
+  echo "Evaluating."
+  report=${results_dir}/main_${dataset}.html
+  echo "<!doctype html>
+  <html lang='en'>
+  <head>
+    <meta charset='UTF-8'>
+    <title>Evaluation results of ${templates}</title>
+    <style>
+      body {
+        font-size: 1.5em;
+      }
+    </style>
+  </head>
+  <body>
+  <table border='1'>
+  <tr>
+    <td>sick problem</td>
+    <td>gold answer</td>
+    <td>system answer</td>
+    <td>proving time</td>
+  </tr>" > "${report}"
+  # All counters are reset per split; total_number in particular must not
+  # carry over, or the average of the second split is divided by too many
+  # problems.
+  total_number=0
+  total_observations=0
+  correct_recognitions=0
+  attempts=0
+  total_proving_time=0
+  red_color="rgb(255,0,0)"
+  green_color="rgb(0,255,0)"
+  white_color="rgb(255,255,255)"
+  gray_color="rgb(136,136,136)"
+  # ls -v keeps the rows in natural order (problem 2 before problem 10).
+  for gold_filename in $(ls -v ${plain_dir2}/sick_${dataset}_*.answer); do
+    base_filename=${gold_filename##*/}  # filename without the directory path
+    system_filename=${results_dir}/${base_filename/.txt/.answer}
+    time_filename=${results_dir}/${base_filename/.answer/.time}
+    gold_answer=$(cat "$gold_filename")
+    # A problem without result files is shown as "unknown" with zero proving
+    # time, so that bc never receives an empty operand.
+    if [ -r "$system_filename" ]; then
+      system_answer=$(cat "$system_filename")
+    else
+      system_answer="unknown"
+    fi
+    proving_time=0
+    if [ -s "$time_filename" ]; then
+      proving_time=$(cat "$time_filename")
+    fi
+    total_proving_time=$(echo "$total_proving_time + $proving_time" | bc -l)
+    total_number=$((total_number + 1))
+    # White: gold answer is neither yes nor no. Green/red: system answer
+    # matches/differs. Gray overrides red when the system gave no yes/no.
+    color=$white_color
+    if [ "$gold_answer" == "yes" ] || [ "$gold_answer" == "no" ]; then
+      total_observations=$((total_observations + 1))
+      if [ "$gold_answer" == "$system_answer" ]; then
+        correct_recognitions=$((correct_recognitions + 1))
+        color=$green_color
+      else
+        color=$red_color
+      fi
+      if [ "$system_answer" == "yes" ] || [ "$system_answer" == "no" ]; then
+        attempts=$((attempts + 1))
+      else
+        color=$gray_color
+      fi
+    fi
+    echo "
+  <tr>
+    <td><a style=\"background-color:${color};\" href=\"${base_filename/.answer/.html}\">${base_filename/.answer/}</a></td>
+    <td>${gold_answer}</td>
+    <td>${system_answer}</td>
+    <td>${proving_time}s</td>
+  </tr>" >> "${report}"
+  done
+  # Avoid a division by zero when the split has no problems at all.
+  if [ "${total_number}" -gt 0 ]; then
+    average_proving_time=$(echo "scale=2; $total_proving_time / $total_number" | bc -l)
+  else
+    average_proving_time=0
+  fi
+  echo "
+  </table>
+  <h4><font color='red'>Average proving time: ${average_proving_time} </font></h4>
+  </body>
+  </html>
+  " >> "${report}"
+done
+
+if [ "$word2vec" == "word2vec" ]; then
+  processid=$(ps ax|grep "word2vec-api.py"|grep -v grep|awk '{print $1}')
+  kill $processid
+fi
+
+python scripts/randomforest_all_msr.py
+
@@ -0,0 +1,37 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+import sys, os
+
+def main():
+    f = open("train/STS.input.MSRvid.txt", "r")
+    train_inputs = f.readlines()
+    f.close()
+    g = open("train/STS.gs.MSRvid.txt", "r")
+    train_answers = g.readlines()
+    g.close()
+    h = open("test-gold/STS.input.MSRvid.txt", "r")
+    test_inputs = h.readlines()
+    h.close()
+    i = open("test-gold/STS.gs.MSRvid.txt", "r")
+    test_answers = i.readlines()
+    i.close()
+    for num in range(1, 750):
+        j = open("plain/sick_train_"+str(num)+".txt", "w")
+        train_sentences = train_inputs[num].split("\t")
+        j.write(train_sentences[0]+"\n"+train_sentences[1])
+        j.close()
+        k = open("plain2/sick_train_"+str(num)+".answer", "w")
+        k.write(train_answers[num])
+        k.close()
+        l = open("plain/sick_test_"+str(num)+".txt", "w")
+        test_sentences = test_inputs[num].split("\t")
+        l.write(test_sentences[0]+"\n"+test_sentences[1])
+        l.close()
+        m = open("plain2/sick_test_"+str(num)+".answer", "w")
+        m.write(test_answers[num])
+        m.close()
+
+
+if __name__ == '__main__':
+    main()