
Commit 53bffb8

amityaccobi authored and peteriz committed

Amit/v0.1 legal fixes (#208)
* server - fixed legal issues
* Fix validate_parent_exists()
* np_semantic_segmentation - fixes:
  1. Legal issue
  2. Prompt added to nltk.download datasets
* WSD - fixes:
  1. Fixed legal requirements
  2. Removed 'requirements.txt'
  3. Added prompt for nltk.download()
* NP_Semantic_seg - small fix in nltk downloaded corpora check
* Fix server test failures:
  1. Fixed wrong header (format -> Response-Format)
  2. Added .gz files
  3. Added .gz files ignore rule to the .gitignore file (specifically for the tests/fixtures/data/server/ dir)
1 parent 585b441 commit 53bffb8

File tree

15 files changed (+72, -29 lines changed)


.gitignore
Lines changed: 1 addition & 0 deletions

@@ -20,6 +20,7 @@ generated
 *.h5
 *.html
 !server/web_service/visualizer/displacy/*.html
+!tests/fixtures/data/server/*.gz
 *.log
 .idea/
 dist

doc/source/np_segmentation.rst
Lines changed: 2 additions & 4 deletions

@@ -69,8 +69,7 @@ Dataset <https://vered1986.github.io/papers/Tratz2011_Dataset.tar.gz>`__.
 Is also available in
 `here <https://www.isi.edu/publications/licensed-sw/fanseparser/index.html>`__.
 (The terms and conditions of the data set license apply. Intel does not
-grant any rights to the data files or database. see relevant `license
-agreement <http://www.apache.org/licenses/LICENSE-2.0>`__)
+grant any rights to the data files or database.
 
 After downloading and unzipping the dataset, run
 ``preprocess_tratz2011.py`` in order to construct the labeled data and
@@ -97,8 +96,7 @@ command ``python data.py``
 - Pre-trained Google News Word2vec model can download
   `here <https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing>`__
 - The terms and conditions of the data set license apply. Intel does
-  not grant any rights to the data files or database. see relevant
-  `license agreement <http://www.apache.org/licenses/LICENSE-2.0>`__
+  not grant any rights to the data files or database.
 
 - Cosine distance between 2 words in the Noun-Phrase.
 - NLTKCollocations score (PMI score (from Manning and Schutze 5.4) and Chi-square score (Manning and Schutze 5.3.3)).

doc/source/word_sense.rst
Lines changed: 1 addition & 0 deletions

@@ -83,6 +83,7 @@ Dataset Preparation
 
 The script prepare_data.py uses the gold standard csv file as described in the requirements section above
 using pretrained Google News Word2vec model. Pretrained Google News Word2vec model can be download here_.
+The terms and conditions of the data set license apply. Intel does not grant any rights to the data files.
 
 .. code:: python
 

examples/most_common_word_sense/feature_extraction.py
Lines changed: 17 additions & 2 deletions

@@ -20,8 +20,23 @@
 from numpy import dot
 from numpy.linalg import norm
 
-nltk.download('averaged_perceptron_tagger')
-nltk.download('punkt')
+from nlp_architect.utils.generic import license_prompt
+
+try:
+    nltk.data.find('taggers/averaged_perceptron_tagger')
+except LookupError:
+    if license_prompt('Averaged Perceptron Tagger', 'http://www.nltk.org/nltk_data/') is False:
+        raise Exception("can't continue data prepare process "
+                        "without downloading averaged_perceptron_tagger")
+    nltk.download('averaged_perceptron_tagger')
+
+try:
+    nltk.data.find('tokenizers/punkt')
+except LookupError:
+    if license_prompt('Punkt model', 'http://www.nltk.org/nltk_data/') is False:
+        raise Exception("can't continue data prepare process "
+                        "without downloading punkt")
+    nltk.download('punkt')
 
 # -------------------------------------------------------------------------------------#
 
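license_prompt() itself comes from nlp_architect.utils.generic and is not shown in this commit. A minimal sketch of its assumed behavior, inferred from the call sites above (print the third-party license notice and return True only on explicit user consent; the exact wording and implementation are assumptions):

    def license_prompt(model_name, model_website):
        # Sketch only: the real implementation lives in nlp_architect.utils.generic.
        # Assumed behavior: show where the data comes from, point at the license,
        # and return True only if the user explicitly agrees to download it.
        print("'{}' will be downloaded from {}".format(model_name, model_website))
        print('The terms and conditions of the data set license apply. '
              'Intel does not grant any rights to the data files.')
        answer = input('Do you agree to the above terms and conditions? [y/N] ')
        return answer.strip().lower() == 'y'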

examples/most_common_word_sense/prepare_data.py
Lines changed: 3 additions & 2 deletions

@@ -18,7 +18,6 @@
 """
 
 import argparse
-import codecs
 import csv
 import logging
 import math
@@ -27,9 +26,11 @@
 import gensim
 import numpy as np
 from feature_extraction import extract_features_envelope
-from nlp_architect.utils.io import validate_existing_directory, validate_existing_filepath
 from sklearn.model_selection import train_test_split
 
+from nlp_architect.utils.io import validate_existing_filepath, \
+    check_size, validate_parent_exists
+
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
 
examples/most_common_word_sense/requirements.txt

Lines changed: 0 additions & 11 deletions
This file was deleted.

examples/np_semantic_segmentation/README.md
Lines changed: 3 additions & 4 deletions

@@ -19,24 +19,23 @@ The expected dataset is a CSV file with 2 columns. the first column contains the
 
 If you wish to use an existing dataset for training the model, you can download Tratz 2011 et al. dataset [1,2] from the following link:
 [Tratz 2011 Dataset](https://vered1986.github.io/papers/Tratz2011_Dataset.tar.gz). Is also available in [here](https://www.isi.edu/publications/licensed-sw/fanseparser/index.html).
-(The terms and conditions of the data set license apply. Intel does not grant any rights to the data files or database. see relevant [license agreement](http://www.apache.org/licenses/LICENSE-2.0))
+(The terms and conditions of the data set license apply. Intel does not grant any rights to the data files or database.
 
 
 After downloading and unzipping the dataset, run `preprocess_tratz2011.py` in order to construct the labeled data and save it in a CSV file (as expected for the model).
 the scripts read 2 .tsv files ('tratz2011_coarse_grained_random/train.tsv' and 'tratz2011_coarse_grained_random/val.tsv') and outputs 2 .csv files accordingly.
 
 Parameters can be obtained by running:
 
-    python preprocess_tratz2011.py -h
-    --data path_to_Tratz_2011_dataset_folder
+    python preprocess_tratz2011.py --data path_to_Tratz_2011_dataset_folder
 
 
 ### Pre-processing the data:
 A feature vector is extracted from each Noun-Phrase string using the command `python data.py`
 
 * Word2Vec word embedding (300 size vector for each word in the Noun-Phrase) .
 * Pre-trained Google News Word2vec model can download [here](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing)
-* The terms and conditions of the data set license apply. Intel does not grant any rights to the data files or database. see relevant [license agreement](http://www.apache.org/licenses/LICENSE-2.0)
+* The terms and conditions of the data set license apply. Intel does not grant any rights to the data files or database.
 * Cosine distance between 2 words in the Noun-Phrase.
 * NLTKCollocations score (NPMI and UCI scores).
 * A binary features whether the Noun-Phrase has existing entity in Wikidata.
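As a small illustration of the cosine-distance feature listed above (a sketch only, using plain numpy; the project's actual feature code is in data.py and feature_extraction.py):

    import numpy as np

    def cosine_distance(vec_a, vec_b):
        # Cosine distance = 1 - cosine similarity; lower means more similar
        similarity = np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
        return 1.0 - similarity

    # Illustrative 300-dimensional embeddings for the two words of a noun phrase
    word1_vec = np.random.rand(300)
    word2_vec = np.random.rand(300)
    print(cosine_distance(word1_vec, word2_vec))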

examples/np_semantic_segmentation/feature_extraction.py
Lines changed: 16 additions & 2 deletions

@@ -24,6 +24,8 @@
 from nltk.corpus import wordnet as wn
 from nltk.stem.snowball import SnowballStemmer
 
+from nlp_architect.utils.generic import license_prompt
+
 stemmer = SnowballStemmer("english")
 headers = {"Accept": "application/json"}
 
@@ -33,7 +35,13 @@ class NLTKCollocations:
     NLTKCollocations score using NLTK framework on Brown dataset
     """
     def __init__(self):
-        nltk.download('brown')
+        try:
+            nltk.data.find('corpora/brown')
+        except LookupError:
+            if license_prompt('brown data set', 'http://www.nltk.org/nltk_data/') is False:
+                raise Exception("can't continue data prepare process "
+                                "without downloading brown dataset")
+            nltk.download('brown')
         self.bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(
             nltk.corpus.brown.words())
         self.bigram_messure = nltk.collocations.BigramAssocMeasures()
@@ -235,7 +243,13 @@ class Wordnet:
     """
 
     def __init__(self):
-        nltk.download('wordnet')
+        try:
+            nltk.data.find('corpora/wordnet')
+        except LookupError:
+            if license_prompt('WordNet data set', 'http://www.nltk.org/nltk_data/') is False:
+                raise Exception("can't continue data prepare process "
+                                "without downloading WordNet dataset")
+            nltk.download('wordnet')
         self.wordnet = wn
 
     def find_wordnet_existence(self, candidates):
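For context, a minimal sketch of how the Brown-corpus bigram finder built in __init__ can score a candidate bigram with NLTK (PMI shown; the candidate words are illustrative, and the class's actual scoring logic lives elsewhere in this file):

    import nltk
    from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

    # Assumes the brown corpus is already available (see the guarded download above)
    finder = BigramCollocationFinder.from_words(nltk.corpus.brown.words())
    measures = BigramAssocMeasures()

    # PMI score for one candidate bigram taken from a noun phrase
    print(finder.score_ngram(measures.pmi, 'united', 'states'))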

nlp_architect/utils/io.py
Lines changed: 4 additions & 1 deletion

@@ -135,7 +135,10 @@ def validate_existing_directory(arg):
 
 def validate_parent_exists(arg):
     """Validates an input argument is a path string, and its parent directory exists."""
-    return validate_existing_directory(os.path.dirname(arg))
+    arg = path.abspath(arg)
+    dir_arg = os.path.dirname(os.path.abspath(arg))
+    if not validate_existing_directory(dir_arg) is None:
+        return arg
 
 
 def sanitize_path(path):
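A quick illustration of the fixed behavior: validate_parent_exists() now returns the validated absolute path itself, rather than the result of validating its parent directory, which makes it usable as an argparse type= callable for output-path arguments. The example path is illustrative:

    from nlp_architect.utils.io import validate_parent_exists

    # If /tmp exists, the absolute path is returned unchanged;
    # otherwise validate_existing_directory() rejects the parent directory.
    output_path = validate_parent_exists('/tmp/model_output.csv')
    print(output_path)  # /tmp/model_output.csv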

server/web_service/visualizer/displacy/displacy-ent.js
Lines changed: 1 addition & 0 deletions

@@ -1,6 +1,7 @@
 //- ----------------------------------
 //- 💥 DISPLACY ENT
 //- ----------------------------------
+/* this file is taken from: "https://github.com/explosion/displacy-ent" */
 
 'use strict';
 
