Skip to content
This repository was archived by the owner on Nov 8, 2022. It is now read-only.

Commit c1b287e

Browse files
authored
Merge pull request #212 from NervanaSystems/peter/missing_docstrings
Fixed docstrings
2 parents e3741d3 + f6be5dd commit c1b287e

File tree

5 files changed

+37
-16
lines changed

5 files changed

+37
-16
lines changed

doc/source/api.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,6 @@ these will be placed into a central repository.
9292
nlp_architect.data.intent_datasets.IntentDataset
9393
nlp_architect.data.intent_datasets.TabularIntentDataset
9494
nlp_architect.data.intent_datasets.SNIPS
95-
nlp_architect.data.sequential_tagging.NamedEntityDataset
95+
nlp_architect.data.sequential_tagging.SequentialTaggingDataset
9696
nlp_architect.data.babi_dialog.BABI_Dialog
9797
nlp_architect.data.wikimovies.WIKIMOVIES

doc/source/ner_crf.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ In the above format each sentence is separated by an empty line. Each line consi
4949
Data loader
5050
-----------
5151

52-
Loading data into the model can be done using the ``NamedEntityDataset`` data loader from ``nlp_architect.data.sequential_tagging`` package, which can be used with the prepared train and test data sets described above.
52+
Loading data into the model can be done using the ``SequentialTaggingDataset`` data loader from ``nlp_architect.data.sequential_tagging`` package, which can be used with the prepared train and test data sets described above.
5353

5454
The data loader returns 2 numpy matrices:
5555
1. sparse word representation of the sentence words

examples/ner/train.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323

2424
from keras.utils import to_categorical
2525
from nlp_architect.contrib.keras.callbacks import ConllCallback
26-
from nlp_architect.data.sequential_tagging import NamedEntityDataset
26+
from nlp_architect.data.sequential_tagging import SequentialTaggingDataset
2727
from nlp_architect.models.ner_crf import NERCRF
2828
from nlp_architect.utils.io import validate_existing_filepath, validate_parent_exists, validate
2929
from nlp_architect.utils.metrics import get_conll_scores
@@ -88,10 +88,10 @@ def validate_input_args(args):
8888
args = read_input_args()
8989

9090
# load dataset and parameters
91-
dataset = NamedEntityDataset(args.train_file, args.test_file,
92-
max_sentence_length=args.sentence_length,
93-
max_word_length=args.word_length,
94-
tag_field_no=args.tag_num)
91+
dataset = SequentialTaggingDataset(args.train_file, args.test_file,
92+
max_sentence_length=args.sentence_length,
93+
max_word_length=args.word_length,
94+
tag_field_no=args.tag_num)
9595

9696
# get the train and test data sets
9797
x_train, x_char_train, y_train = dataset.train

nlp_architect/contrib/neon/layers.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,9 @@ def bprop(self, error, alpha=1.0, beta=0.0):
234234

235235

236236
class TimeDistBiLSTM(BiLSTM):
237+
"""
238+
A Bi-directional LSTM layer that supports time step output of the LSTM layer.
239+
"""
237240
def __init__(self, output_size, init, init_inner=None, activation=None,
238241
gate_activation=None, reset_cells=False, reset_freq=0,
239242
split_inputs=False, name=None):

nlp_architect/data/sequential_tagging.py

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,18 @@
2525
from nlp_architect.utils.text import Vocabulary
2626

2727

28-
class NamedEntityDataset(object):
28+
class SequentialTaggingDataset(object):
29+
"""
30+
Sequential tagging dataset loader.
31+
Loads train/test files with tabular separation.
32+
33+
Args:
34+
train_file (str): path to train file
35+
test_file (str): path to test file
36+
max_sentence_length (int, optional): max sentence length
37+
max_word_length (int, optional): max word length
38+
tag_field_no (int, optional): index of column to use as y-samples
39+
"""
2940
def __init__(self,
3041
train_file,
3142
test_file,
@@ -39,12 +50,12 @@ def __init__(self,
3950
self.tf = tag_field_no
4051

4152
self.vocabs = {'token': Vocabulary(2), # 0=pad, 1=unk
42-
'char': Vocabulary(2), # 0=pad, 1=unk
43-
'tag': Vocabulary(1)} # 0=pad
53+
'char': Vocabulary(2), # 0=pad, 1=unk
54+
'tag': Vocabulary(1)} # 0=pad
4455

4556
self.data = {}
4657
for f in self.files:
47-
raw_sentences = self.read_file(self.files[f])
58+
raw_sentences = self._read_file(self.files[f])
4859
word_vecs = []
4960
char_vecs = []
5061
tag_vecs = []
@@ -69,42 +80,49 @@ def __init__(self,
6980

7081
@property
7182
def y_labels(self):
83+
"""return y labels"""
7284
return self.vocabs['tag'].vocab
7385

7486
@property
7587
def word_vocab(self):
88+
"""words vocabulary"""
7689
return self.vocabs['token'].vocab
7790

7891
@property
7992
def char_vocab(self):
93+
"""characters vocabulary"""
8094
return self.vocabs['char'].vocab
8195

8296
@property
8397
def word_vocab_size(self):
98+
"""word vocabulary size"""
8499
return len(self.vocabs['token']) + 2
85100

86101
@property
87102
def char_vocab_size(self):
103+
"""character vocabulary size"""
88104
return len(self.vocabs['char']) + 2
89105

90106
@property
91107
def train(self):
108+
"""Get the train set"""
92109
return self.data['train']
93110

94111
@property
95112
def test(self):
113+
"""Get the test set"""
96114
return self.data['test']
97115

98-
def read_file(self, path):
116+
def _read_file(self, path):
99117
with open(path, encoding='utf-8') as fp:
100118
data = fp.readlines()
101119
data = [d.strip() for d in data]
102120
data = [d for d in data if 'DOCSTART' not in d]
103-
sentences = self.split_into_sentences(data)
104-
parsed_sentences = [self.parse_sentence(s) for s in sentences if len(s) > 0]
121+
sentences = self._split_into_sentences(data)
122+
parsed_sentences = [self._parse_sentence(s) for s in sentences if len(s) > 0]
105123
return parsed_sentences
106124

107-
def parse_sentence(self, sentence):
125+
def _parse_sentence(self, sentence):
108126
tokens = []
109127
tags = []
110128
for line in sentence:
@@ -118,7 +136,7 @@ def parse_sentence(self, sentence):
118136
return tokens, tags
119137

120138
@staticmethod
121-
def split_into_sentences(file_lines):
139+
def _split_into_sentences(file_lines):
122140
sents = []
123141
s = []
124142
for line in file_lines:

0 commit comments

Comments
 (0)