From c45977df77a62de0866997a5f975f05e44fb645d Mon Sep 17 00:00:00 2001
From: YuhengHuang
Date: Sat, 16 May 2020 11:57:03 +0800
Subject: [PATCH 1/5] Support loading fasttext model from custom file

---
 torchnlp/word_to_vector/fast_text.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/torchnlp/word_to_vector/fast_text.py b/torchnlp/word_to_vector/fast_text.py
index dd266e9..0ac8706 100644
--- a/torchnlp/word_to_vector/fast_text.py
+++ b/torchnlp/word_to_vector/fast_text.py
@@ -46,12 +46,16 @@ class FastText(_PretrainedWordVectors):
         * https://arxiv.org/abs/1710.04087
 
     Args:
-        language (str): language of the vectors
+        name (str or None, optional): The name of the file that contains the vectors
+        url (str or None, optional): url for download if vectors not found in cache
+        language (str): language of the vectors (only needed when both url and name
+            are ignored)
         aligned (bool): if True: use multilingual embeddings where words with
             the same meaning share (approximately) the same position in the
             vector space across languages. if False: use regular FastText
             embeddings. All available languages can be found under
-            https://github.com/facebookresearch/MUSE#multilingual-word-embeddings
+            https://github.com/facebookresearch/MUSE#multilingual-word-embeddings.
+            (only needed when both url and name are ignored)
         cache (str, optional): directory for cached vectors
         unk_init (callback, optional): by default, initialize out-of-vocabulary word vectors
             to zero vectors; can be any function that takes in a Tensor and
@@ -74,10 +78,12 @@ class FastText(_PretrainedWordVectors):
     url_base = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.{}.vec'
     aligned_url_base = 'https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.{}.align.vec'
 
-    def __init__(self, language="en", aligned=False, **kwargs):
-        if aligned:
-            url = self.aligned_url_base.format(language)
-        else:
-            url = self.url_base.format(language)
-        name = os.path.basename(url)
+    def __init__(self, language="en", url=None, name=None, aligned=False, **kwargs):
+        if not name:
+            if not url:
+                if aligned:
+                    url = self.aligned_url_base.format(language)
+                else:
+                    url = self.url_base.format(language)
+            name = os.path.basename(url)
         super(FastText, self).__init__(name, url=url, **kwargs)
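A sketch of how the new keyword arguments combine (the file name, cache
directory and URL below are hypothetical, chosen only for illustration;
the default path is unchanged):

    from torchnlp.word_to_vector import FastText

    # Default: the URL is derived from `language` and the vectors are
    # downloaded as before.
    vectors = FastText(language='en')

    # Load a vector file that already sits in the cache directory; no URL
    # is given, so nothing is downloaded.
    vectors = FastText(name='my_vectors.vec', cache='.my_cache')

    # Download from a custom URL; the file name falls back to
    # os.path.basename(url), here 'custom.vec'.
    vectors = FastText(url='https://example.com/embeddings/custom.vec')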
From 878722c75cdf7fb43df7c371f5ebccaef504df79 Mon Sep 17 00:00:00 2001
From: YuhengHuang
Date: Sun, 17 May 2020 00:02:57 +0800
Subject: [PATCH 2/5] fix problem in .flake8

---
 .flake8 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.flake8 b/.flake8
index 85153c3..8a07a89 100755
--- a/.flake8
+++ b/.flake8
@@ -1,4 +1,4 @@
 [flake8]
-ignore = E402, E722, E731, W504
+ignore = E402, E722, E731, E741, W504
 max-line-length = 100
 exclude = examples/
\ No newline at end of file

From a09b9c31eef7ee47e0e726d4f59ba6a78c5dbd06 Mon Sep 17 00:00:00 2001
From: YuhengHuang
Date: Sun, 17 May 2020 17:30:33 +0800
Subject: [PATCH 3/5] add rouge-N

---
 tests/metrics/test_rouge.py  |  25 +++++++++
 torchnlp/metrics/__init__.py |   2 +-
 torchnlp/metrics/rouge.py    | 104 +++++++++++++++++++++++++++++++++++
 3 files changed, 130 insertions(+), 1 deletion(-)
 create mode 100644 tests/metrics/test_rouge.py
 create mode 100644 torchnlp/metrics/rouge.py

diff --git a/tests/metrics/test_rouge.py b/tests/metrics/test_rouge.py
new file mode 100644
index 0000000..af6832f
--- /dev/null
+++ b/tests/metrics/test_rouge.py
@@ -0,0 +1,25 @@
+import numpy as np
+
+from torchnlp.metrics import get_rouge_n
+
+
+def test_rouge_1_exclusive():
+    hypotheses = ["the cat was found under the bed"]
+    references = ["the cat was under the bed"]
+    result = get_rouge_n(hypotheses, references, n=1, exclusive=True)
+    precision = result['p']
+    np.testing.assert_almost_equal(precision, 0.833, decimal=3)
+
+def test_rouge_1_inclusive():
+    hypotheses = ["the cat was found under the bed"]
+    references = ["the cat was under the bed"]
+    result = get_rouge_n(hypotheses, references, n=1, exclusive=False)
+    precision = result['p']
+    np.testing.assert_almost_equal(precision, 0.857, decimal=3)
+
+def test_rouge_2_exclusive():
+    hypotheses = ["police killed the gunman"]
+    references = ["police kill the gunman"]
+    result = get_rouge_n(hypotheses, references, exclusive=True)
+    precision = result['p']
+    np.testing.assert_almost_equal(precision, 0.333, decimal=3)
\ No newline at end of file
diff --git a/torchnlp/metrics/__init__.py b/torchnlp/metrics/__init__.py
index 87ccd0f..f2dc80a 100755
--- a/torchnlp/metrics/__init__.py
+++ b/torchnlp/metrics/__init__.py
@@ -9,4 +9,4 @@
 # TODO: Implement perplexity
 # TODO: Implement rogue metric
 
-__all__ = ['get_accuracy', 'get_token_accuracy', 'get_moses_multi_bleu']
+__all__ = ['get_accuracy', 'get_token_accuracy', 'get_moses_multi_bleu', 'get_rouge_n']
diff --git a/torchnlp/metrics/rouge.py b/torchnlp/metrics/rouge.py
new file mode 100644
index 0000000..a80b0d0
--- /dev/null
+++ b/torchnlp/metrics/rouge.py
@@ -0,0 +1,104 @@
+"""ROUGE-N metric."""
+
+class Ngrams(object):
+    """
+    Data structure for n-grams.
+    If `exclusive` is True the n-grams are stored in a set,
+    otherwise they are stored in a list.
+    """
+    def __init__(self, ngrams=(), exclusive=True):
+        self.exclusive = exclusive
+        if exclusive:
+            self._grams = set(ngrams)
+        else:
+            self._grams = list(ngrams)
+    def __len__(self):
+        return len(self._grams)
+    def add(self, elem):
+        if self.exclusive:
+            self._grams.add(elem)
+        else:
+            self._grams.append(elem)
+    def intersection(self, other_gram):
+        if self.exclusive:
+            inter_set = self._grams.intersection(other_gram._grams)
+            return Ngrams(inter_set, exclusive=True)
+        else:
+            other_dict = dict()
+            inter_list = list()
+            for gram in other_gram._grams:
+                other_dict[gram] = other_dict.get(gram, 0) + 1
+            for gram in self._grams:
+                if gram in other_dict and other_dict[gram] > 0:
+                    other_dict[gram] -= 1
+                    inter_list.append(gram)
+            return Ngrams(inter_list, exclusive=False)
+
+
+def _get_ngrams(n, text, exclusive):
+    """
+    Calculate the n-grams of a text.
+    Args:
+        n: size of the n-grams to calculate
+        text: an array of tokens, or a string split on whitespace
+        exclusive: if True, duplicates are dropped (set); if False, kept (list)
+    Returns:
+        An Ngrams instance holding the n-grams
+    """
+    ngram_set = Ngrams(exclusive=exclusive)
+    if isinstance(text, str):
+        text = text.split()
+    text_length = len(text)
+    index_ngram_end = text_length - n
+    for i in range(index_ngram_end + 1):
+        ngram_set.add(tuple(text[i:i + n]))
+    return ngram_set
+
+def _frp_rouge_n(eval_count, ref_count, overlapping_count):
+    """
+    Compute the F1, precision and recall scores.
+    Args:
+        eval_count: the evaluation sentence n-gram count
+        ref_count: the reference sentence n-gram count
+        overlapping_count: the n-gram overlap count between evaluation and reference
+    """
+    if eval_count == 0:
+        precision = 0.0
+    else:
+        precision = overlapping_count / eval_count
+
+    if ref_count == 0:
+        recall = 0.0
+    else:
+        recall = overlapping_count / ref_count
+
+    f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8))
+
+    return {"f": f1_score, "p": precision, "r": recall}
+
+
+def get_rouge_n(evaluated_sentences, reference_sentences, n=2, exclusive=True):
+    """
+    Computes ROUGE-N of two text collections of sentences, namely
+    evaluated_sentences and reference_sentences.
+    Args:
+        evaluated_sentences: the sentences produced by the summarizer
+        reference_sentences: the sentences from the reference set
+        n: size of the n-grams, defaults to 2
+        exclusive: if True, deduplicate n-grams (set semantics)
+    Returns:
+        A dict {'f': f1, 'p': precision, 'r': recall} for ROUGE-N
+    """
+    if len(evaluated_sentences) <= 0:
+        raise ValueError("Hypothesis set is empty.")
+    if len(reference_sentences) <= 0:
+        raise ValueError("Reference set is empty.")
+    eval_count, ref_count, overlapping_count = 0, 0, 0
+    for eval_sentence, ref_sentence in zip(evaluated_sentences, reference_sentences):
+        eval_ngrams = _get_ngrams(n, eval_sentence, exclusive)
+        ref_ngrams = _get_ngrams(n, ref_sentence, exclusive)
+        eval_count += len(eval_ngrams)
+        ref_count += len(ref_ngrams)
+        overlapping_ngrams = eval_ngrams.intersection(ref_ngrams)
+        overlapping_count += len(overlapping_ngrams)
+    return _frp_rouge_n(eval_count, ref_count, overlapping_count)
\ No newline at end of file
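The `exclusive` flag is what separates the two ROUGE-1 tests above: with set
semantics the repeated "the" in the hypothesis counts once, with list
(multiset) semantics it counts twice. A sketch of the arithmetic behind the
expected values, worked out by hand from the definitions in rouge.py:

    from torchnlp.metrics import get_rouge_n

    hypotheses = ["the cat was found under the bed"]  # 7 tokens, 6 unique
    references = ["the cat was under the bed"]        # 6 tokens, 5 unique

    # exclusive=True: 5 unique unigrams overlap out of 6 unique hypothesis
    # unigrams, so precision = 5 / 6 = 0.833...
    get_rouge_n(hypotheses, references, n=1, exclusive=True)['p']

    # exclusive=False: "the" overlaps twice, so 6 of the 7 hypothesis tokens
    # overlap and precision = 6 / 7 = 0.857...
    get_rouge_n(hypotheses, references, n=1, exclusive=False)['p']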
+ """ + if eval_count == 0: + precision = 0.0 + else: + precision = overlapping_count / eval_count + + if ref_count == 0: + recall = 0.0 + else: + recall = overlapping_count / ref_count + + f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8)) + + return {"f": f1_score, "p": precision, "r": recall} + + +def get_rouge_n(evaluated_sentences, reference_sentences, n=2, exclusive=True): + """ + Computes ROUGE-N of two text collections of sentences, namely evaluated_sentences and + reference senteces + Args: + evaluated_sentences: The sentences that have been produced by the + summarizer + reference_sentences: The sentences from the referene set + n: Size of ngram. Defaults to 2. + Returns: + tuple. (f1, precision, recall) for ROUGE-N + """ + if len(evaluated_sentences) <= 0: + raise ValueError("Hypothesis set is empty.") + if len(reference_sentences) <= 0: + raise ValueError("reference set is empty") + for eval_sentence, ref_sentence in zip(evaluated_sentences, reference_sentences): + eval_ngrams = _get_ngrams(n, eval_sentence, exclusive) + ref_grams = _get_ngrams(n, ref_sentence, exclusive) + ref_count = len(ref_grams) + eval_count = len(eval_ngrams) + overlapping_ngrams = eval_ngrams.intersection(ref_grams) + overlapping_count = len(overlapping_ngrams) + return _frp_rouge_n(eval_count, ref_count, overlapping_count) + \ No newline at end of file From 20b25c3e9628e79e2755e688d3971893d3481a69 Mon Sep 17 00:00:00 2001 From: YuhengHuang Date: Sun, 17 May 2020 21:21:51 +0800 Subject: [PATCH 4/5] modify setting in metrics/__init__.py --- torchnlp/metrics/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchnlp/metrics/__init__.py b/torchnlp/metrics/__init__.py index f2dc80a..03ba4d4 100755 --- a/torchnlp/metrics/__init__.py +++ b/torchnlp/metrics/__init__.py @@ -1,7 +1,7 @@ from torchnlp.metrics.accuracy import get_accuracy from torchnlp.metrics.accuracy import get_token_accuracy from torchnlp.metrics.bleu import get_moses_multi_bleu - +from torchnlp.metrics.rouge import get_rouge_n # TODO: Use `sklearn.metrics` for a `confusion_matrix` implemented with ignore_index # TODO: Use `sklearn.metrics` for a `recall` implemented with ignore_index # TODO: Use `sklearn.metrics` for a `precision` implemented with ignore_index From dc3b347c2db6aa9f8bac02d722e276f466fe772a Mon Sep 17 00:00:00 2001 From: YuhengHuang Date: Sun, 17 May 2020 21:52:35 +0800 Subject: [PATCH 5/5] fix code-style problem --- tests/metrics/test_rouge.py | 25 ------- torchnlp/metrics/__init__.py | 3 +- torchnlp/metrics/rouge.py | 104 --------------------------- torchnlp/word_to_vector/fast_text.py | 2 +- 4 files changed, 2 insertions(+), 132 deletions(-) delete mode 100644 tests/metrics/test_rouge.py delete mode 100644 torchnlp/metrics/rouge.py diff --git a/tests/metrics/test_rouge.py b/tests/metrics/test_rouge.py deleted file mode 100644 index af6832f..0000000 --- a/tests/metrics/test_rouge.py +++ /dev/null @@ -1,25 +0,0 @@ -import numpy as np - -from torchnlp.metrics import get_rouge_n - - -def test_rouge_1_exclusive(): - hypotheses = ["the cat was found under the bed"] - references = ["the cat was under the bed"] - result = get_rouge_n(hypotheses, references, n=1, exclusive=True) - precision = result['p'] - np.testing.assert_almost_equal(precision, 0.833, decimal=3) - -def test_rouge_1_inclusive(): - hypotheses = ["the cat was found under the bed"] - references = ["the cat was under the bed"] - result = get_rouge_n(hypotheses, references, n=1, exclusive=False) - precision = result['p'] - 
From dc3b347c2db6aa9f8bac02d722e276f466fe772a Mon Sep 17 00:00:00 2001
From: YuhengHuang
Date: Sun, 17 May 2020 21:52:35 +0800
Subject: [PATCH 5/5] fix code-style problem

---
 tests/metrics/test_rouge.py          |  25 ------
 torchnlp/metrics/__init__.py         |   3 +--
 torchnlp/metrics/rouge.py            | 104 ----------------------------
 torchnlp/word_to_vector/fast_text.py |   2 +-
 4 files changed, 2 insertions(+), 132 deletions(-)
 delete mode 100644 tests/metrics/test_rouge.py
 delete mode 100644 torchnlp/metrics/rouge.py

diff --git a/tests/metrics/test_rouge.py b/tests/metrics/test_rouge.py
deleted file mode 100644
index af6832f..0000000
--- a/tests/metrics/test_rouge.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import numpy as np
-
-from torchnlp.metrics import get_rouge_n
-
-
-def test_rouge_1_exclusive():
-    hypotheses = ["the cat was found under the bed"]
-    references = ["the cat was under the bed"]
-    result = get_rouge_n(hypotheses, references, n=1, exclusive=True)
-    precision = result['p']
-    np.testing.assert_almost_equal(precision, 0.833, decimal=3)
-
-def test_rouge_1_inclusive():
-    hypotheses = ["the cat was found under the bed"]
-    references = ["the cat was under the bed"]
-    result = get_rouge_n(hypotheses, references, n=1, exclusive=False)
-    precision = result['p']
-    np.testing.assert_almost_equal(precision, 0.857, decimal=3)
-
-def test_rouge_2_exclusive():
-    hypotheses = ["police killed the gunman"]
-    references = ["police kill the gunman"]
-    result = get_rouge_n(hypotheses, references, exclusive=True)
-    precision = result['p']
-    np.testing.assert_almost_equal(precision, 0.333, decimal=3)
\ No newline at end of file
diff --git a/torchnlp/metrics/__init__.py b/torchnlp/metrics/__init__.py
index 03ba4d4..5db0192 100755
--- a/torchnlp/metrics/__init__.py
+++ b/torchnlp/metrics/__init__.py
@@ -1,7 +1,6 @@
 from torchnlp.metrics.accuracy import get_accuracy
 from torchnlp.metrics.accuracy import get_token_accuracy
 from torchnlp.metrics.bleu import get_moses_multi_bleu
-from torchnlp.metrics.rouge import get_rouge_n
 # TODO: Use `sklearn.metrics` for a `confusion_matrix` implemented with ignore_index
 # TODO: Use `sklearn.metrics` for a `recall` implemented with ignore_index
 # TODO: Use `sklearn.metrics` for a `precision` implemented with ignore_index
@@ -9,4 +8,4 @@
 # TODO: Implement perplexity
 # TODO: Implement rogue metric
 
-__all__ = ['get_accuracy', 'get_token_accuracy', 'get_moses_multi_bleu', 'get_rouge_n']
+__all__ = ['get_accuracy', 'get_token_accuracy', 'get_moses_multi_bleu']
diff --git a/torchnlp/metrics/rouge.py b/torchnlp/metrics/rouge.py
deleted file mode 100644
index a80b0d0..0000000
--- a/torchnlp/metrics/rouge.py
+++ /dev/null
@@ -1,104 +0,0 @@
-"""ROUGE-N metric."""
-
-class Ngrams(object):
-    """
-    Data structure for n-grams.
-    If `exclusive` is True the n-grams are stored in a set,
-    otherwise they are stored in a list.
-    """
-    def __init__(self, ngrams=(), exclusive=True):
-        self.exclusive = exclusive
-        if exclusive:
-            self._grams = set(ngrams)
-        else:
-            self._grams = list(ngrams)
-    def __len__(self):
-        return len(self._grams)
-    def add(self, elem):
-        if self.exclusive:
-            self._grams.add(elem)
-        else:
-            self._grams.append(elem)
-    def intersection(self, other_gram):
-        if self.exclusive:
-            inter_set = self._grams.intersection(other_gram._grams)
-            return Ngrams(inter_set, exclusive=True)
-        else:
-            other_dict = dict()
-            inter_list = list()
-            for gram in other_gram._grams:
-                other_dict[gram] = other_dict.get(gram, 0) + 1
-            for gram in self._grams:
-                if gram in other_dict and other_dict[gram] > 0:
-                    other_dict[gram] -= 1
-                    inter_list.append(gram)
-            return Ngrams(inter_list, exclusive=False)
-
-
-def _get_ngrams(n, text, exclusive):
-    """
-    Calculate the n-grams of a text.
-    Args:
-        n: size of the n-grams to calculate
-        text: an array of tokens, or a string split on whitespace
-        exclusive: if True, duplicates are dropped (set); if False, kept (list)
-    Returns:
-        An Ngrams instance holding the n-grams
-    """
-    ngram_set = Ngrams(exclusive=exclusive)
-    if isinstance(text, str):
-        text = text.split()
-    text_length = len(text)
-    index_ngram_end = text_length - n
-    for i in range(index_ngram_end + 1):
-        ngram_set.add(tuple(text[i:i + n]))
-    return ngram_set
-
-def _frp_rouge_n(eval_count, ref_count, overlapping_count):
-    """
-    Compute the F1, precision and recall scores.
-    Args:
-        eval_count: the evaluation sentence n-gram count
-        ref_count: the reference sentence n-gram count
-        overlapping_count: the n-gram overlap count between evaluation and reference
-    """
-    if eval_count == 0:
-        precision = 0.0
-    else:
-        precision = overlapping_count / eval_count
-
-    if ref_count == 0:
-        recall = 0.0
-    else:
-        recall = overlapping_count / ref_count
-
-    f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8))
-
-    return {"f": f1_score, "p": precision, "r": recall}
-
-
-def get_rouge_n(evaluated_sentences, reference_sentences, n=2, exclusive=True):
-    """
-    Computes ROUGE-N of two text collections of sentences, namely
-    evaluated_sentences and reference_sentences.
-    Args:
-        evaluated_sentences: the sentences produced by the summarizer
-        reference_sentences: the sentences from the reference set
-        n: size of the n-grams, defaults to 2
-        exclusive: if True, deduplicate n-grams (set semantics)
-    Returns:
-        A dict {'f': f1, 'p': precision, 'r': recall} for ROUGE-N
-    """
-    if len(evaluated_sentences) <= 0:
-        raise ValueError("Hypothesis set is empty.")
-    if len(reference_sentences) <= 0:
-        raise ValueError("Reference set is empty.")
-    eval_count, ref_count, overlapping_count = 0, 0, 0
-    for eval_sentence, ref_sentence in zip(evaluated_sentences, reference_sentences):
-        eval_ngrams = _get_ngrams(n, eval_sentence, exclusive)
-        ref_ngrams = _get_ngrams(n, ref_sentence, exclusive)
-        eval_count += len(eval_ngrams)
-        ref_count += len(ref_ngrams)
-        overlapping_ngrams = eval_ngrams.intersection(ref_ngrams)
-        overlapping_count += len(overlapping_ngrams)
-    return _frp_rouge_n(eval_count, ref_count, overlapping_count)
\ No newline at end of file
- """ - if eval_count == 0: - precision = 0.0 - else: - precision = overlapping_count / eval_count - - if ref_count == 0: - recall = 0.0 - else: - recall = overlapping_count / ref_count - - f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8)) - - return {"f": f1_score, "p": precision, "r": recall} - - -def get_rouge_n(evaluated_sentences, reference_sentences, n=2, exclusive=True): - """ - Computes ROUGE-N of two text collections of sentences, namely evaluated_sentences and - reference senteces - Args: - evaluated_sentences: The sentences that have been produced by the - summarizer - reference_sentences: The sentences from the referene set - n: Size of ngram. Defaults to 2. - Returns: - tuple. (f1, precision, recall) for ROUGE-N - """ - if len(evaluated_sentences) <= 0: - raise ValueError("Hypothesis set is empty.") - if len(reference_sentences) <= 0: - raise ValueError("reference set is empty") - for eval_sentence, ref_sentence in zip(evaluated_sentences, reference_sentences): - eval_ngrams = _get_ngrams(n, eval_sentence, exclusive) - ref_grams = _get_ngrams(n, ref_sentence, exclusive) - ref_count = len(ref_grams) - eval_count = len(eval_ngrams) - overlapping_ngrams = eval_ngrams.intersection(ref_grams) - overlapping_count = len(overlapping_ngrams) - return _frp_rouge_n(eval_count, ref_count, overlapping_count) - \ No newline at end of file diff --git a/torchnlp/word_to_vector/fast_text.py b/torchnlp/word_to_vector/fast_text.py index 0ac8706..85c67ce 100644 --- a/torchnlp/word_to_vector/fast_text.py +++ b/torchnlp/word_to_vector/fast_text.py @@ -48,7 +48,7 @@ class FastText(_PretrainedWordVectors): Args: name (str or None, optional): The name of the file that contains the vectors url (str or None, optional): url for download if vectors not found in cache - language (str): language of the vectors (only needed when both url and name + language (str): language of the vectors (only needed when both url and name are ignored) aligned (bool): if True: use multilingual embeddings where words with the same meaning share (approximately) the same position in the