Commit d3b5ad8

Implement pickle support for Vocab objects (#303)
1 parent 1f611ae commit d3b5ad8

5 files changed: +58 / -10 lines

bindings/python/README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -264,7 +264,7 @@ vocab.__getitem__(token: str) -> int  # Implements: vocab["hello"]
 # If a tokenizer is not set, the text is split on spaces.
 vocab.add_from_text(text: str, tokenizer: Optional[pyonmttok.Tokenizer] = None) -> None
 vocab.add_from_file(path: str, tokenizer: Optional[pyonmttok.Tokenizer] = None) -> None
-vocab.add_token(token: str) -> None
+vocab.add_token(token: str, count: int = 1) -> None

 vocab.resize(maximum_size: int = 0, minimum_frequency: int = 1) -> None
```
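As a quick illustration of the documented change, here is a minimal usage sketch of the new `count` keyword (assuming an installed `pyonmttok` and that the no-argument constructor behaves like `Vocab(special_tokens=[])`; the token and counts are made up):

```python
import pyonmttok

vocab = pyonmttok.Vocab()
vocab.add_token("hello")           # as before: increments the counter by 1
vocab.add_token("hello", count=4)  # new: bump the counter by 4 in one call
assert vocab.counters[vocab["hello"]] == 5  # vocab["hello"] looks up the token id
```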

bindings/python/pyonmttok/Python.cc

Lines changed: 26 additions & 1 deletion

```diff
@@ -754,7 +754,7 @@ PYBIND11_MODULE(_ext, m)
     .def_property_readonly("ids_to_tokens", &onmt::Vocab::ids_to_tokens)
     .def_property_readonly("counters", &onmt::Vocab::counters)

-    .def("add_token", &onmt::Vocab::add_token, py::arg("token"))
+    .def("add_token", &onmt::Vocab::add_token, py::arg("token"), py::arg("count")=1)

     .def("add_from_text",
          [](onmt::Vocab& vocab,
@@ -792,5 +792,30 @@ PYBIND11_MODULE(_ext, m)
          [](const onmt::Vocab& vocab, const py::object& dict) {
            return onmt::Vocab(vocab);
          })
+
+    .def(py::pickle(
+        [](const onmt::Vocab& vocab) {
+          return py::make_tuple(
+              /*version=*/1,
+              vocab.ids_to_tokens(),
+              vocab.counters(),
+              vocab.get_default_id());
+        },
+        [](py::tuple t) {
+          if (t.size() != 4 || t[0].cast<unsigned int>() != 1)
+            throw std::runtime_error("Invalid pickle data");
+
+          auto tokens = t[1].cast<std::vector<std::string>>();
+          auto counters = t[2].cast<std::vector<size_t>>();
+          auto default_id = t[3].cast<size_t>();
+
+          onmt::Vocab vocab;
+          vocab.set_default_id(default_id);
+
+          for (size_t i = 0; i < tokens.size(); ++i)
+            vocab.add_token(std::move(tokens[i]), counters[i]);
+
+          return vocab;
+        }));
     ;
 }
```
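For readers less familiar with pybind11, `py::pickle(get, set)` registers the two lambdas as `__getstate__` and `__setstate__`, so the standard `pickle` module handles `Vocab` with no extra glue. The state is the tuple `(version, ids_to_tokens, counters, default_id)`, and the leading version tag lets future releases reject or migrate stale pickles. A sketch of the resulting round trip (values illustrative):

```python
import pickle

import pyonmttok

vocab = pyonmttok.Vocab(special_tokens=["<s>"])
vocab.add_token("hello", count=3)

# pickle.dumps() calls the __getstate__ lambda; pickle.loads() feeds the
# tuple back into the __setstate__ lambda, which rebuilds a fresh Vocab.
restored = pickle.loads(pickle.dumps(vocab))

assert restored.ids_to_tokens == vocab.ids_to_tokens
assert restored.counters == vocab.counters
assert restored.default_id == vocab.default_id
```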

bindings/python/test/test.py

Lines changed: 22 additions & 3 deletions

```diff
@@ -530,6 +530,9 @@ def test_token_pickle():
     assert token == token2


+_MAX_COUNTER = 18446744073709551615
+
+
 def test_vocab():
     special_tokens = ["<blank>", "<s>", "</s>"]
     vocab = pyonmttok.Vocab(special_tokens=special_tokens)
@@ -557,9 +560,9 @@ def test_vocab():
     }

     assert vocab.counters == [
-        18446744073709551615,
-        18446744073709551615,
-        18446744073709551615,
+        _MAX_COUNTER,
+        _MAX_COUNTER,
+        _MAX_COUNTER,
         2,
         1,
     ]
@@ -628,3 +631,19 @@ def test_vocab_default_id(tokens, default_id, expected_default_id):
     vocab.default_id = default_id
     assert vocab.default_id == expected_default_id
     assert vocab.lookup_token("oov") == expected_default_id
+
+
+def test_vocab_pickle():
+    vocab = pyonmttok.build_vocab_from_tokens(
+        ["a", "b", "a", "a", "c", "c"], special_tokens=["z"]
+    )
+    vocab.default_id = 0
+
+    data = pickle.dumps(vocab)
+    vocab_clone = pickle.loads(data)
+
+    assert vocab_clone is not vocab
+    assert len(vocab) == 4
+    assert vocab.ids_to_tokens == ["z", "a", "b", "c"]
+    assert vocab.default_id == 0
+    assert vocab.counters == [_MAX_COUNTER, 3, 1, 2]
```
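A note on the constant the test factors out: 18446744073709551615 is 2**64 - 1, the largest value of the `size_t` used for frequencies (SIZE_MAX on 64-bit platforms). Special tokens are apparently pinned at this cap so that `resize()` never drops them. A one-line check of the arithmetic:

```python
# 18446744073709551615 is the 64-bit unsigned maximum, i.e. 2**64 - 1.
assert 18446744073709551615 == 2**64 - 1 == (1 << 64) - 1
```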

include/onmt/Vocab.h

Lines changed: 1 addition & 1 deletion

```diff
@@ -61,7 +61,7 @@ namespace onmt
       return _frequencies;
     }

-    void add_token(std::string token);
+    void add_token(std::string token, size_t count = 1);
     void add_from_text(const std::string& text, const Tokenizer* tokenizer = nullptr);
     void add_from_stream(std::istream& is, const Tokenizer* tokenizer = nullptr);
     void resize(size_t maximum_size = 0, size_t minimum_frequency = 1);
```
src/Vocab.cc

Lines changed: 8 additions & 4 deletions

```diff
@@ -18,7 +18,7 @@ namespace onmt
       frequency = maximum_frequency;
   }

-  void Vocab::add_token(std::string token)
+  void Vocab::add_token(std::string token, size_t count)
   {
     const size_t id = _ids_to_tokens.size();
     const auto pair = _tokens_to_ids.emplace(std::move(token), id);
@@ -28,11 +28,15 @@ namespace onmt
     if (inserted)
     {
       _ids_to_tokens.emplace_back(entry.first);
-      _frequencies.emplace_back(1);
+      _frequencies.emplace_back(count);
     }
-    else if (_frequencies[entry.second] < maximum_frequency)
+    else if (_frequencies[entry.second] <= maximum_frequency - count)
     {
-      _frequencies[entry.second]++;
+      _frequencies[entry.second] += count;
+    }
+    else
+    {
+      _frequencies[entry.second] = maximum_frequency;
     }
   }
```
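The rewritten condition is the standard overflow-safe form of a saturating add: testing `frequency + count <= maximum_frequency` directly could wrap around in unsigned arithmetic, so the subtraction is moved to the other side of the comparison and the counter clamps at the cap. A small Python model of the same logic (the function name is mine, not from the C++ source):

```python
MAXIMUM_FREQUENCY = 2**64 - 1  # mirrors the size_t cap used in Vocab.cc

def saturating_add(frequency: int, count: int) -> int:
    # Mirrors the new branch in Vocab::add_token: add while the sum cannot
    # exceed the cap, otherwise clamp to the cap instead of wrapping around.
    if frequency <= MAXIMUM_FREQUENCY - count:
        return frequency + count
    return MAXIMUM_FREQUENCY

assert saturating_add(5, 3) == 8
assert saturating_add(MAXIMUM_FREQUENCY - 1, 10) == MAXIMUM_FREQUENCY
```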
