@@ -19,9 +19,7 @@ namespace dlib
     class bpe_tokenizer
     {
         /*!
-            CLASS bpe_tokenizer
-                A Byte Pair Encoding (BPE) tokenizer for text processing.
-
+            WHAT THIS OBJECT REPRESENTS
                 This class implements a Byte Pair Encoding (BPE) tokenizer, which is a subword
                 tokenization algorithm commonly used in natural language processing (NLP). The
                 BPE algorithm iteratively merges the most frequent pairs of bytes or characters
@@ -37,21 +35,17 @@ namespace dlib
                 text into subword tokens, and decoding tokens back into text. The tokenizer can be
                 serialized and deserialized to/from a file, allowing for easy storage and reuse.

-            INITIAL VALUE
-                - The base vocabulary is initialized with single-byte tokens (0-255).
-                - Special tokens are pre-defined and assigned IDs starting from 256.
-                - The maximum token length is set to 8 bytes.
-
-            WHAT THIS OBJECT REPRESENTS
-                This object represents a BPE tokenizer capable of encoding and decoding text
-                using a learned subword vocabulary. It is designed to handle UTF-8 encoded text
-                and supports multi-threaded processing for efficient tokenization.
-
             REFERENCES
                 - Sennrich, R., Haddow, B., & Birch, A. (2016). Neural Machine Translation of
                   Rare Words with Subword Units. In Proceedings of the 54th Annual Meeting of
                   the Association for Computational Linguistics (ACL 2016).
+
+            INITIAL VALUE
+                - The base vocabulary is initialized with single-byte tokens (0-255).
+                - Special tokens are pre-defined and assigned IDs starting from 256.
+                - The maximum token length is set to 8 bytes.
         !*/
+
     public:
         bpe_tokenizer ();
         /*!
@@ -77,7 +71,7 @@ namespace dlib

         std::vector<int> encode (
             const std::string& text
-        );
+        ) const;
         /*!
             ensures
                 - Encodes the input text into a sequence of subword tokens.
@@ -88,32 +82,19 @@ namespace dlib
         std::string decode (
             const std::vector<int>& ids,
             bool display_special_tokens = true
-        );
+        ) const;
         /*!
             ensures
                 - Decodes a sequence of token IDs back into a human-readable string.
                 - If `display_special_tokens` is true, special tokens are included in the output.
                 - Returns the decoded text as a UTF-8 encoded string.
         !*/

-        void serialize (
-            const bpe_tokenizer& tok,
-            std::ostream& out
-        );
-        /*!
-            ensures
-                - Serializes the tokenizer's vocabulary and merge operations to the output stream.
-                - The serialized data can be used to reconstruct the tokenizer later.
-        !*/
-
-        void deserialize (
-            bpe_tokenizer& tok,
-            std::istream& in
-        );
+        std::string decode (int id, bool display_special_tokens = true) const
+        { return decode(std::vector<int>({ id }), display_special_tokens); }
         /*!
             ensures
-                - Deserializes the tokenizer's vocabulary and merge operations from the input stream.
-                - Restores the tokenizer to the state it was in when serialized.
+                - Decodes a single token ID back into text.
         !*/

         int get_special_token_id (
@@ -130,26 +111,25 @@ namespace dlib
             ensures
                 - Returns the total size of the vocabulary, including base tokens and special tokens.
         !*/
-
-    private:
-        // Private implementation details
-        std::map<std::string, int> special_tokens;
-        std::unordered_map<int, std::string> special_token_map;
-        std::map<std::pair<int, int>, int> merges;
-        std::map<int, std::vector<uint8_t>> vocab;
-        int vocab_size;
-
-        static const size_t MAX_TOKEN_LENGTH = 8;
-        static const int BASE_VOCAB_SIZE = 256;
-
-        // Helper functions
-        std::unordered_map<std::pair<int, int>, int, pair_hash> get_stats (const std::vector<int>& ids);
-        std::pair<int, int> get_most_frequent_pair (const std::unordered_map<std::pair<int, int>, int, pair_hash>& stats);
-        std::vector<int> merge (std::vector<int>& ids, const std::pair<int, int>& pair, int idx);
-        std::string bytes_to_string (const std::vector<uint8_t>& bytes);
-        std::vector<uint8_t> string_to_bytes (const std::string& str);
     };

+    void serialize (
+        const bpe_tokenizer& tok,
+        std::ostream& out
+    );
+    /*!
+        ensures
+            - Saves the entire state of tok to out.
+    !*/
+
+    void deserialize (
+        bpe_tokenizer& tok,
+        std::istream& in
+    );
+    /*!
+        ensures
+            - Restores the state of tok from the serialized data read from in.
+    !*/
 }

-#endif // DLIB_BPE_TOKENIZER_ABSTRACT_
+#endif // DLIB_BPE_TOKENIZER_ABSTRACT_
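
The class comment above summarizes the core BPE training step: count adjacent token pairs, merge the most frequent pair into a new vocabulary entry, and repeat until the target vocabulary size is reached. The sketch below illustrates one such merge round. It is a standalone outline written for this note, not the private members removed from the header above, and the function name bpe_merge_once is invented for the example.

    #include <cstddef>
    #include <map>
    #include <utility>
    #include <vector>

    // Illustrative only: one round of the BPE merge loop described in the class
    // comment.  Counts adjacent token pairs, picks the most frequent one, and
    // replaces every occurrence of it with a newly allocated token id.
    std::vector<int> bpe_merge_once (
        const std::vector<int>& ids,
        int next_id,
        std::pair<int,int>& merged_pair
    )
    {
        std::map<std::pair<int,int>, int> counts;
        for (std::size_t i = 0; i + 1 < ids.size(); ++i)
            ++counts[{ids[i], ids[i+1]}];
        if (counts.empty())
            return ids;

        // Pick the most frequent adjacent pair.
        merged_pair = counts.begin()->first;
        int best = counts.begin()->second;
        for (const auto& kv : counts)
        {
            if (kv.second > best)
            {
                best = kv.second;
                merged_pair = kv.first;
            }
        }

        // Rewrite the sequence, substituting the new token id for that pair.
        std::vector<int> out;
        for (std::size_t i = 0; i < ids.size(); ++i)
        {
            if (i + 1 < ids.size() && ids[i] == merged_pair.first && ids[i+1] == merged_pair.second)
            {
                out.push_back(next_id);
                ++i;  // skip the second element of the merged pair
            }
            else
            {
                out.push_back(ids[i]);
            }
        }
        return out;
    }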
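A minimal usage sketch of the public interface documented in this header follows: encode, decode, and the free serialize/deserialize functions. The include path is assumed rather than confirmed by this diff, and a freshly constructed tokenizer would normally be trained on a corpus first (the training member lies outside the hunks shown), so treat this as an outline rather than a verified snippet.

    #include <fstream>
    #include <iostream>
    #include <string>
    #include <vector>
    #include <dlib/tokenizer/bpe_tokenizer.h>  // include path assumed, not confirmed by this diff

    int main()
    {
        dlib::bpe_tokenizer tok;  // would normally be trained on a corpus before use

        // Encode UTF-8 text into subword token ids and decode them back.
        const std::string text = "Byte pair encoding splits rare words into subword units.";
        const std::vector<int> ids = tok.encode(text);
        std::cout << ids.size() << " tokens\n";
        std::cout << tok.decode(ids) << "\n";

        // The free serialize()/deserialize() functions save and restore the
        // tokenizer's full state, e.g. to keep a learned vocabulary on disk.
        {
            std::ofstream fout("tokenizer.dat", std::ios::binary);
            serialize(tok, fout);
        }
        dlib::bpe_tokenizer restored;
        {
            std::ifstream fin("tokenizer.dat", std::ios::binary);
            deserialize(restored, fin);
        }
        return 0;
    }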