From 2e4f35337ac87dca27a183cb86211501233e99bc Mon Sep 17 00:00:00 2001
From: Nicholas Long <19273992+cptspacemanspiff@users.noreply.github.com>
Date: Sat, 25 Jan 2025 15:26:17 -0800
Subject: [PATCH 1/3] Add HFTokenizer header

---
 include/tokenizers_cpp.h | 42 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/include/tokenizers_cpp.h b/include/tokenizers_cpp.h
index d37aa57..6d6ca61 100644
--- a/include/tokenizers_cpp.h
+++ b/include/tokenizers_cpp.h
@@ -106,5 +106,47 @@ class Tokenizer {
   static std::unique_ptr<Tokenizer> FromBlobRWKVWorld(const std::string& model_blob);
 };
 
+#include <tokenizers_c.h>
+
+class HFTokenizer : public Tokenizer {
+ public:
+  explicit HFTokenizer(TokenizerHandle handle) : handle_(handle) {
+#ifdef COMPILE_WASM_RUNTIME
+    setenv("TOKENIZERS_PARALLELISM", "false", true);
+#endif
+  }
+
+  HFTokenizer(const HFTokenizer&) = delete;
+  HFTokenizer(HFTokenizer&& other);
+
+  ~HFTokenizer();
+
+  // use i32 to be consistent with sentencepiece
+  std::vector<int32_t> Encode(const std::string& text, bool add_special_tokens);
+
+  // use i32 to be consistent with sentencepiece
+  std::vector<int32_t> Encode(const std::string& text) final;
+
+  std::vector<std::vector<int32_t>> EncodeBatch(const std::vector<std::string>& texts,
+                                                bool add_special_tokens);
+
+  std::vector<std::vector<int32_t>> EncodeBatch(const std::vector<std::string>& texts) final;
+
+  // use i32 to be consistent with sentencepiece
+  std::string Decode(const std::vector<int32_t>& ids, bool skip_special_tokens);
+
+  std::string Decode(const std::vector<int32_t>& ids) final;
+
+  size_t GetVocabSize() final;
+
+  std::string IdToToken(int32_t id) final;
+
+  int32_t TokenToId(const std::string& token) final;
+
+ private:
+  // internal handle
+  TokenizerHandle handle_{nullptr};
+};
+
 }  // namespace tokenizers
 #endif  // TOKENIZERS_CPP_H_

From 434e6b2c2b699bbea796f8dfbda477c3820adba2 Mon Sep 17 00:00:00 2001
From: Nicholas Long <19273992+cptspacemanspiff@users.noreply.github.com>
Date: Sat, 25 Jan 2025 15:49:19 -0800
Subject: [PATCH 2/3] Move the HFTokenizer definitions into the source file

---
 include/tokenizers_cpp.h     |  10 +--
 src/huggingface_tokenizer.cc | 163 ++++++++++++++++++-----------------
 2 files changed, 84 insertions(+), 89 deletions(-)

diff --git a/include/tokenizers_cpp.h b/include/tokenizers_cpp.h
index 6d6ca61..c1a9f31 100644
--- a/include/tokenizers_cpp.h
+++ b/include/tokenizers_cpp.h
@@ -9,7 +9,7 @@
 #include <memory>
 #include <string>
 #include <vector>
-
+#include <tokenizers_c.h>
 namespace tokenizers {
 
 /*!
@@ -106,15 +106,9 @@ class Tokenizer {
   static std::unique_ptr<Tokenizer> FromBlobRWKVWorld(const std::string& model_blob);
 };
 
-#include <tokenizers_c.h>
-
 class HFTokenizer : public Tokenizer {
  public:
-  explicit HFTokenizer(TokenizerHandle handle) : handle_(handle) {
-#ifdef COMPILE_WASM_RUNTIME
-    setenv("TOKENIZERS_PARALLELISM", "false", true);
-#endif
-  }
+  explicit HFTokenizer(TokenizerHandle handle);
 
   HFTokenizer(const HFTokenizer&) = delete;
   HFTokenizer(HFTokenizer&& other);
diff --git a/src/huggingface_tokenizer.cc b/src/huggingface_tokenizer.cc
index 6cbe0d8..d6c94a4 100644
--- a/src/huggingface_tokenizer.cc
+++ b/src/huggingface_tokenizer.cc
@@ -13,100 +13,101 @@ namespace tokenizers {
 
 /*!
  * \brief A simple c++ header of tokenizer via C API.
  */
-class HFTokenizer : public Tokenizer {
- public:
-  explicit HFTokenizer(TokenizerHandle handle) : handle_(handle) {
-  #ifdef COMPILE_WASM_RUNTIME
-    setenv("TOKENIZERS_PARALLELISM", "false", true);
-  #endif
-  }
-  HFTokenizer(const HFTokenizer&) = delete;
-  HFTokenizer(HFTokenizer&& other) { std::swap(other.handle_, handle_); }
+/*
+Definitions of the HFTokenizer methods declared in tokenizers_cpp.h.
+The class definition itself now lives in the header.
+*/
 
-  ~HFTokenizer() {
-    if (handle_ != nullptr) {
-      tokenizers_free(handle_);
-    }
-  }
+HFTokenizer::HFTokenizer(TokenizerHandle handle) : handle_(handle) {
+#ifdef COMPILE_WASM_RUNTIME
+  setenv("TOKENIZERS_PARALLELISM", "false", true);
+#endif
+}
 
-  // use i32 to be consistent with sentencepiece
-  std::vector<int32_t> Encode(const std::string& text, bool add_special_tokens) {
-    TokenizerEncodeResult result;
-    tokenizers_encode(handle_, text.data(), text.length(), static_cast<int>(add_special_tokens),
-                      &result);
-    std::vector<int32_t> ret(result.token_ids, result.token_ids + result.len);
-    tokenizers_free_encode_results(&result, 1);
-    return ret;
-  }
+// Copy construction is disabled; the copy constructor is deleted in the header.
+HFTokenizer::HFTokenizer(HFTokenizer&& other) { std::swap(other.handle_, handle_); }
 
-  // use i32 to be consistent with sentencepiece
-  std::vector<int32_t> Encode(const std::string& text) final { return Encode(text, false); }
-
-  std::vector<std::vector<int32_t>> EncodeBatch(const std::vector<std::string>& texts,
-                                                bool add_special_tokens) {
-    std::vector<const char*> texts_raw;
-    std::vector<size_t> seq_lens;
-    size_t num_seqs = texts.size();
-    texts_raw.reserve(num_seqs);
-    seq_lens.reserve(num_seqs);
-    for (const auto& text : texts) {
-      texts_raw.push_back(text.data());
-      seq_lens.push_back(text.length());
-    }
-    std::vector<TokenizerEncodeResult> results(num_seqs);
-    tokenizers_encode_batch(handle_, texts_raw.data(), seq_lens.data(), texts.size(),
-                            static_cast<int>(add_special_tokens), results.data());
-    std::vector<std::vector<int32_t>> ret;
-    ret.reserve(texts.size());
-    for (size_t i = 0; i < texts.size(); ++i) {
-      ret.push_back(
-          std::vector<int32_t>(results[i].token_ids, results[i].token_ids + results[i].len));
-    }
-    tokenizers_free_encode_results(results.data(), texts.size());
-    return ret;
-  }
+HFTokenizer::~HFTokenizer() {
+  if (handle_ != nullptr) {
+    tokenizers_free(handle_);
+  }
+}
 
-  std::vector<std::vector<int32_t>> EncodeBatch(const std::vector<std::string>& texts) final {
-    return EncodeBatch(texts, false);
-  }
+// use i32 to be consistent with sentencepiece
+std::vector<int32_t> HFTokenizer::Encode(const std::string& text, bool add_special_tokens) {
+  TokenizerEncodeResult result;
+  tokenizers_encode(handle_, text.data(), text.length(), static_cast<int>(add_special_tokens),
+                    &result);
+  std::vector<int32_t> ret(result.token_ids, result.token_ids + result.len);
+  tokenizers_free_encode_results(&result, 1);
+  return ret;
+}
 
-  // use i32 to be consistent with sentencepiece
-  std::string Decode(const std::vector<int32_t>& ids, bool skip_special_tokens) {
-    tokenizers_decode(handle_, reinterpret_cast<const uint32_t*>(ids.data()), ids.size(),
-                      static_cast<int>(skip_special_tokens));
-    const char* data;
-    size_t len;
-    tokenizers_get_decode_str(handle_, &data, &len);
-    return std::string(data, len);
-  }
+// use i32 to be consistent with sentencepiece
+std::vector<int32_t> HFTokenizer::Encode(const std::string& text) { return Encode(text, false); }
+
+std::vector<std::vector<int32_t>> HFTokenizer::EncodeBatch(const std::vector<std::string>& texts,
+                                                           bool add_special_tokens) {
+  std::vector<const char*> texts_raw;
+  std::vector<size_t> seq_lens;
+  size_t num_seqs = texts.size();
+  texts_raw.reserve(num_seqs);
+  seq_lens.reserve(num_seqs);
+  for (const auto& text : texts) {
+    texts_raw.push_back(text.data());
+    seq_lens.push_back(text.length());
+  }
+  std::vector<TokenizerEncodeResult> results(num_seqs);
+  tokenizers_encode_batch(handle_, texts_raw.data(), seq_lens.data(), texts.size(),
+                          static_cast<int>(add_special_tokens), results.data());
+  std::vector<std::vector<int32_t>> ret;
+  ret.reserve(texts.size());
+  for (size_t i = 0; i < texts.size(); ++i) {
+    ret.push_back(
+        std::vector<int32_t>(results[i].token_ids, results[i].token_ids + results[i].len));
+  }
+  tokenizers_free_encode_results(results.data(), texts.size());
+  return ret;
+}
+
+std::vector<std::vector<int32_t>> HFTokenizer::EncodeBatch(const std::vector<std::string>& texts) {
+  return EncodeBatch(texts, false);
+}
 
-  std::string Decode(const std::vector<int32_t>& ids) final { return Decode(ids, false); }
+// use i32 to be consistent with sentencepiece
+std::string HFTokenizer::Decode(const std::vector<int32_t>& ids, bool skip_special_tokens) {
+  tokenizers_decode(handle_, reinterpret_cast<const uint32_t*>(ids.data()), ids.size(),
+                    static_cast<int>(skip_special_tokens));
+  const char* data;
+  size_t len;
+  tokenizers_get_decode_str(handle_, &data, &len);
+  return std::string(data, len);
+}
 
-  size_t GetVocabSize() final {
-    size_t size;
-    tokenizers_get_vocab_size(handle_, &size);
-    assert(size > 0);
-    return size;
-  }
+std::string HFTokenizer::Decode(const std::vector<int32_t>& ids) { return Decode(ids, false); }
 
-  std::string IdToToken(int32_t id) final {
-    const char* data;
-    size_t len;
-    tokenizers_id_to_token(handle_, static_cast<uint32_t>(id), &data, &len);
-    return std::string(data, len);
-  }
+size_t HFTokenizer::GetVocabSize() {
+  size_t size;
+  tokenizers_get_vocab_size(handle_, &size);
+  assert(size > 0);
+  return size;
+}
 
-  int32_t TokenToId(const std::string& token) final {
-    int32_t id;
-    tokenizers_token_to_id(handle_, token.data(), token.length(), &id);
-    return id;
-  }
+std::string HFTokenizer::IdToToken(int32_t id) {
+  const char* data;
+  size_t len;
+  tokenizers_id_to_token(handle_, static_cast<uint32_t>(id), &data, &len);
+  return std::string(data, len);
+}
+
+int32_t HFTokenizer::TokenToId(const std::string& token) {
+  int32_t id;
+  tokenizers_token_to_id(handle_, token.data(), token.length(), &id);
+  return id;
+}
+
+// These are factory methods defined in the base class Tokenizer:
 
- private:
-  // internal handle
-  TokenizerHandle handle_{nullptr};
-};
-
 std::unique_ptr<Tokenizer> Tokenizer::FromBlobJSON(const std::string& json) {
   return std::make_unique<HFTokenizer>(tokenizers_new_from_str(json.data(), json.length()));

From 6be4671261be59319817f26b327548c060a02fce Mon Sep 17 00:00:00 2001
From: Nicholas Long <19273992+cptspacemanspiff@users.noreply.github.com>
Date: Sat, 25 Jan 2025 16:20:07 -0800
Subject: [PATCH 3/3] Add factories to HFTokenizer; the Tokenizer factories
 call them

---
 include/tokenizers_cpp.h     | 23 +++++++++++++++++++++++
 src/huggingface_tokenizer.cc | 15 +++++++++++++--
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/include/tokenizers_cpp.h b/include/tokenizers_cpp.h
index c1a9f31..9eaf1ac 100644
--- a/include/tokenizers_cpp.h
+++ b/include/tokenizers_cpp.h
@@ -121,6 +121,7 @@ class HFTokenizer : public Tokenizer {
   // use i32 to be consistent with sentencepiece
   std::vector<int32_t> Encode(const std::string& text) final;
 
+  // version specific to HFTokenizer, which adds the special tokens flag
   std::vector<std::vector<int32_t>> EncodeBatch(const std::vector<std::string>& texts,
                                                 bool add_special_tokens);
 
@@ -137,6 +138,28 @@ class HFTokenizer : public Tokenizer {
 
   int32_t TokenToId(const std::string& token) final;
 
+
+  /*!
+   * \brief Create HF tokenizer from a single in-memory json blob.
+   *
+   * \param json_blob The json blob.
+   * \return The created tokenizer.
+   */
+  static std::unique_ptr<HFTokenizer> FromBlobJSON(const std::string& json_blob);
+
+  /*!
+   * \brief Create BPE tokenizer
+   *
+   * \param vocab_blob The blob that contains vocabs.
+   * \param merges_blob The blob that contains the merges.
+   * \param added_tokens The added tokens.
+   * \return The created tokenizer.
+   */
+  static std::unique_ptr<HFTokenizer> FromBlobByteLevelBPE(const std::string& vocab_blob,
+                                                           const std::string& merges_blob,
+                                                           const std::string& added_tokens = "");
+
+
  private:
   // internal handle
   TokenizerHandle handle_{nullptr};
diff --git a/src/huggingface_tokenizer.cc b/src/huggingface_tokenizer.cc
index d6c94a4..d906b86 100644
--- a/src/huggingface_tokenizer.cc
+++ b/src/huggingface_tokenizer.cc
@@ -109,15 +109,26 @@ int32_t HFTokenizer::TokenToId(const std::string& token) {
 // These are factory methods defined in the base class Tokenizer:
 
-std::unique_ptr<Tokenizer> Tokenizer::FromBlobJSON(const std::string& json) {
+std::unique_ptr<HFTokenizer> HFTokenizer::FromBlobJSON(const std::string& json) {
   return std::make_unique<HFTokenizer>(tokenizers_new_from_str(json.data(), json.length()));
 }
 
-std::unique_ptr<Tokenizer> Tokenizer::FromBlobByteLevelBPE(const std::string& vocab,
+std::unique_ptr<Tokenizer> Tokenizer::FromBlobJSON(const std::string& json) {
+  return HFTokenizer::FromBlobJSON(json);
+}
+
+std::unique_ptr<HFTokenizer> HFTokenizer::FromBlobByteLevelBPE(const std::string& vocab,
                                                                const std::string& merges,
                                                                const std::string& added_tokens) {
   return std::make_unique<HFTokenizer>(byte_level_bpe_tokenizers_new_from_str(
       vocab.data(), vocab.length(), merges.data(), merges.length(), added_tokens.data(),
       added_tokens.length()));
 }
+
+std::unique_ptr<Tokenizer> Tokenizer::FromBlobByteLevelBPE(const std::string& vocab,
+                                                           const std::string& merges,
+                                                           const std::string& added_tokens) {
+  return HFTokenizer::FromBlobByteLevelBPE(vocab, merges, added_tokens);
+}
 
 }  // namespace tokenizers
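
A minimal usage sketch of the resulting API, assuming all three patches are
applied. LoadBytesFromFile is a hypothetical helper and "tokenizer.json" an
illustrative path; neither is part of the series:

// usage_sketch.cc -- illustration only, not part of the patches above.
#include <tokenizers_cpp.h>

#include <cstdint>
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical helper: read an entire file into a string.
static std::string LoadBytesFromFile(const std::string& path) {
  std::ifstream fs(path, std::ios::in | std::ios::binary);
  std::ostringstream os;
  os << fs.rdbuf();
  return os.str();
}

int main() {
  // Contents of a HuggingFace tokenizer.json, loaded into memory.
  std::string blob = LoadBytesFromFile("tokenizer.json");

  // New derived-class factory (PATCH 3/3): returns a concrete HFTokenizer,
  // so the add_special_tokens overloads are reachable without a downcast.
  std::unique_ptr<tokenizers::HFTokenizer> hf = tokenizers::HFTokenizer::FromBlobJSON(blob);
  std::vector<int32_t> ids = hf->Encode("Hello world", /*add_special_tokens=*/true);

  // The base-class factory keeps its old signature and now simply forwards
  // to HFTokenizer::FromBlobJSON.
  std::unique_ptr<tokenizers::Tokenizer> base = tokenizers::Tokenizer::FromBlobJSON(blob);
  std::cout << base->Decode(ids) << std::endl;
  return 0;
}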