Skip to content

Commit c0fab1e

Browse files
authored
[CMake] Support disabling the SentencePiece tokenizer (#38)
This PR adds support for turning off the SentencePiece tokenizer to reduce binary size.
1 parent eec9d68 commit c0fab1e

File tree

2 files changed

+13
-0
lines changed

2 files changed

+13
-0
lines changed

CMakeLists.txt

+5
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@ set(TOKENIZERS_CPP_CARGO_SOURCE_PATH ${TOKENIZERS_CPP_ROOT}/rust)
8888
option(MSGPACK_USE_BOOST "Use Boost libraried" OFF)
8989
add_subdirectory(msgpack)
9090

91+
option(MLC_ENABLE_SENTENCEPIECE_TOKENIZER "Enable SentencePiece tokenizer" OFF)
92+
9193
if(MSVC)
9294
set(TOKENIZERS_RUST_LIB "${TOKENIZERS_CPP_CARGO_BINARY_DIR}/tokenizers_c.lib")
9395
else()
@@ -120,6 +122,9 @@ add_library(tokenizer_cpp_objs OBJECT ${TOKENIZER_CPP_SRCS})
120122
target_include_directories(tokenizer_cpp_objs PRIVATE sentencepiece/src)
121123
target_include_directories(tokenizer_cpp_objs PRIVATE msgpack/include)
122124
target_include_directories(tokenizer_cpp_objs PUBLIC ${TOKENIZERS_CPP_INCLUDE})
125+
# Export the MLC_ENABLE_SENTENCEPIECE_TOKENIZER preprocessor macro when the
# option of the same name is turned on. The option is tested with CMake's
# boolean evaluation rather than a string comparison against "ON", so every
# true constant a user may pass (ON, 1, YES, TRUE, Y, in any letter case)
# enables the tokenizer instead of being silently ignored.
if (MLC_ENABLE_SENTENCEPIECE_TOKENIZER)
  target_compile_definitions(tokenizer_cpp_objs PUBLIC MLC_ENABLE_SENTENCEPIECE_TOKENIZER)
endif ()
123128
target_link_libraries(tokenizer_cpp_objs PRIVATE msgpack-cxx)
124129

125130
# sentencepiece config

src/sentencepiece_tokenizer.cc

+8
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
namespace tokenizers {
1212

13+
#ifdef MLC_ENABLE_SENTENCEPIECE_TOKENIZER
1314
class SentencePieceTokenizer : public Tokenizer {
1415
public:
1516
explicit SentencePieceTokenizer(const std::string& model_blob) {
@@ -46,4 +47,11 @@ class SentencePieceTokenizer : public Tokenizer {
4647
std::unique_ptr<Tokenizer> Tokenizer::FromBlobSentencePiece(const std::string& model_blob) {
4748
return std::make_unique<SentencePieceTokenizer>(model_blob);
4849
}
50+
#else
51+
// Stub compiled when MLC_ENABLE_SENTENCEPIECE_TOKENIZER is not defined.
//
// No SentencePiece implementation exists in such a build, so this function
// always fails. It reports the failure with a real exception: an assert is
// compiled out under NDEBUG, and a bare `throw;` with no exception in flight
// calls std::terminate() without telling the user what went wrong.
//
// model_blob: serialized model data; unused here.
// Throws: std::runtime_error, unconditionally.
// NOTE: relies on <stdexcept> being included by this translation unit.
std::unique_ptr<Tokenizer> Tokenizer::FromBlobSentencePiece(const std::string& model_blob) {
  (void)model_blob;  // Intentionally unused: there is no tokenizer to build.
  throw std::runtime_error(
      "SentencePiece tokenizer is not enabled in this build; reconfigure with "
      "-DMLC_ENABLE_SENTENCEPIECE_TOKENIZER=ON");
}
55+
#endif // MLC_ENABLE_SENTENCEPIECE_TOKENIZER
56+
4957
} // namespace tokenizers

0 commit comments

Comments
 (0)