Skip to content

Commit d5dd7f4

Browse files
authored
[Web] Set TOKENIZERS_PARALLELISM to false for HFTokenizer (#42)
This PR sets the TOKENIZERS_PARALLELISM environment variable to false when compiling for the web; huggingface/tokenizers reads this variable at runtime.
1 parent c0fab1e commit d5dd7f4

File tree

3 files changed

+19
-2
lines changed

3 files changed

+19
-2
lines changed

src/huggingface_tokenizer.cc

+5-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,11 @@ namespace tokenizers {
1515
*/
1616
class HFTokenizer : public Tokenizer {
1717
public:
18-
explicit HFTokenizer(TokenizerHandle handle) : handle_(handle) {}
18+
explicit HFTokenizer(TokenizerHandle handle) : handle_(handle) {
19+
#ifdef COMPILE_WASM_RUNTIME
20+
setenv("TOKENIZERS_PARALLELISM", "false", true);
21+
#endif
22+
}
1923

2024
HFTokenizer(const HFTokenizer&) = delete;
2125
HFTokenizer(HFTokenizer&& other) { std::swap(other.handle_, handle_); }

web/build.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ rustup target add wasm32-unknown-emscripten
55

66
mkdir -p build
77
cd build
8-
emcmake cmake ../.. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS="-O3"
8+
emcmake cmake ../.. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS="-O3 -DCOMPILE_WASM_RUNTIME"
99
emmake make tokenizers_cpp tokenizers_c sentencepiece-static -j8
1010
cd ..
1111

web/tests/src/index.ts

+13
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,21 @@ async function testLlamaTokenizer() {
4848
}
4949
}
5050

51+
// Without COMPILE_WASM_RUNTIME, this triggers parallel processing, which leads to an error
52+
async function testBertTokenizer() {
53+
console.log("Bert Tokenizer");
54+
const modelBuffer = await (await
55+
fetch("https://huggingface.co/Snowflake/snowflake-arctic-embed-l/raw/main/tokenizer.json")
56+
).arrayBuffer();
57+
const tok = await Tokenizer.fromJSON(modelBuffer);
58+
const text = "What is the capital of Canada?";
59+
const ids = tok.encode(text);
60+
console.log(ids);
61+
}
62+
5163
async function main() {
5264
await testJSONTokenizer()
65+
await testBertTokenizer();
5366
await testLlamaTokenizer()
5467
}
5568

0 commit comments

Comments
 (0)