Skip to content

Commit 23e8fcf

Browse files
committed
Implement a decoding tokenizer
Signed-off-by: Simon Wülker <[email protected]>
1 parent 559f96a commit 23e8fcf

File tree

18 files changed

+500
-38
lines changed

18 files changed

+500
-38
lines changed

html5ever/Cargo.toml

+3
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,16 @@ readme = "../README.md"
1313
rust-version.workspace = true
1414

1515
[features]
16+
default = ["encoding"]
1617
trace_tokenizer = []
18+
encoding = ["dep:encoding_rs", "markup5ever/encoding"]
1719

1820
[dependencies]
1921
log = "0.4"
2022
mac = "0.1"
2123
markup5ever = { version = "0.16", path = "../markup5ever" }
2224
match_token = { workspace = true }
25+
encoding_rs = { version = "0.8", optional = true }
2326

2427
[dev-dependencies]
2528
criterion = "0.5"

html5ever/examples/noop-tokenize.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ use std::cell::RefCell;
1515
use std::io;
1616

1717
use html5ever::tendril::*;
18-
use html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer};
18+
use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer};
19+
use markup5ever::buffer_queue::BufferQueue;
1920

2021
/// In our case, our sink only contains a tokens vector
2122
struct Sink(RefCell<Vec<Token>>);

html5ever/examples/tokenize.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,11 @@ use std::cell::Cell;
1313
use std::io;
1414

1515
use html5ever::tendril::*;
16-
use html5ever::tokenizer::BufferQueue;
1716
use html5ever::tokenizer::{CharacterTokens, EndTag, NullCharacterToken, StartTag, TagToken};
1817
use html5ever::tokenizer::{
1918
ParseError, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
2019
};
20+
use markup5ever::buffer_queue::BufferQueue;
2121

2222
#[derive(Clone)]
2323
struct TokenPrinter {

html5ever/src/tokenizer/char_ref/mod.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@
88
// except according to those terms.
99

1010
use super::{TokenSink, Tokenizer};
11-
use crate::buffer_queue::BufferQueue;
1211
use crate::data;
1312
use crate::tendril::StrTendril;
1413

1514
use log::debug;
1615
use mac::format_if;
16+
use markup5ever::buffer_queue::BufferQueue;
1717
use std::borrow::Cow::Borrowed;
1818
use std::char::from_u32;
1919

html5ever/src/tokenizer/interface.rs

+2
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ pub enum TokenSinkResult<Handle> {
7777
Script(Handle),
7878
Plaintext,
7979
RawData(states::RawKind),
80+
#[cfg(feature = "encoding")]
81+
MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
8082
}
8183

8284
/// Types which can receive tokens from the tokenizer.

html5ever/src/tokenizer/mod.rs

+38-8
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,18 @@ use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
2222
use self::char_ref::{CharRef, CharRefTokenizer};
2323

2424
use crate::util::str::lower_ascii_letter;
25-
2625
use log::{debug, trace};
2726
use mac::format_if;
28-
use markup5ever::{ns, small_char_set, TokenizerResult};
27+
use markup5ever::{
28+
buffer_queue::BufferQueue, namespace_url, ns, small_char_set, InputSink, InputSinkResult,
29+
TokenizerResult,
30+
};
2931
use std::borrow::Cow::{self, Borrowed};
3032
use std::cell::{Cell, RefCell, RefMut};
3133
use std::collections::BTreeMap;
32-
use std::mem;
34+
use std::{iter, mem};
3335

34-
pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
36+
pub use crate::buffer_queue::{FromSet, NotFromSet, SetResult};
3537
use crate::tendril::StrTendril;
3638
use crate::{Attribute, LocalName, QualName, SmallCharSet};
3739

@@ -43,6 +45,8 @@ pub enum ProcessResult<Handle> {
4345
Continue,
4446
Suspend,
4547
Script(Handle),
48+
#[cfg(feature = "encoding")]
49+
MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
4650
}
4751

4852
fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
@@ -357,6 +361,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
357361
ProcessResult::Continue => (),
358362
ProcessResult::Suspend => break,
359363
ProcessResult::Script(node) => return TokenizerResult::Script(node),
364+
#[cfg(feature = "encoding")]
365+
ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => {
366+
return TokenizerResult::MaybeChangeEncodingAndStartOver(encoding)
367+
},
360368
}
361369
}
362370
} else {
@@ -365,6 +373,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
365373
ProcessResult::Continue => (),
366374
ProcessResult::Suspend => break,
367375
ProcessResult::Script(node) => return TokenizerResult::Script(node),
376+
#[cfg(feature = "encoding")]
377+
ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => {
378+
return TokenizerResult::MaybeChangeEncodingAndStartOver(encoding)
379+
},
368380
}
369381
}
370382
}
@@ -456,6 +468,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
456468
self.state.set(states::RawData(kind));
457469
ProcessResult::Continue
458470
},
471+
#[cfg(feature = "encoding")]
472+
TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding) => {
473+
ProcessResult::MaybeChangeEncodingAndStartOver(encoding)
474+
},
459475
}
460476
}
461477

@@ -1680,6 +1696,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
16801696
ProcessResult::Continue => (),
16811697
ProcessResult::Suspend => break,
16821698
ProcessResult::Script(_) => unreachable!(),
1699+
#[cfg(feature = "encoding")]
1700+
ProcessResult::MaybeChangeEncodingAndStartOver(_) => unreachable!(),
16831701
}
16841702
}
16851703

@@ -1841,13 +1859,27 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
18411859
}
18421860
}
18431861

1862+
impl<Sink> InputSink for Tokenizer<Sink>
1863+
where
1864+
Sink: TokenSink,
1865+
{
1866+
type Handle = Sink::Handle;
1867+
1868+
fn feed<'a>(
1869+
&'a self,
1870+
input: &'a BufferQueue,
1871+
) -> impl Iterator<Item = InputSinkResult<Self::Handle>> + 'a {
1872+
iter::from_fn(|| self.feed(input).into())
1873+
}
1874+
}
1875+
18441876
#[cfg(test)]
18451877
#[allow(non_snake_case)]
18461878
mod test {
18471879
use super::option_push; // private items
1848-
use crate::tendril::{SliceExt, StrTendril};
1849-
18501880
use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
1881+
use crate::tendril::{SliceExt, StrTendril};
1882+
use crate::LocalName;
18511883

18521884
use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
18531885
use super::interface::{EndTag, StartTag, Tag, TagKind};
@@ -1856,8 +1888,6 @@ mod test {
18561888
use markup5ever::buffer_queue::BufferQueue;
18571889
use std::cell::RefCell;
18581890

1859-
use crate::LocalName;
1860-
18611891
// LinesMatch implements the TokenSink trait. It is used for testing to see
18621892
// if current_line is being updated when process_token is called. The lines
18631893
// vector is a collection of the line numbers that each token is on.

html5ever/src/tree_builder/mod.rs

+4
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,10 @@ where
396396
assert!(more_tokens.is_empty());
397397
return tokenizer::TokenSinkResult::RawData(k);
398398
},
399+
#[cfg(feature = "encoding")]
400+
ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => {
401+
return tokenizer::TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding);
402+
},
399403
}
400404
}
401405
}

html5ever/src/tree_builder/rules.rs

+25-9
Original file line numberDiff line numberDiff line change
@@ -10,21 +10,24 @@
1010
// The tree builder rules, as a single, enormous nested match expression.
1111

1212
use crate::interface::Quirks;
13-
use crate::tokenizer::states::{Rawtext, Rcdata, ScriptData};
13+
use crate::tokenizer::states::{Rawtext, Rcdata};
1414
use crate::tokenizer::TagKind::{EndTag, StartTag};
1515
use crate::tree_builder::tag_sets::*;
1616
use crate::tree_builder::types::*;
17-
use crate::tree_builder::{
18-
create_element, html_elem, ElemName, NodeOrText::AppendNode, StrTendril, Tag, TreeBuilder,
19-
TreeSink,
20-
};
21-
use crate::QualName;
22-
use markup5ever::{expanded_name, local_name, ns};
17+
use crate::tree_builder::RawKind::ScriptData;
18+
use crate::tree_builder::{html_elem, ElemName, StrTendril, Tag, TreeBuilder, TreeSink};
19+
20+
use markup5ever::interface::create_element;
21+
use markup5ever::interface::NodeOrText::AppendNode;
22+
use markup5ever::{expanded_name, local_name, namespace_url, ns, QualName};
2323
use std::borrow::Cow::Borrowed;
2424

2525
use crate::tendril::SliceExt;
2626
use match_token::match_token;
2727

28+
#[cfg(feature = "encoding")]
29+
use encoding_rs::Encoding;
30+
2831
fn any_not_whitespace(x: &StrTendril) -> bool {
2932
// FIXME: this might be much faster as a byte scan
3033
x.chars().any(|c| !c.is_ascii_whitespace())
@@ -113,8 +116,21 @@ where
113116

114117
<html> => self.step(InsertionMode::InBody, token),
115118

116-
tag @ <base> <basefont> <bgsound> <link> <meta> => {
117-
// FIXME: handle <meta charset=...> and <meta http-equiv="Content-Type">
119+
tag @ <meta> => {
120+
// FIXME: handle <meta http-equiv="Content-Type">
121+
#[cfg(feature = "encoding")]
122+
if let Some(charset) = tag.attrs.iter().find(|a| a.name == QualName::new(None, ns!(html), local_name!("charset"))) {
123+
if let Some(encoding) = Encoding::for_label(charset.value.as_bytes()) {
124+
self.insert_and_pop_element_for(tag);
125+
return ProcessResult::MaybeChangeEncodingAndStartOver(encoding);
126+
}
127+
}
128+
129+
self.insert_and_pop_element_for(tag);
130+
ProcessResult::DoneAckSelfClosing
131+
},
132+
133+
tag @ <base> <basefont> <bgsound> <link> => {
118134
self.insert_and_pop_element_for(tag);
119135
ProcessResult::DoneAckSelfClosing
120136
}

html5ever/src/tree_builder/types.rs

+2
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ pub(crate) enum ProcessResult<Handle> {
7070
Script(Handle),
7171
ToPlaintext,
7272
ToRawData(RawKind),
73+
#[cfg(feature = "encoding")]
74+
MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
7375
}
7476

7577
pub(crate) enum FormatEntry<Handle> {

markup5ever/Cargo.toml

+9-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,15 @@ rust-version.workspace = true
1313
[lib]
1414
path = "lib.rs"
1515

16+
[features]
17+
encoding = ["dep:encoding_rs"]
18+
1619
[dependencies]
1720
web_atoms = { version = "0.1", path = "../web_atoms" }
1821
tendril = "0.4"
19-
log = "0.4"
22+
log = "0.4"
23+
encoding_rs = { version = "0.8", optional = true }
24+
25+
[build-dependencies]
26+
string_cache_codegen = "0.5.4"
27+
phf_codegen = "0.11"

0 commit comments

Comments (0)