Skip to content

Commit f3affa9

Browse files
committed
Add ParserState method to get current utf16 position
This adds a `utf16_position` method on `ParserState`, exposing a `current_position` field that we compute. The implementation closely follows what was done to compute the UTF-16 column position.
1 parent 8f13fc9 commit f3affa9

File tree

4 files changed

+49
-22
lines changed

4 files changed

+49
-22
lines changed

src/parser.rs

+7
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ use std::ops::Range;
1818
pub struct ParserState {
1919
pub(crate) position: usize,
2020
pub(crate) current_line_start_position: usize,
21+
pub(crate) current_position: usize,
2122
pub(crate) current_line_number: u32,
2223
pub(crate) at_start_of: Option<BlockType>,
2324
}
@@ -37,6 +38,12 @@ impl ParserState {
3738
column: (self.position - self.current_line_start_position + 1) as u32,
3839
}
3940
}
41+
42+
/// The position from the start of the input, counted in UTF-16 code units
43+
#[inline]
44+
pub fn utf16_position(&self) -> u32 {
45+
self.current_position as u32
46+
}
4047
}
4148

4249
/// When parsing until a given token, sometimes the caller knows that parsing is going to restart

src/size_of_tests.rs

+3-3
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,11 @@ size_of_test!(token, Token, 32);
4242
size_of_test!(std_cow_str, std::borrow::Cow<'static, str>, 24, 32);
4343
size_of_test!(cow_rc_str, CowRcStr, 16);
4444

45-
size_of_test!(tokenizer, crate::tokenizer::Tokenizer, 72);
46-
size_of_test!(parser_input, crate::parser::ParserInput, 136);
45+
size_of_test!(tokenizer, crate::tokenizer::Tokenizer, 80);
46+
size_of_test!(parser_input, crate::parser::ParserInput, 152);
4747
size_of_test!(parser, crate::parser::Parser, 16);
4848
size_of_test!(source_position, crate::SourcePosition, 8);
49-
size_of_test!(parser_state, crate::ParserState, 24);
49+
size_of_test!(parser_state, crate::ParserState, 32);
5050

5151
size_of_test!(basic_parse_error, crate::BasicParseError, 40, 48);
5252
size_of_test!(parse_error_lower_bound, crate::ParseError<()>, 40, 48);

src/tests.rs

+20-17
Original file line numberDiff line numberDiff line change
@@ -1267,7 +1267,7 @@ fn roundtrip_percentage_token() {
12671267
}
12681268

12691269
#[test]
1270-
fn utf16_columns() {
1270+
fn utf16_columns_and_positions() {
12711271
// This particular test serves two purposes. First, it checks
12721272
// that the column number computations are correct. Second, it
12731273
// checks that tokenizer code paths correctly differentiate
@@ -1278,24 +1278,26 @@ fn utf16_columns() {
12781278
// the column is in units of UTF-16, the 4-byte sequence results
12791279
// in two columns.
12801280
let tests = vec![
1281-
("", 1),
1282-
("ascii", 6),
1283-
("/*QΡ✈🆒*/", 10),
1284-
("'QΡ✈🆒*'", 9),
1285-
("\"\\\"'QΡ✈🆒*'", 12),
1286-
("\\Q\\Ρ\\\\🆒", 10),
1287-
("QΡ✈🆒", 6),
1288-
("QΡ✈🆒\\Q\\Ρ\\\\🆒", 15),
1289-
("newline\r\nQΡ✈🆒", 6),
1290-
("url(QΡ✈🆒\\Q\\Ρ\\\\🆒)", 20),
1291-
("url(QΡ✈🆒)", 11),
1292-
("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒)", 16),
1293-
("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒", 15),
1294-
("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒 x", 17),
1295-
("QΡ✈🆒()", 8),
1281+
("", 1, 0),
1282+
("ascii", 6, 5),
1283+
("/*QΡ✈🆒*/", 10, 9),
1284+
("/*QΡ✈\r\n🆒*/", 5, 11),
1285+
("'QΡ✈🆒*'", 9, 8),
1286+
("\"\\\"'QΡ✈🆒*'", 12, 11),
1287+
("\\Q\\Ρ\\\\🆒", 10, 9),
1288+
("QΡ✈🆒", 6, 5),
1289+
("QΡ✈🆒\\Q\\Ρ\\\\🆒", 15, 14),
1290+
("newline\r\nQΡ✈🆒", 6, 14),
1291+
("url(QΡ✈🆒\\Q\\Ρ\\\\🆒)", 20, 19),
1292+
("url(QΡ✈🆒)", 11, 10),
1293+
("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒)", 16, 21),
1294+
("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒", 15, 20),
1295+
("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒 x", 17, 22),
1296+
("url( \tQ)", 10, 9),
1297+
("QΡ✈🆒()", 8, 7),
12961298
// Test that under/over-flow of current_line_start_position is
12971299
// handled properly; see the special case in consume_4byte_intro.
1298-
("🆒", 3),
1300+
("🆒", 3, 2),
12991301
];
13001302

13011303
for test in tests {
@@ -1321,6 +1323,7 @@ fn utf16_columns() {
13211323

13221324
// Check the resulting column.
13231325
assert_eq!(parser.current_source_location().column, test.1);
1326+
assert_eq!(parser.state().utf16_position(), test.2, "test: {}", test.0);
13241327
}
13251328
}
13261329

src/tokenizer.rs

+19-2
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,7 @@ pub struct Tokenizer<'a> {
214214
/// ensure that computing the column will give the result in units
215215
/// of UTF-16 characters.
216216
current_line_start_position: usize,
217+
current_position: usize,
217218
current_line_number: u32,
218219
var_or_env_functions: SeenStatus,
219220
source_map_url: Option<&'a str>,
@@ -234,6 +235,7 @@ impl<'a> Tokenizer<'a> {
234235
input,
235236
position: 0,
236237
current_line_start_position: 0,
238+
current_position: 0,
237239
current_line_number: 0,
238240
var_or_env_functions: SeenStatus::DontCare,
239241
source_map_url: None,
@@ -296,6 +298,7 @@ impl<'a> Tokenizer<'a> {
296298
ParserState {
297299
position: self.position,
298300
current_line_start_position: self.current_line_start_position,
301+
current_position: self.current_position,
299302
current_line_number: self.current_line_number,
300303
at_start_of: None,
301304
}
@@ -305,6 +308,7 @@ impl<'a> Tokenizer<'a> {
305308
pub fn reset(&mut self, state: &ParserState) {
306309
self.position = state.position;
307310
self.current_line_start_position = state.current_line_start_position;
311+
self.current_position = state.current_position;
308312
self.current_line_number = state.current_line_number;
309313
}
310314

@@ -370,6 +374,7 @@ impl<'a> Tokenizer<'a> {
370374
debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
371375
}
372376
}
377+
self.current_position = self.current_position.wrapping_add(n);
373378
self.position += n
374379
}
375380

@@ -392,6 +397,7 @@ impl<'a> Tokenizer<'a> {
392397
// This takes two UTF-16 characters to represent, so we
393398
// actually have an undercount.
394399
self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
400+
self.current_position = self.current_position.wrapping_add(2);
395401
self.position += 1;
396402
}
397403

@@ -417,10 +423,13 @@ impl<'a> Tokenizer<'a> {
417423
// This takes two UTF-16 characters to represent, so we
418424
// actually have an undercount.
419425
self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
426+
self.current_position = self.current_position.wrapping_add(2);
420427
} else if byte & 0xC0 == 0x80 {
421428
// Note that due to the special case for the 4-byte
422429
// sequence intro, we must use wrapping add here.
423430
self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
431+
} else {
432+
self.current_position = self.current_position.wrapping_add(1);
424433
}
425434
}
426435

@@ -439,8 +448,10 @@ impl<'a> Tokenizer<'a> {
439448
let byte = self.next_byte_unchecked();
440449
debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
441450
self.position += 1;
451+
self.current_position = self.current_position.wrapping_add(1);
442452
if byte == b'\r' && self.next_byte() == Some(b'\n') {
443453
self.position += 1;
454+
self.current_position = self.current_position.wrapping_add(1);
444455
}
445456
self.current_line_start_position = self.position;
446457
self.current_line_number += 1;
@@ -459,9 +470,11 @@ impl<'a> Tokenizer<'a> {
459470
self.position += len_utf8;
460471
// Note that due to the special case for the 4-byte sequence
461472
// intro, we must use wrapping add here.
473+
let len_utf16 = c.len_utf16();
462474
self.current_line_start_position = self
463475
.current_line_start_position
464-
.wrapping_add(len_utf8 - c.len_utf16());
476+
.wrapping_add(len_utf8 - len_utf16);
477+
self.current_position = self.current_position.wrapping_add(len_utf16);
465478
c
466479
}
467480

@@ -1151,12 +1164,16 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
11511164
}
11521165
};
11531166
match_byte! { b,
1154-
b' ' | b'\t' => {},
1167+
b' ' | b'\t' => {
1168+
tokenizer.current_position = tokenizer.current_position.wrapping_add(1);
1169+
},
11551170
b'\n' | b'\x0C' => {
11561171
newlines += 1;
11571172
last_newline = offset;
1173+
tokenizer.current_position = tokenizer.current_position.wrapping_add(1);
11581174
}
11591175
b'\r' => {
1176+
tokenizer.current_position = tokenizer.current_position.wrapping_add(1);
11601177
if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
11611178
newlines += 1;
11621179
last_newline = offset;

0 commit comments

Comments
 (0)