
Commit 75ecffa

Store utf8/utf16 position difference instead of counting utf16 positions
1 parent f3affa9 commit 75ecffa

File tree

  src/parser.rs
  src/tokenizer.rs

2 files changed: +32 -29 lines changed
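
The commit replaces the tokenizer's separately maintained UTF-16 position counter with a single running difference between the UTF-8 byte position and the UTF-16 code-unit position, which only changes when a multi-byte character is consumed. A minimal standalone sketch of that bookkeeping (names such as utf16_offsets and pos_diff are illustrative, not the crate's API):

/// For each char boundary of `input`, report (utf8 offset, utf16 offset),
/// tracking only the byte position and the accumulated utf8/utf16 difference.
fn utf16_offsets(input: &str) -> Vec<(usize, usize)> {
    let mut byte_pos = 0usize;
    let mut pos_diff = 0usize; // UTF-8 bytes consumed so far minus UTF-16 units so far
    let mut out = Vec::new();
    for c in input.chars() {
        out.push((byte_pos, byte_pos - pos_diff));
        byte_pos += c.len_utf8();
        pos_diff += c.len_utf8() - c.len_utf16();
    }
    out.push((byte_pos, byte_pos - pos_diff));
    out
}

fn main() {
    // 'é' is 2 UTF-8 bytes but 1 UTF-16 unit; '😀' is 4 bytes and 2 units.
    for (utf8, utf16) in utf16_offsets("aé😀b") {
        println!("utf8 offset {utf8} -> utf16 offset {utf16}");
    }
}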

src/parser.rs (+8 -3)

@@ -18,7 +18,8 @@ use std::ops::Range;
 pub struct ParserState {
     pub(crate) position: usize,
     pub(crate) current_line_start_position: usize,
-    pub(crate) current_position: usize,
+    pub(crate) current_line_start_difference: u16,
+    pub(crate) position_difference: u16,
     pub(crate) current_line_number: u32,
     pub(crate) at_start_of: Option<BlockType>,
 }
@@ -35,14 +36,18 @@ impl ParserState {
     pub fn source_location(&self) -> SourceLocation {
         SourceLocation {
             line: self.current_line_number,
-            column: (self.position - self.current_line_start_position + 1) as u32,
+            column: (
+                self.position - self.current_line_start_position -
+                (self.position_difference - self.current_line_start_difference) as usize +
+                1
+            ) as u32,
         }
     }

     /// The position from the start of the input, counted in UTF-16 code units
     #[inline]
     pub fn utf16_position(&self) -> u32 {
-        self.current_position as u32
+        (self.position - self.position_difference as usize) as u32
     }
 }
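
A hedged worked example of the new column formula above; the State struct mirrors the ParserState field names but is a throwaway illustration, not the crate's type:

struct State {
    position: usize,                    // byte offset from the start of the input
    current_line_start_position: usize, // byte offset of the current line start
    position_difference: u16,           // utf8 minus utf16 units consumed so far
    current_line_start_difference: u16, // difference snapshotted at the line start
}

fn utf16_column(s: &State) -> u32 {
    (s.position - s.current_line_start_position
        - (s.position_difference - s.current_line_start_difference) as usize
        + 1) as u32
}

fn main() {
    // Cursor just past "é" (2 UTF-8 bytes, 1 UTF-16 unit) on the first line:
    // position = 2 and position_difference = 1, so the 1-based column,
    // counted in UTF-16 units, is 2.
    let s = State {
        position: 2,
        current_line_start_position: 0,
        position_difference: 1,
        current_line_start_difference: 0,
    };
    assert_eq!(utf16_column(&s), 2);
}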

src/tokenizer.rs (+24 -26)

@@ -214,7 +214,8 @@ pub struct Tokenizer<'a> {
     /// ensure that computing the column will give the result in units
     /// of UTF-16 characters.
     current_line_start_position: usize,
-    current_position: usize,
+    position_difference: u16,
+    current_line_start_difference: u16,
     current_line_number: u32,
     var_or_env_functions: SeenStatus,
     source_map_url: Option<&'a str>,
@@ -235,8 +236,9 @@ impl<'a> Tokenizer<'a> {
             input,
             position: 0,
             current_line_start_position: 0,
-            current_position: 0,
+            current_line_start_difference: 0,
             current_line_number: 0,
+            position_difference: 0,
             var_or_env_functions: SeenStatus::DontCare,
             source_map_url: None,
             source_url: None,
@@ -279,7 +281,12 @@ impl<'a> Tokenizer<'a> {
     pub fn current_source_location(&self) -> SourceLocation {
         SourceLocation {
             line: self.current_line_number,
-            column: (self.position - self.current_line_start_position + 1) as u32,
+            column: (
+                self.position -
+                self.current_line_start_position -
+                (self.position_difference - self.current_line_start_difference) as usize
+                + 1
+            ) as u32,
         }
     }

@@ -298,7 +305,8 @@ impl<'a> Tokenizer<'a> {
         ParserState {
             position: self.position,
             current_line_start_position: self.current_line_start_position,
-            current_position: self.current_position,
+            current_line_start_difference: self.current_line_start_difference,
+            position_difference: self.position_difference,
             current_line_number: self.current_line_number,
             at_start_of: None,
         }
@@ -308,7 +316,8 @@ impl<'a> Tokenizer<'a> {
     pub fn reset(&mut self, state: &ParserState) {
         self.position = state.position;
         self.current_line_start_position = state.current_line_start_position;
-        self.current_position = state.current_position;
+        self.current_line_start_difference = state.current_line_start_difference;
+        self.position_difference = state.position_difference;
         self.current_line_number = state.current_line_number;
     }

@@ -374,7 +383,6 @@ impl<'a> Tokenizer<'a> {
                 debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
             }
         }
-        self.current_position = self.current_position.wrapping_add(n);
         self.position += n
     }

@@ -396,8 +404,7 @@ impl<'a> Tokenizer<'a> {
         debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
         // This takes two UTF-16 characters to represent, so we
         // actually have an undercount.
-        self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
-        self.current_position = self.current_position.wrapping_add(2);
+        self.position_difference = self.position_difference.wrapping_sub(1);
         self.position += 1;
     }

@@ -409,7 +416,7 @@ impl<'a> Tokenizer<'a> {
         // Continuation bytes contribute to column overcount. Note
         // that due to the special case for the 4-byte sequence intro,
         // we must use wrapping add here.
-        self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
+        self.position_difference = self.position_difference.wrapping_add(1);
         self.position += 1;
     }

@@ -422,14 +429,11 @@ impl<'a> Tokenizer<'a> {
         if byte & 0xF0 == 0xF0 {
             // This takes two UTF-16 characters to represent, so we
             // actually have an undercount.
-            self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
-            self.current_position = self.current_position.wrapping_add(2);
+            self.position_difference = self.position_difference.wrapping_sub(1);
         } else if byte & 0xC0 == 0x80 {
             // Note that due to the special case for the 4-byte
             // sequence intro, we must use wrapping add here.
-            self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
-        } else {
-            self.current_position = self.current_position.wrapping_add(1);
+            self.position_difference = self.position_difference.wrapping_add(1);
         }
     }

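The hunk above relies on the arithmetic that a 4-byte UTF-8 sequence encodes a scalar occupying 2 UTF-16 units, so its net contribution to the utf8/utf16 difference must be 4 - 2 = 2: the sequence intro subtracts 1 and each of the three continuation bytes adds 1, hence the wrapping operations. A small standalone check of that arithmetic (illustrative only, not the crate's code):

fn main() {
    let c = '😀';
    assert_eq!(c.len_utf8(), 4);
    assert_eq!(c.len_utf16(), 2);

    let mut position = 0usize;
    let mut position_difference: i64 = 0;

    // Intro byte: position += 1, difference -= 1 (temporarily negative).
    position += 1;
    position_difference -= 1;
    // Three continuation bytes: position += 1, difference += 1 each.
    for _ in 0..3 {
        position += 1;
        position_difference += 1;
    }

    assert_eq!(position, c.len_utf8());
    assert_eq!(position_difference as usize, c.len_utf8() - c.len_utf16());
    // The UTF-16 position is recovered as position minus the difference.
    assert_eq!(position - position_difference as usize, c.len_utf16());
}
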
@@ -448,12 +452,11 @@ impl<'a> Tokenizer<'a> {
         let byte = self.next_byte_unchecked();
         debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
         self.position += 1;
-        self.current_position = self.current_position.wrapping_add(1);
         if byte == b'\r' && self.next_byte() == Some(b'\n') {
             self.position += 1;
-            self.current_position = self.current_position.wrapping_add(1);
         }
         self.current_line_start_position = self.position;
+        self.current_line_start_difference = self.position_difference;
         self.current_line_number += 1;
     }

@@ -467,14 +470,13 @@ impl<'a> Tokenizer<'a> {
     fn consume_char(&mut self) -> char {
         let c = self.next_char();
         let len_utf8 = c.len_utf8();
+        let len_utf16 = c.len_utf16();
         self.position += len_utf8;
         // Note that due to the special case for the 4-byte sequence
         // intro, we must use wrapping add here.
-        let len_utf16 = c.len_utf16();
-        self.current_line_start_position = self
-            .current_line_start_position
-            .wrapping_add(len_utf8 - len_utf16);
-        self.current_position = self.current_position.wrapping_add(len_utf16);
+        self.position_difference = self
+            .position_difference
+            .wrapping_add((len_utf8 - len_utf16) as u16);
         c
     }

@@ -1164,16 +1166,12 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
             }
         };
         match_byte! { b,
-            b' ' | b'\t' => {
-                tokenizer.current_position = tokenizer.current_position.wrapping_add(1);
-            },
+            b' ' | b'\t' => {},
             b'\n' | b'\x0C' => {
                 newlines += 1;
                 last_newline = offset;
-                tokenizer.current_position = tokenizer.current_position.wrapping_add(1);
             }
             b'\r' => {
-                tokenizer.current_position = tokenizer.current_position.wrapping_add(1);
                 if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
                     newlines += 1;
                     last_newline = offset;
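
Putting the two halves of the scheme together, per-character difference updates plus a per-line snapshot of the difference, here is a hedged end-to-end sketch; locations and its variable names are illustrative, not the crate's API:

/// Report (char, line, utf16 column) for each char of `input`, snapshotting
/// the utf8/utf16 difference at each line start, as the commit does.
/// Only '\n' is treated as a newline here, unlike the real tokenizer.
fn locations(input: &str) -> Vec<(char, u32, u32)> {
    let (mut pos, mut diff) = (0usize, 0usize);
    let (mut line_start, mut line_start_diff, mut line) = (0usize, 0usize, 0u32);
    let mut out = Vec::new();
    for c in input.chars() {
        let column = (pos - line_start - (diff - line_start_diff) + 1) as u32;
        out.push((c, line, column));
        pos += c.len_utf8();
        diff += c.len_utf8() - c.len_utf16();
        if c == '\n' {
            line += 1;
            line_start = pos;
            line_start_diff = diff;
        }
    }
    out
}

fn main() {
    for (c, line, column) in locations("é😀\nx") {
        println!("{c:?} at line {line}, column {column}");
    }
}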
