diff --git a/benches/tokenizer.rs b/benches/tokenizer.rs
index 878d23e5..cdf1dbc8 100644
--- a/benches/tokenizer.rs
+++ b/benches/tokenizer.rs
@@ -29,7 +29,7 @@ struct Sink;
 impl TokenSink for Sink {
     type Handle = ();
 
-    fn process_token(&mut self, token: Token) -> TokenSinkResult<()> {
+    fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
         // Don't use the token, but make sure we don't get
         // optimized out entirely.
         black_box(token);
diff --git a/examples/noop-tokenize.rs b/examples/noop-tokenize.rs
index 0cb6b534..4fbc86a6 100644
--- a/examples/noop-tokenize.rs
+++ b/examples/noop-tokenize.rs
@@ -25,7 +25,7 @@ struct Sink(Vec<Token>);
 impl TokenSink for Sink {
     type Handle = ();
 
-    fn process_token(&mut self, token: Token) -> TokenSinkResult<()> {
+    fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
         // Don't use the token, but make sure we don't get
         // optimized out entirely.
         self.0.push(token);
diff --git a/examples/noop-tree-builder.rs b/examples/noop-tree-builder.rs
index 0860cb92..197e28c6 100644
--- a/examples/noop-tree-builder.rs
+++ b/examples/noop-tree-builder.rs
@@ -90,6 +90,7 @@ impl TreeSink for Sink {
     fn remove_from_parent(&mut self, _target: usize) { }
     fn reparent_children(&mut self, _node: usize, _new_parent: usize) { }
     fn mark_script_already_started(&mut self, _node: usize) { }
+    fn set_current_line(&mut self, _line_number: u64) { }
 }
 
 fn main() {
diff --git a/examples/print-tree-actions.rs b/examples/print-tree-actions.rs
index 84d51d60..1009627f 100644
--- a/examples/print-tree-actions.rs
+++ b/examples/print-tree-actions.rs
@@ -132,6 +132,10 @@ impl TreeSink for Sink {
     fn mark_script_already_started(&mut self, node: usize) {
         println!("Mark script {} as already started", node);
     }
+
+    fn set_current_line(&mut self, line_number: u64) {
+        println!("Set current line to {}", line_number);
+    }
 }
 
 // FIXME: Copy of str::escape_default from std, which is currently unstable
diff --git a/examples/tokenize.rs b/examples/tokenize.rs
index 1916a2b6..0b2ab39d 100644
--- a/examples/tokenize.rs
+++ b/examples/tokenize.rs
@@ -43,7 +43,7 @@ impl TokenPrinter {
 impl TokenSink for TokenPrinter {
     type Handle = ();
 
-    fn process_token(&mut self, token: Token) -> TokenSinkResult<()> {
+    fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
         match token {
             CharacterTokens(b) => {
                 for c in b.chars() {
diff --git a/src/rcdom.rs b/src/rcdom.rs
index 82dedec0..0e187528 100644
--- a/src/rcdom.rs
+++ b/src/rcdom.rs
@@ -326,6 +326,8 @@ impl TreeSink for RcDom {
             _ => unreachable!(),
         }
     }
+
+    fn set_current_line(&mut self, _line_number: u64) { }
 }
 
 impl Default for RcDom {
diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs
index f154f011..37328eb0 100644
--- a/src/tokenizer/interface.rs
+++ b/src/tokenizer/interface.rs
@@ -111,7 +111,9 @@ pub trait TokenSink {
     type Handle;
 
     /// Process a token.
-    fn process_token(&mut self, token: Token) -> TokenSinkResult<Self::Handle>;
+    ///
+    /// `line_number` is the tokenizer's current line (starting at 1) when the token is emitted.
+    fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle>;
 
     /// Used in the markup declaration open state. By default, this always
     /// returns false and thus all CDATA sections are tokenized as bogus
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index bcd6d601..5ea64305 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -169,6 +169,9 @@ pub struct Tokenizer<Sink> {
 
     /// Record of how many ns we spent in the token sink.
     time_in_sink: u64,
+
+    /// Current line of the input, starting at 1; bumped for every newline consumed.
+    current_line: u64,
 }
 
 impl<Sink: TokenSink> Tokenizer<Sink> {
@@ -200,6 +203,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             temp_buf: StrTendril::new(),
             state_profile: BTreeMap::new(),
             time_in_sink: 0,
+            current_line: 1,
         }
     }
 
@@ -240,11 +244,11 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
 
     fn process_token(&mut self, token: Token) -> TokenSinkResult<Sink::Handle> {
         if self.opts.profile {
-            let (ret, dt) = time!(self.sink.process_token(token));
+            let (ret, dt) = time!(self.sink.process_token(token, self.current_line));
             self.time_in_sink += dt;
             ret
         } else {
-            self.sink.process_token(token)
+            self.sink.process_token(token, self.current_line)
         }
     }
 
@@ -272,6 +276,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             c = '\n';
         }
 
+        if c == '\n' {
+            self.current_line += 1;
+        }
+
         if self.opts.exact_errors && match c as u32 {
             0x01...0x08 | 0x0B | 0x0E...0x1F | 0x7F...0x9F | 0xFDD0...0xFDEF => true,
             n if (n & 0xFFFE) == 0xFFFE => true,
@@ -690,7 +698,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
         match self.state {
             //§ data-state
             states::Data => loop {
-                match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<')) {
+                match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
                     FromSet('\0') => go!(self: error; emit '\0'),
                     FromSet('&')  => go!(self: consume_char_ref),
                     FromSet('<')  => go!(self: to TagOpen),
@@ -701,7 +709,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
 
             //§ rcdata-state
             states::RawData(Rcdata) => loop {
-                match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<')) {
+                match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
                     FromSet('\0') => go!(self: error; emit '\u{fffd}'),
                     FromSet('&')  => go!(self: consume_char_ref),
                     FromSet('<')  => go!(self: to RawLessThanSign Rcdata),
@@ -712,7 +720,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
 
             //§ rawtext-state
             states::RawData(Rawtext) => loop {
-                match pop_except_from!(self, input, small_char_set!('\r' '\0' '<')) {
+                match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
                     FromSet('\0') => go!(self: error; emit '\u{fffd}'),
                     FromSet('<')  => go!(self: to RawLessThanSign Rawtext),
                     FromSet(c)    => go!(self: emit c),
@@ -722,7 +730,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
 
             //§ script-data-state
             states::RawData(ScriptData) => loop {
-                match pop_except_from!(self, input, small_char_set!('\r' '\0' '<')) {
+                match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
                     FromSet('\0') => go!(self: error; emit '\u{fffd}'),
                     FromSet('<')  => go!(self: to RawLessThanSign ScriptData),
                     FromSet(c)    => go!(self: emit c),
@@ -732,7 +740,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
 
             //§ script-data-escaped-state
             states::RawData(ScriptDataEscaped(Escaped)) => loop {
-                match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<')) {
+                match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
                     FromSet('\0') => go!(self: error; emit '\u{fffd}'),
                     FromSet('-')  => go!(self: emit '-'; to ScriptDataEscapedDash Escaped),
                     FromSet('<')  => go!(self: to RawLessThanSign ScriptDataEscaped Escaped),
@@ -743,7 +751,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
 
             //§ script-data-double-escaped-state
             states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop {
-                match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<')) {
+                match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
                     FromSet('\0') => go!(self: error; emit '\u{fffd}'),
                     FromSet('-')  => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped),
                     FromSet('<')  => go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped),
@@ -754,7 +762,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
 
             //§ plaintext-state
             states::Plaintext => loop {
-                match pop_except_from!(self, input, small_char_set!('\r' '\0')) {
+                match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) {
                     FromSet('\0') => go!(self: error; emit '\u{fffd}'),
                     FromSet(c)    => go!(self: emit c),
                     NotFromSet(b) => self.emit_chars(b),
@@ -974,7 +982,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
 
             //§ attribute-value-(double-quoted)-state
             states::AttributeValue(DoubleQuoted) => loop {
-                match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0')) {
+                match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) {
                     FromSet('"')  => go!(self: to AfterAttributeValueQuoted),
                     FromSet('&')  => go!(self: consume_char_ref '"'),
                     FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
@@ -985,7 +993,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
 
             //§ attribute-value-(single-quoted)-state
             states::AttributeValue(SingleQuoted) => loop {
-                match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0')) {
+                match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) {
                     FromSet('\'') => go!(self: to AfterAttributeValueQuoted),
                     FromSet('&')  => go!(self: consume_char_ref '\''),
                     FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
@@ -1403,6 +1411,115 @@ mod test {
     use super::option_push; // private items
     use tendril::{StrTendril, SliceExt};
 
+    use super::{TokenSink, Tokenizer, TokenizerOpts, TokenSinkResult};
+
+    use super::interface::{Token, DoctypeToken, TagToken, CommentToken};
+    use super::interface::{CharacterTokens, NullCharacterToken, EOFToken, ParseError};
+    use super::interface::{Doctype, Attribute, TagKind, StartTag, EndTag, Tag};
+
+    use super::buffer_queue::{BufferQueue};
+    use std::mem::replace;
+
+    use {LocalName};
+
+    // LinesMatch implements the TokenSink trait. It is used for testing to see
+    // if current_line is being updated when process_token is called. The lines
+    // vector pairs each non-character token with the line number it was reported on.
+    struct LinesMatch {
+        tokens: Vec<Token>,
+        current_str: StrTendril,
+        lines: Vec<(Token, u64)>,
+    }
+
+    impl LinesMatch {
+        fn new() -> LinesMatch {
+            LinesMatch {
+                tokens: vec!(),
+                current_str: StrTendril::new(),
+                lines: vec!(),
+            }
+        }
+
+        fn push(&mut self, token: Token, line_number: u64) {
+            self.finish_str();
+            self.lines.push((token, line_number));
+        }
+
+        fn finish_str(&mut self) {
+            if self.current_str.len() > 0 {
+                let s = replace(&mut self.current_str, StrTendril::new());
+                self.tokens.push(CharacterTokens(s));
+            }
+        }
+
+    }
+
+    impl TokenSink for LinesMatch {
+
+        type Handle = ();
+
+        fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle> {
+
+            match token {
+                CharacterTokens(b) => {
+                    self.current_str.push_slice(&b);
+                }
+
+                NullCharacterToken => {
+                    self.current_str.push_char('\0');
+                }
+
+                ParseError(_) => {
+                    panic!("unexpected parse error");
+                }
+
+                TagToken(mut t) => {
+                    // The spec seems to indicate that one can emit
+                    // erroneous end tags with attrs, but the test
+                    // cases don't contain them.
+                    match t.kind {
+                        EndTag => {
+                            t.self_closing = false;
+                            t.attrs = vec!();
+                        }
+                        _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
+                    }
+                    self.push(TagToken(t), line_number);
+                }
+
+                EOFToken => (),
+
+                _ => self.push(token, line_number),
+            }
+            TokenSinkResult::Continue
+        }
+    }
+
+    // Feed every chunk through a fresh tokenizer and return the recorded
+    // (token, line number) pairs.
+    fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
+        let sink = LinesMatch::new();
+        let mut tok = Tokenizer::new(sink, opts);
+        let mut buffer = BufferQueue::new();
+        for chunk in input.into_iter() {
+            buffer.push_back(chunk);
+            let _ = tok.feed(&mut buffer);
+        }
+        tok.end();
+        tok.sink.lines
+    }
+
+    // Create a tag token
+    fn create_tag(token: StrTendril, tagkind: TagKind) -> Token {
+        let name = LocalName::from(&*token);
+        let token = TagToken(Tag { kind: tagkind,
+            name: name,
+            self_closing: false,
+            attrs: vec!(),
+        });
+        token
+    }
+
     #[test]
     fn push_to_None_gives_singleton() {
         let mut s: Option<StrTendril> = None;
@@ -1423,4 +1540,42 @@ mod test {
         option_push(&mut s, 'x');
         assert_eq!(s, Some("yx".to_tendril()));
     }
+
+    #[test]
+    fn check_lines() {
+        let opts = TokenizerOpts {
+            exact_errors: false,
+            discard_bom: true,
+            profile: false,
+            initial_state: None,
+            last_start_tag_name: None,
+        };
+        let vector = vec![StrTendril::from("<a>\n"), StrTendril::from("<b>\n"),
+            StrTendril::from("</b>\n"), StrTendril::from("</a>\n")];
+        let expected = vec![(create_tag(StrTendril::from("a"), StartTag), 1),
+            (create_tag(StrTendril::from("b"), StartTag), 2),
+            (create_tag(StrTendril::from("b"), EndTag), 3),
+            (create_tag(StrTendril::from("a"), EndTag), 4)];
+        let results = tokenize(vector, opts);
+        assert_eq!(results, expected);
+    }
+
+    #[test]
+    fn check_lines_with_new_line() {
+        let opts = TokenizerOpts {
+            exact_errors: false,
+            discard_bom: true,
+            profile: false,
+            initial_state: None,
+            last_start_tag_name: None,
+        };
+        let vector = vec![StrTendril::from("<a>\r\n"), StrTendril::from("<b>\r\n"),
+            StrTendril::from("</b>\r\n"), StrTendril::from("</a>\r\n")];
+        let expected = vec![(create_tag(StrTendril::from("a"), StartTag), 1),
+            (create_tag(StrTendril::from("b"), StartTag), 2),
+            (create_tag(StrTendril::from("b"), EndTag), 3),
+            (create_tag(StrTendril::from("a"), EndTag), 4)];
+        let results = tokenize(vector, opts);
+        assert_eq!(results, expected);
+    }
 }
diff --git a/src/tree_builder/interface.rs b/src/tree_builder/interface.rs
index 3aaa3a87..8e992257 100644
--- a/src/tree_builder/interface.rs
+++ b/src/tree_builder/interface.rs
@@ -138,6 +138,11 @@ pub trait TreeSink {
     fn is_mathml_annotation_xml_integration_point(&self, handle: Self::Handle) -> bool {
         false
     }
+
+    /// Called by the tree builder whenever the current input line changes.
+    ///
+    /// The default implementation ignores the line number.
+    fn set_current_line(&mut self, _line_number: u64) {}
 }
 
 /// Trace hooks for a garbage-collected DOM.
diff --git a/src/tree_builder/mod.rs b/src/tree_builder/mod.rs
index 3f276513..7eaa7ce6 100644
--- a/src/tree_builder/mod.rs
+++ b/src/tree_builder/mod.rs
@@ -135,6 +135,9 @@ pub struct TreeBuilder<Handle, Sink> {
     /// The context element for the fragment parsing algorithm.
     context_elem: Option<Handle>,
 
+    /// Line number last reported to the sink via `set_current_line` (starts at 1).
+    current_line: u64,
+
     // WARNING: If you add new fields that contain Handles, you
     // must add them to trace_handles() below to preserve memory
     // safety!
@@ -168,6 +171,7 @@ impl<Handle, Sink> TreeBuilder<Handle, Sink>
             ignore_lf: false,
             foster_parenting: false,
             context_elem: None,
+            current_line: 1,
         }
     }
 
@@ -199,6 +203,7 @@ impl<Handle, Sink> TreeBuilder<Handle, Sink>
             ignore_lf: false,
             foster_parenting: false,
             context_elem: Some(context_elem),
+            current_line: 1,
         };
 
         // https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments
@@ -373,7 +378,11 @@ impl<Handle, Sink> TokenSink
 {
     type Handle = Handle;
 
-    fn process_token(&mut self, token: tokenizer::Token) -> TokenSinkResult<Handle> {
+    fn process_token(&mut self, token: tokenizer::Token, line_number: u64) -> TokenSinkResult<Handle> {
+        if line_number != self.current_line {
+            self.sink.set_current_line(line_number);
+            self.current_line = line_number;
+        }
         let ignore_lf = replace(&mut self.ignore_lf, false);
 
         // Handle `ParseError` and `DoctypeToken`; convert everything else to the local `Token` type.
@@ -435,3 +444,151 @@ impl<Handle, Sink> TokenSink
         self.sink.elem_name(self.adjusted_current_node()).ns != ns!(html)
     }
 }
+
+#[cfg(test)]
+#[allow(non_snake_case)]
+mod test {
+    use super::interface::{QuirksMode, Quirks, LimitedQuirks, NoQuirks};
+    use super::interface::{NodeOrText, AppendNode, AppendText};
+    use super::interface::{TreeSink, Tracer};
+
+    use super::types::*;
+    use super::actions::TreeBuilderActions;
+    use super::rules::TreeBuilderStep;
+
+    use QualName;
+    use tendril::StrTendril;
+    use tendril::stream::{TendrilSink, Utf8LossyDecoder, LossyDecoder};
+
+    use tokenizer;
+    use tokenizer::{Tokenizer, TokenizerOpts};
+    use tokenizer::{Doctype, StartTag, Tag, TokenSink};
+    use tokenizer::states as tok_state;
+
+    use util::str::is_ascii_whitespace;
+
+    use std::default::Default;
+    use std::mem::replace;
+    use std::borrow::Cow;
+    use std::borrow::Cow::Borrowed;
+    use std::collections::VecDeque;
+
+    use driver::*;
+    use super::{TreeBuilderOpts, TreeBuilder};
+    use tokenizer::Attribute;
+    use rcdom::{Node, Handle, RcDom, NodeEnum, ElementEnum};
+
+    pub struct LineCountingDOM {
+        pub line_vec: Vec<(QualName, u64)>,
+        pub current_line: u64,
+        pub rcdom: RcDom,
+    }
+
+    impl TreeSink for LineCountingDOM {
+        type Output = Self;
+
+        fn finish(self) -> Self { self }
+
+        type Handle = Handle;
+
+        fn parse_error(&mut self, msg: Cow<'static, str>) {
+            self.rcdom.parse_error(msg);
+        }
+
+        fn get_document(&mut self) -> Handle {
+            self.rcdom.get_document()
+        }
+
+        fn get_template_contents(&mut self, target: Handle) -> Handle {
+            self.rcdom.get_template_contents(target)
+        }
+
+        fn set_quirks_mode(&mut self, mode: QuirksMode) {
+            self.rcdom.set_quirks_mode(mode)
+        }
+
+        fn same_node(&self, x: Handle, y: Handle) -> bool {
+            self.rcdom.same_node(x, y)
+        }
+
+        fn elem_name(&self, target: Handle) -> QualName {
+            self.rcdom.elem_name(target)
+        }
+
+        fn create_element(&mut self, name: QualName, attrs: Vec<Attribute>) -> Handle {
+            self.line_vec.push((name.clone(), self.current_line));
+            self.rcdom.create_element(name, attrs)
+        }
+
+        fn create_comment(&mut self, text: StrTendril) -> Handle {
+            self.rcdom.create_comment(text)
+        }
+
+        fn append(&mut self, parent: Handle, child: NodeOrText<Handle>) {
+            self.rcdom.append(parent, child)
+        }
+
+        fn append_before_sibling(&mut self,
+                sibling: Handle,
+                child: NodeOrText<Handle>) -> Result<(), NodeOrText<Handle>> {
+            self.rcdom.append_before_sibling(sibling, child)
+        }
+
+        fn append_doctype_to_document(&mut self,
+                                      name: StrTendril,
+                                      public_id: StrTendril,
+                                      system_id: StrTendril) {
+            self.rcdom.append_doctype_to_document(name, public_id, system_id);
+        }
+
+        fn add_attrs_if_missing(&mut self, target: Handle, attrs: Vec<Attribute>) {
+            self.rcdom.add_attrs_if_missing(target, attrs);
+        }
+
+        fn remove_from_parent(&mut self, target: Handle) {
+            self.rcdom.remove_from_parent(target);
+        }
+
+        fn reparent_children(&mut self, node: Handle, new_parent: Handle) {
+            self.rcdom.reparent_children(node, new_parent);
+        }
+
+        fn mark_script_already_started(&mut self, target: Handle) {
+            self.rcdom.mark_script_already_started(target);
+        }
+
+        fn is_mathml_annotation_xml_integration_point(&self, handle: Self::Handle) -> bool {
+            self.rcdom.is_mathml_annotation_xml_integration_point(handle)
+        }
+
+        fn set_current_line(&mut self, line_number: u64) {
+            self.current_line = line_number;
+        }
+    }
+
+    #[test]
+    fn check_four_lines() {
+        // Input
+        let sink = LineCountingDOM {
+                line_vec: vec!(),
+                current_line: 1,
+                rcdom: RcDom::default(),
+            };
+        let opts = ParseOpts::default();
+        let mut resultTok = parse_document(sink, opts);
+        resultTok.process(StrTendril::from("<a>\n"));
+        resultTok.process(StrTendril::from("</a>\n"));
+        resultTok.process(StrTendril::from("<b>\n"));
+        resultTok.process(StrTendril::from("</b>"));
+        // Actual Output
+        let actual = resultTok.finish();
+        // Expected Output
+        let expected = vec![(qualname!(html, "html"), 1),
+                            (qualname!(html, "head"), 1),
+                            (qualname!(html, "body"), 1),
+                            (qualname!(html, "a"), 1),
+                            (qualname!(html, "b"), 3)];
+        // Assertion
+        assert_eq!(actual.line_vec, expected);
+    }
+}
diff --git a/tests/tokenizer.rs b/tests/tokenizer.rs
index fb1cd1d7..8557c8c5 100644
--- a/tests/tokenizer.rs
+++ b/tests/tokenizer.rs
@@ -99,7 +99,7 @@ impl TokenLogger {
 impl TokenSink for TokenLogger {
     type Handle = ();
 
-    fn process_token(&mut self, token: Token) -> TokenSinkResult<()> {
+    fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
         match token {
             CharacterTokens(b) => {
                 self.current_str.push_slice(&b);
@@ -261,8 +261,8 @@ fn json_to_tokens(js: &Json, exact_errors: bool) -> Vec<Token> {
     for tok in js.get_list().iter() {
         assert_eq!(match *tok {
             Json::String(ref s)
-                if &s[..] == "ParseError" => sink.process_token(ParseError(Borrowed(""))),
-            _ => sink.process_token(json_to_token(tok)),
+                if &s[..] == "ParseError" => sink.process_token(ParseError(Borrowed("")), 0),
+            _ => sink.process_token(json_to_token(tok), 0),
         }, TokenSinkResult::Continue);
     }
     sink.get_tokens()