From 321c745d9d83439625990248cb949823c94eef50 Mon Sep 17 00:00:00 2001 From: karenher Date: Thu, 1 Dec 2016 17:33:16 -0500 Subject: [PATCH 1/5] Added current_line variable to Tokenizer and test to check that current_line is correctly updated --- examples/noop-tokenize.rs | 2 +- examples/tokenize.rs | 2 +- src/tokenizer/interface.rs | 2 +- src/tokenizer/mod.rs | 131 ++++++++++++++++++++++++++++++++++++- src/tree_builder/mod.rs | 2 +- tests/tokenizer.rs | 6 +- 6 files changed, 136 insertions(+), 9 deletions(-) diff --git a/examples/noop-tokenize.rs b/examples/noop-tokenize.rs index 0cb6b534..c041dfe9 100644 --- a/examples/noop-tokenize.rs +++ b/examples/noop-tokenize.rs @@ -25,7 +25,7 @@ struct Sink(Vec); impl TokenSink for Sink { type Handle = (); - fn process_token(&mut self, token: Token) -> TokenSinkResult<()> { + fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult<()> { // Don't use the token, but make sure we don't get // optimized out entirely. self.0.push(token); diff --git a/examples/tokenize.rs b/examples/tokenize.rs index 1916a2b6..94d242ed 100644 --- a/examples/tokenize.rs +++ b/examples/tokenize.rs @@ -43,7 +43,7 @@ impl TokenPrinter { impl TokenSink for TokenPrinter { type Handle = (); - fn process_token(&mut self, token: Token) -> TokenSinkResult<()> { + fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult<()> { match token { CharacterTokens(b) => { for c in b.chars() { diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs index f154f011..37328eb0 100644 --- a/src/tokenizer/interface.rs +++ b/src/tokenizer/interface.rs @@ -111,7 +111,7 @@ pub trait TokenSink { type Handle; /// Process a token. - fn process_token(&mut self, token: Token) -> TokenSinkResult; + fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult; /// Used in the markup declaration open state. By default, this always /// returns false and thus all CDATA sections are tokenized as bogus diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index bcd6d601..72b7c52f 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -169,6 +169,9 @@ pub struct Tokenizer { /// Record of how many ns we spent in the token sink. time_in_sink: u64, + + /// Track current line + current_line: u64, } impl Tokenizer { @@ -200,6 +203,7 @@ impl Tokenizer { temp_buf: StrTendril::new(), state_profile: BTreeMap::new(), time_in_sink: 0, + current_line: 1, } } @@ -240,11 +244,11 @@ impl Tokenizer { fn process_token(&mut self, token: Token) -> TokenSinkResult { if self.opts.profile { - let (ret, dt) = time!(self.sink.process_token(token)); + let (ret, dt) = time!(self.sink.process_token(token, self.current_line)); self.time_in_sink += dt; ret } else { - self.sink.process_token(token) + self.sink.process_token(token, self.current_line) } } @@ -270,6 +274,7 @@ impl Tokenizer { if c == '\r' { self.ignore_lf = true; c = '\n'; + self.current_line += 1; } if self.opts.exact_errors && match c as u32 { @@ -1397,12 +1402,118 @@ impl Tokenizer { } } + + + + #[cfg(test)] #[allow(non_snake_case)] mod test { use super::option_push; // private items use tendril::{StrTendril, SliceExt}; + use std::io::{self}; + use super::{TokenSink, Tokenizer, TokenizerOpts, TokenSinkResult}; + + pub use super::interface::{Token, DoctypeToken, TagToken, CommentToken}; + pub use super::interface::{CharacterTokens, NullCharacterToken, EOFToken, ParseError}; + pub use super::interface::{Doctype, Attribute, TagKind, StartTag, EndTag, Tag}; + + use super::buffer_queue::{BufferQueue}; + use std::mem::replace; + use std::borrow::Cow::{self, Borrowed}; + + // TokenMatch implements the TokenSink trait. It is used for testing to see + // if current_line is being updated when process_token is called. The lines + // vector is a collection of the line numbers that each token is on. + struct TokenMatch { + tokens: Vec, + current_str: StrTendril, + exact_errors: bool, + lines: Vec, + } + + impl TokenMatch { + fn new(exact_errors: bool) -> TokenMatch { + TokenMatch { + tokens: vec!(), + current_str: StrTendril::new(), + exact_errors: exact_errors, + lines: vec!(), + } + } + + fn push(&mut self, token: Token) { + self.finish_str(); + self.tokens.push(token); + } + + fn finish_str(&mut self) { + if self.current_str.len() > 0 { + let s = replace(&mut self.current_str, StrTendril::new()); + self.tokens.push(CharacterTokens(s)); + } + } + + } + + impl TokenSink for TokenMatch { + + type Handle = (); + + fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult{ + + self.lines.push(line_number); + + match token { + CharacterTokens(b) => { + self.current_str.push_slice(&b); + } + + NullCharacterToken => { + self.current_str.push_char('\0'); + } + + ParseError(_) => if self.exact_errors { + self.push(ParseError(Borrowed(""))); + }, + + TagToken(mut t) => { + // The spec seems to indicate that one can emit + // erroneous end tags with attrs, but the test + // cases don't contain them. + match t.kind { + EndTag => { + t.self_closing = false; + t.attrs = vec!(); + } + _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)), + } + self.push(TagToken(t)); + } + + EOFToken => (), + + _ => self.push(token), + } + TokenSinkResult::Continue + } + } + + // Take in tokens, process them, and return vector with line + // numbers that each token is on + fn tokenize(input: Vec, opts: TokenizerOpts) -> Vec { + let sink = TokenMatch::new(opts.exact_errors); + let mut tok = Tokenizer::new(sink, opts); + let mut buffer = BufferQueue::new(); + for chunk in input.into_iter() { + buffer.push_back(chunk); + let _ = tok.feed(&mut buffer); + } + tok.end(); + tok.sink.lines + } + #[test] fn push_to_None_gives_singleton() { let mut s: Option = None; @@ -1423,4 +1534,20 @@ mod test { option_push(&mut s, 'x'); assert_eq!(s, Some("yx".to_tendril())); } + + #[test] + fn check_lines() { + + let opts = TokenizerOpts{exact_errors: false, discard_bom: true, profile: false, + initial_state: None, last_start_tag_name: None,}; + + let vector = vec![StrTendril::from("\r"), StrTendril::from("\r"), + StrTendril::from("\r"), StrTendril::from("\r")]; + let expected = vec![1, 2, 2, 3, 3, 4, 4, 5, 5]; + + let results = tokenize(vector, opts); + + assert_eq!(results, expected); + + } } diff --git a/src/tree_builder/mod.rs b/src/tree_builder/mod.rs index 3f276513..cb64915f 100644 --- a/src/tree_builder/mod.rs +++ b/src/tree_builder/mod.rs @@ -373,7 +373,7 @@ impl TokenSink { type Handle = Handle; - fn process_token(&mut self, token: tokenizer::Token) -> TokenSinkResult { + fn process_token(&mut self, token: tokenizer::Token, line_number: u64) -> TokenSinkResult { let ignore_lf = replace(&mut self.ignore_lf, false); // Handle `ParseError` and `DoctypeToken`; convert everything else to the local `Token` type. diff --git a/tests/tokenizer.rs b/tests/tokenizer.rs index fb1cd1d7..8557c8c5 100644 --- a/tests/tokenizer.rs +++ b/tests/tokenizer.rs @@ -99,7 +99,7 @@ impl TokenLogger { impl TokenSink for TokenLogger { type Handle = (); - fn process_token(&mut self, token: Token) -> TokenSinkResult<()> { + fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult<()> { match token { CharacterTokens(b) => { self.current_str.push_slice(&b); @@ -261,8 +261,8 @@ fn json_to_tokens(js: &Json, exact_errors: bool) -> Vec { for tok in js.get_list().iter() { assert_eq!(match *tok { Json::String(ref s) - if &s[..] == "ParseError" => sink.process_token(ParseError(Borrowed(""))), - _ => sink.process_token(json_to_token(tok)), + if &s[..] == "ParseError" => sink.process_token(ParseError(Borrowed("")), 0), + _ => sink.process_token(json_to_token(tok), 0), }, TokenSinkResult::Continue); } sink.get_tokens() From 12259aaf4114dd752e3fdba2d0d289f690b3f17d Mon Sep 17 00:00:00 2001 From: karenher Date: Thu, 1 Dec 2016 18:39:17 -0500 Subject: [PATCH 2/5] Added in missing parameter --- benches/tokenizer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benches/tokenizer.rs b/benches/tokenizer.rs index 878d23e5..9e6a609f 100644 --- a/benches/tokenizer.rs +++ b/benches/tokenizer.rs @@ -29,7 +29,7 @@ struct Sink; impl TokenSink for Sink { type Handle = (); - fn process_token(&mut self, token: Token) -> TokenSinkResult<()> { + fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult<()> { // Don't use the token, but make sure we don't get // optimized out entirely. black_box(token); From d8322a5c85a76af15e7b721fe9e9be91c1a6a3f3 Mon Sep 17 00:00:00 2001 From: karenher Date: Wed, 21 Dec 2016 09:14:25 -0500 Subject: [PATCH 3/5] Made test check tuples of tokens and line numbers and changed other style issues --- src/tokenizer/mod.rs | 97 +++++++++++++++++++++++++++----------------- 1 file changed, 60 insertions(+), 37 deletions(-) diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 72b7c52f..c0bcd13f 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -203,7 +203,7 @@ impl Tokenizer { temp_buf: StrTendril::new(), state_profile: BTreeMap::new(), time_in_sink: 0, - current_line: 1, + current_line: 1, } } @@ -274,7 +274,10 @@ impl Tokenizer { if c == '\r' { self.ignore_lf = true; c = '\n'; - self.current_line += 1; + } + + if c == '\n' { + self.current_line += 1; } if self.opts.exact_errors && match c as u32 { @@ -1402,10 +1405,6 @@ impl Tokenizer { } } - - - - #[cfg(test)] #[allow(non_snake_case)] mod test { @@ -1415,37 +1414,38 @@ mod test { use std::io::{self}; use super::{TokenSink, Tokenizer, TokenizerOpts, TokenSinkResult}; - pub use super::interface::{Token, DoctypeToken, TagToken, CommentToken}; - pub use super::interface::{CharacterTokens, NullCharacterToken, EOFToken, ParseError}; - pub use super::interface::{Doctype, Attribute, TagKind, StartTag, EndTag, Tag}; + use super::interface::{Token, DoctypeToken, TagToken, CommentToken}; + use super::interface::{CharacterTokens, NullCharacterToken, EOFToken, ParseError}; + use super::interface::{Doctype, Attribute, TagKind, StartTag, EndTag, Tag}; use super::buffer_queue::{BufferQueue}; use std::mem::replace; use std::borrow::Cow::{self, Borrowed}; - // TokenMatch implements the TokenSink trait. It is used for testing to see + use {LocalName}; + + // LinesMatch implements the TokenSink trait. It is used for testing to see // if current_line is being updated when process_token is called. The lines // vector is a collection of the line numbers that each token is on. - struct TokenMatch { + struct LinesMatch { tokens: Vec, current_str: StrTendril, - exact_errors: bool, - lines: Vec, + lines: Vec<(Token, u64)>, } - impl TokenMatch { - fn new(exact_errors: bool) -> TokenMatch { - TokenMatch { + impl LinesMatch { + fn new() -> LinesMatch { + LinesMatch { tokens: vec!(), current_str: StrTendril::new(), - exact_errors: exact_errors, lines: vec!(), } } - fn push(&mut self, token: Token) { + fn push(&mut self, token: Token, line_number: u64) { self.finish_str(); - self.tokens.push(token); + // self.tokens.push(token); + self.lines.push((token, line_number)); } fn finish_str(&mut self) { @@ -1457,14 +1457,12 @@ mod test { } - impl TokenSink for TokenMatch { + impl TokenSink for LinesMatch { type Handle = (); - fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult{ + fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult { - self.lines.push(line_number); - match token { CharacterTokens(b) => { self.current_str.push_slice(&b); @@ -1474,9 +1472,9 @@ mod test { self.current_str.push_char('\0'); } - ParseError(_) => if self.exact_errors { - self.push(ParseError(Borrowed(""))); - }, + ParseError(_) => { + panic!("unexpected parse error"); + } TagToken(mut t) => { // The spec seems to indicate that one can emit @@ -1489,12 +1487,12 @@ mod test { } _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)), } - self.push(TagToken(t)); + self.push(TagToken(t), line_number); } EOFToken => (), - _ => self.push(token), + _ => self.push(token, line_number), } TokenSinkResult::Continue } @@ -1502,8 +1500,8 @@ mod test { // Take in tokens, process them, and return vector with line // numbers that each token is on - fn tokenize(input: Vec, opts: TokenizerOpts) -> Vec { - let sink = TokenMatch::new(opts.exact_errors); + fn tokenize(input: Vec, opts: TokenizerOpts) -> Vec<(Token, u64)> { + let sink = LinesMatch::new(); let mut tok = Tokenizer::new(sink, opts); let mut buffer = BufferQueue::new(); for chunk in input.into_iter() { @@ -1514,6 +1512,17 @@ mod test { tok.sink.lines } + // Create a tag token + fn create_tag(token: StrTendril, tagkind: TagKind) -> Token { + let name = LocalName::from(&*token); + let token = TagToken(Tag { kind: tagkind, + name: name, + self_closing: false, + attrs: vec!(), + }); + token + } + #[test] fn push_to_None_gives_singleton() { let mut s: Option = None; @@ -1537,17 +1546,31 @@ mod test { #[test] fn check_lines() { - - let opts = TokenizerOpts{exact_errors: false, discard_bom: true, profile: false, - initial_state: None, last_start_tag_name: None,}; - + let opts = TokenizerOpts { exact_errors: false, discard_bom: true, profile: false, + initial_state: None, last_start_tag_name: None, + }; let vector = vec![StrTendril::from("\r"), StrTendril::from("\r"), StrTendril::from("\r"), StrTendril::from("\r")]; - let expected = vec![1, 2, 2, 3, 3, 4, 4, 5, 5]; - + let expected = vec![(create_tag(StrTendril::from("a"), StartTag), 1), + (create_tag(StrTendril::from("b"), StartTag), 2), + (create_tag(StrTendril::from("b"), EndTag), 3), + (create_tag(StrTendril::from("a"), EndTag), 4)]; let results = tokenize(vector, opts); + assert_eq!(results, expected); + } + #[test] + fn check_lines_with_new_line() { + let opts = TokenizerOpts { exact_errors: false, discard_bom: true, profile: false, + initial_state: None, last_start_tag_name: None, + }; + let vector = vec![StrTendril::from("\r\n"), StrTendril::from("\r\n"), + StrTendril::from("\r\n"), StrTendril::from("\r\n")]; + let expected = vec![(create_tag(StrTendril::from("a"), StartTag), 1), + (create_tag(StrTendril::from("b"), StartTag), 2), + (create_tag(StrTendril::from("b"), EndTag), 3), + (create_tag(StrTendril::from("a"), EndTag), 4)]; + let results = tokenize(vector, opts); assert_eq!(results, expected); - } } From 8766e0afdcabff7e7e9febdc9d5b5fd19f1399af Mon Sep 17 00:00:00 2001 From: karenher Date: Wed, 21 Dec 2016 10:32:30 -0500 Subject: [PATCH 4/5] Added set_current_line function and test to treebuilder --- examples/noop-tree-builder.rs | 1 + examples/print-tree-actions.rs | 2 + src/rcdom.rs | 8 ++ src/tree_builder/interface.rs | 2 + src/tree_builder/mod.rs | 169 +++++++++++++++++++++++++++++++++ 5 files changed, 182 insertions(+) diff --git a/examples/noop-tree-builder.rs b/examples/noop-tree-builder.rs index 0860cb92..197e28c6 100644 --- a/examples/noop-tree-builder.rs +++ b/examples/noop-tree-builder.rs @@ -90,6 +90,7 @@ impl TreeSink for Sink { fn remove_from_parent(&mut self, _target: usize) { } fn reparent_children(&mut self, _node: usize, _new_parent: usize) { } fn mark_script_already_started(&mut self, _node: usize) { } + fn set_current_line(&mut self, line_number: u64) { } } fn main() { diff --git a/examples/print-tree-actions.rs b/examples/print-tree-actions.rs index 84d51d60..1009627f 100644 --- a/examples/print-tree-actions.rs +++ b/examples/print-tree-actions.rs @@ -132,6 +132,8 @@ impl TreeSink for Sink { fn mark_script_already_started(&mut self, node: usize) { println!("Mark script {} as already started", node); } + + fn set_current_line(&mut self, line_number: u64) { } } // FIXME: Copy of str::escape_default from std, which is currently unstable diff --git a/src/rcdom.rs b/src/rcdom.rs index 82dedec0..12f385e6 100644 --- a/src/rcdom.rs +++ b/src/rcdom.rs @@ -161,6 +161,9 @@ pub struct RcDom { /// The document's quirks mode. pub quirks_mode: QuirksMode, + + /// The current line being parsed + pub current_line: u64, } impl TreeSink for RcDom { @@ -326,6 +329,10 @@ impl TreeSink for RcDom { _ => unreachable!(), } } + + fn set_current_line(&mut self, line_number: u64) { + self.current_line = line_number; + } } impl Default for RcDom { @@ -334,6 +341,7 @@ impl Default for RcDom { document: new_node(Document), errors: vec!(), quirks_mode: tree_builder::NoQuirks, + current_line: 1, } } } diff --git a/src/tree_builder/interface.rs b/src/tree_builder/interface.rs index 3aaa3a87..8e992257 100644 --- a/src/tree_builder/interface.rs +++ b/src/tree_builder/interface.rs @@ -138,6 +138,8 @@ pub trait TreeSink { fn is_mathml_annotation_xml_integration_point(&self, handle: Self::Handle) -> bool { false } + + fn set_current_line(&mut self, line_number: u64); } /// Trace hooks for a garbage-collected DOM. diff --git a/src/tree_builder/mod.rs b/src/tree_builder/mod.rs index cb64915f..5a6f2251 100644 --- a/src/tree_builder/mod.rs +++ b/src/tree_builder/mod.rs @@ -33,6 +33,13 @@ use std::mem::replace; use std::borrow::Cow::Borrowed; use std::collections::VecDeque; +pub use rcdom::ElementEnum::{AnnotationXml, Normal, Template}; +pub use rcdom::NodeEnum::{Document, Comment}; +use std::cell::RefCell; +use std::rc::{Rc, Weak}; + +use rcdom::{Node, Handle, RcDom, NodeEnum, ElementEnum}; + #[macro_use] mod tag_sets; // "pub" is a workaround for rust#18241 (?) pub mod interface; @@ -135,6 +142,9 @@ pub struct TreeBuilder { /// The context element for the fragment parsing algorithm. context_elem: Option, + /// Track current line + current_line: u64, + // WARNING: If you add new fields that contain Handles, you // must add them to trace_handles() below to preserve memory // safety! @@ -168,6 +178,7 @@ impl TreeBuilder ignore_lf: false, foster_parenting: false, context_elem: None, + current_line: 1, } } @@ -199,6 +210,7 @@ impl TreeBuilder ignore_lf: false, foster_parenting: false, context_elem: Some(context_elem), + current_line: 1, }; // https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments @@ -374,6 +386,9 @@ impl TokenSink type Handle = Handle; fn process_token(&mut self, token: tokenizer::Token, line_number: u64) -> TokenSinkResult { + if line_number != self.current_line { + self.sink.set_current_line(line_number); + } let ignore_lf = replace(&mut self.ignore_lf, false); // Handle `ParseError` and `DoctypeToken`; convert everything else to the local `Token` type. @@ -435,3 +450,157 @@ impl TokenSink self.sink.elem_name(self.adjusted_current_node()).ns != ns!(html) } } + +#[cfg(test)] +#[allow(non_snake_case)] +mod test { + use super::interface::{QuirksMode, Quirks, LimitedQuirks, NoQuirks}; + use super::interface::{NodeOrText, AppendNode, AppendText}; + use super::interface::{TreeSink, Tracer}; + + use super::types::*; + use super::actions::TreeBuilderActions; + use super::rules::TreeBuilderStep; + + use QualName; + use tendril::StrTendril; + use tendril::stream::{TendrilSink, Utf8LossyDecoder, LossyDecoder}; + + use tokenizer; + use tokenizer::{Tokenizer, TokenizerOpts}; + use tokenizer::{Doctype, StartTag, Tag, TokenSink}; + use tokenizer::states as tok_state; + + use util::str::is_ascii_whitespace; + + use std::default::Default; + use std::mem::replace; + use std::borrow::Cow; + use std::borrow::Cow::Borrowed; + use std::collections::VecDeque; + + use driver::*; + use super::{TreeBuilderOpts, TreeBuilder}; + use tokenizer::Attribute; + use rcdom::{Node, Handle, RcDom, NodeEnum, ElementEnum}; + + pub struct LineCountingDOM { + pub line_vec: Vec<(QualName, u64)>, + pub rcdom: RcDom, + } + + impl TreeSink for LineCountingDOM { + type Output = Self; + + fn finish(self) -> Self { self } + + type Handle = Handle; + + fn parse_error(&mut self, msg: Cow<'static, str>) { + self.rcdom.parse_error(msg); + } + + fn get_document(&mut self) -> Handle { + self.rcdom.get_document() + } + + fn get_template_contents(&mut self, target: Handle) -> Handle { + self.rcdom.get_template_contents(target) + } + + fn set_quirks_mode(&mut self, mode: QuirksMode) { + self.rcdom.set_quirks_mode(mode) + } + + fn same_node(&self, x: Handle, y: Handle) -> bool { + self.rcdom.same_node(x, y) + } + + fn elem_name(&self, target: Handle) -> QualName { + self.rcdom.elem_name(target) + } + + fn create_element(&mut self, name: QualName, attrs: Vec) -> Handle { + self.line_vec.push((name.clone(), self.rcdom.current_line)); + self.rcdom.create_element(name, attrs) + } + + fn create_comment(&mut self, text: StrTendril) -> Handle { + self.rcdom.create_comment(text) + } + + fn append(&mut self, parent: Handle, child: NodeOrText) { + self.rcdom.append(parent, child) + } + + fn append_before_sibling(&mut self, + sibling: Handle, + child: NodeOrText) -> Result<(), NodeOrText> { + self.rcdom.append_before_sibling(sibling, child) + } + + fn append_doctype_to_document(&mut self, + name: StrTendril, + public_id: StrTendril, + system_id: StrTendril) { + self.rcdom.append_doctype_to_document(name, public_id, system_id); + } + + fn add_attrs_if_missing(&mut self, target: Handle, attrs: Vec) { + self.rcdom.add_attrs_if_missing(target, attrs); + } + + fn remove_from_parent(&mut self, target: Handle) { + self.rcdom.remove_from_parent(target); + } + + fn reparent_children(&mut self, node: Handle, new_parent: Handle) { + self.rcdom.reparent_children(node, new_parent); + } + + fn mark_script_already_started(&mut self, target: Handle) { + self.rcdom.mark_script_already_started(target); + } + + fn is_mathml_annotation_xml_integration_point(&self, handle: Self::Handle) -> bool { + self.rcdom.is_mathml_annotation_xml_integration_point(handle) + } + + fn set_current_line(&mut self, line_number: u64) { + self.set_current_line(line_number); + self.rcdom.set_current_line(line_number); + } + } + + impl Default for LineCountingDOM { + fn default() -> LineCountingDOM { + LineCountingDOM { + line_vec: vec!(), + rcdom: RcDom::default(), + } + } + } + + #[test] + fn check_four_lines() { + // Input + let sink = LineCountingDOM::default(); + let opts = ParseOpts::default(); + let mut resultTok = parse_document(sink, opts); + resultTok.process(StrTendril::from("")); + resultTok.process(StrTendril::from("")); + resultTok.process(StrTendril::from("")); + resultTok.process(StrTendril::from("")); + // Actual Output + let actual = resultTok.finish(); + // Expected Output + let expected = vec![(qualname!(html, "html"), 1), + (qualname!(html, "head"), 1), + (qualname!(html, "body"), 1), + (qualname!(html, "a"), 1), + (qualname!(html, "b"), 1)]; + let result = actual.line_vec.clone(); + // Assertion + assert_eq!(result, expected); + } +} \ No newline at end of file From df8982bd1b6cebd6152f2fff63b9f6af4f49c23c Mon Sep 17 00:00:00 2001 From: karenher Date: Fri, 23 Dec 2016 19:35:20 -0500 Subject: [PATCH 5/5] Fixed style issues and test. Added '\n' to pop_except_from function call in step function to account to make sure current_line is updated. --- benches/tokenizer.rs | 2 +- examples/noop-tokenize.rs | 2 +- examples/tokenize.rs | 2 +- src/rcdom.rs | 8 +------- src/tokenizer/mod.rs | 41 ++++++++++++++++++++++----------------- src/tree_builder/mod.rs | 39 +++++++++++++------------------------ 6 files changed, 40 insertions(+), 54 deletions(-) diff --git a/benches/tokenizer.rs b/benches/tokenizer.rs index 9e6a609f..cdf1dbc8 100644 --- a/benches/tokenizer.rs +++ b/benches/tokenizer.rs @@ -29,7 +29,7 @@ struct Sink; impl TokenSink for Sink { type Handle = (); - fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult<()> { + fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> { // Don't use the token, but make sure we don't get // optimized out entirely. black_box(token); diff --git a/examples/noop-tokenize.rs b/examples/noop-tokenize.rs index c041dfe9..4fbc86a6 100644 --- a/examples/noop-tokenize.rs +++ b/examples/noop-tokenize.rs @@ -25,7 +25,7 @@ struct Sink(Vec); impl TokenSink for Sink { type Handle = (); - fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult<()> { + fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> { // Don't use the token, but make sure we don't get // optimized out entirely. self.0.push(token); diff --git a/examples/tokenize.rs b/examples/tokenize.rs index 94d242ed..0b2ab39d 100644 --- a/examples/tokenize.rs +++ b/examples/tokenize.rs @@ -43,7 +43,7 @@ impl TokenPrinter { impl TokenSink for TokenPrinter { type Handle = (); - fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult<()> { + fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> { match token { CharacterTokens(b) => { for c in b.chars() { diff --git a/src/rcdom.rs b/src/rcdom.rs index 12f385e6..0e187528 100644 --- a/src/rcdom.rs +++ b/src/rcdom.rs @@ -161,9 +161,6 @@ pub struct RcDom { /// The document's quirks mode. pub quirks_mode: QuirksMode, - - /// The current line being parsed - pub current_line: u64, } impl TreeSink for RcDom { @@ -330,9 +327,7 @@ impl TreeSink for RcDom { } } - fn set_current_line(&mut self, line_number: u64) { - self.current_line = line_number; - } + fn set_current_line(&mut self, _line_number: u64) { } } impl Default for RcDom { @@ -341,7 +336,6 @@ impl Default for RcDom { document: new_node(Document), errors: vec!(), quirks_mode: tree_builder::NoQuirks, - current_line: 1, } } } diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index c0bcd13f..5ea64305 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -698,7 +698,7 @@ impl Tokenizer { match self.state { //§ data-state states::Data => loop { - match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<')) { + match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) { FromSet('\0') => go!(self: error; emit '\0'), FromSet('&') => go!(self: consume_char_ref), FromSet('<') => go!(self: to TagOpen), @@ -709,7 +709,7 @@ impl Tokenizer { //§ rcdata-state states::RawData(Rcdata) => loop { - match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<')) { + match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) { FromSet('\0') => go!(self: error; emit '\u{fffd}'), FromSet('&') => go!(self: consume_char_ref), FromSet('<') => go!(self: to RawLessThanSign Rcdata), @@ -720,7 +720,7 @@ impl Tokenizer { //§ rawtext-state states::RawData(Rawtext) => loop { - match pop_except_from!(self, input, small_char_set!('\r' '\0' '<')) { + match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) { FromSet('\0') => go!(self: error; emit '\u{fffd}'), FromSet('<') => go!(self: to RawLessThanSign Rawtext), FromSet(c) => go!(self: emit c), @@ -730,7 +730,7 @@ impl Tokenizer { //§ script-data-state states::RawData(ScriptData) => loop { - match pop_except_from!(self, input, small_char_set!('\r' '\0' '<')) { + match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) { FromSet('\0') => go!(self: error; emit '\u{fffd}'), FromSet('<') => go!(self: to RawLessThanSign ScriptData), FromSet(c) => go!(self: emit c), @@ -740,7 +740,7 @@ impl Tokenizer { //§ script-data-escaped-state states::RawData(ScriptDataEscaped(Escaped)) => loop { - match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<')) { + match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) { FromSet('\0') => go!(self: error; emit '\u{fffd}'), FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash Escaped), FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped), @@ -751,7 +751,7 @@ impl Tokenizer { //§ script-data-double-escaped-state states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop { - match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<')) { + match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) { FromSet('\0') => go!(self: error; emit '\u{fffd}'), FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped), FromSet('<') => go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped), @@ -762,7 +762,7 @@ impl Tokenizer { //§ plaintext-state states::Plaintext => loop { - match pop_except_from!(self, input, small_char_set!('\r' '\0')) { + match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) { FromSet('\0') => go!(self: error; emit '\u{fffd}'), FromSet(c) => go!(self: emit c), NotFromSet(b) => self.emit_chars(b), @@ -982,7 +982,7 @@ impl Tokenizer { //§ attribute-value-(double-quoted)-state states::AttributeValue(DoubleQuoted) => loop { - match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0')) { + match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) { FromSet('"') => go!(self: to AfterAttributeValueQuoted), FromSet('&') => go!(self: consume_char_ref '"'), FromSet('\0') => go!(self: error; push_value '\u{fffd}'), @@ -993,7 +993,7 @@ impl Tokenizer { //§ attribute-value-(single-quoted)-state states::AttributeValue(SingleQuoted) => loop { - match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0')) { + match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) { FromSet('\'') => go!(self: to AfterAttributeValueQuoted), FromSet('&') => go!(self: consume_char_ref '\''), FromSet('\0') => go!(self: error; push_value '\u{fffd}'), @@ -1411,7 +1411,6 @@ mod test { use super::option_push; // private items use tendril::{StrTendril, SliceExt}; - use std::io::{self}; use super::{TokenSink, Tokenizer, TokenizerOpts, TokenSinkResult}; use super::interface::{Token, DoctypeToken, TagToken, CommentToken}; @@ -1420,7 +1419,6 @@ mod test { use super::buffer_queue::{BufferQueue}; use std::mem::replace; - use std::borrow::Cow::{self, Borrowed}; use {LocalName}; @@ -1444,7 +1442,6 @@ mod test { fn push(&mut self, token: Token, line_number: u64) { self.finish_str(); - // self.tokens.push(token); self.lines.push((token, line_number)); } @@ -1546,11 +1543,15 @@ mod test { #[test] fn check_lines() { - let opts = TokenizerOpts { exact_errors: false, discard_bom: true, profile: false, - initial_state: None, last_start_tag_name: None, + let opts = TokenizerOpts { + exact_errors: false, + discard_bom: true, + profile: false, + initial_state: None, + last_start_tag_name: None, }; - let vector = vec![StrTendril::from("\r"), StrTendril::from("\r"), - StrTendril::from("\r"), StrTendril::from("\r")]; + let vector = vec![StrTendril::from("\n"), StrTendril::from("\n"), + StrTendril::from("\n"), StrTendril::from("\n")]; let expected = vec![(create_tag(StrTendril::from("a"), StartTag), 1), (create_tag(StrTendril::from("b"), StartTag), 2), (create_tag(StrTendril::from("b"), EndTag), 3), @@ -1561,8 +1562,12 @@ mod test { #[test] fn check_lines_with_new_line() { - let opts = TokenizerOpts { exact_errors: false, discard_bom: true, profile: false, - initial_state: None, last_start_tag_name: None, + let opts = TokenizerOpts { + exact_errors: false, + discard_bom: true, + profile: false, + initial_state: None, + last_start_tag_name: None, }; let vector = vec![StrTendril::from("\r\n"), StrTendril::from("\r\n"), StrTendril::from("\r\n"), StrTendril::from("\r\n")]; diff --git a/src/tree_builder/mod.rs b/src/tree_builder/mod.rs index 5a6f2251..7eaa7ce6 100644 --- a/src/tree_builder/mod.rs +++ b/src/tree_builder/mod.rs @@ -33,13 +33,6 @@ use std::mem::replace; use std::borrow::Cow::Borrowed; use std::collections::VecDeque; -pub use rcdom::ElementEnum::{AnnotationXml, Normal, Template}; -pub use rcdom::NodeEnum::{Document, Comment}; -use std::cell::RefCell; -use std::rc::{Rc, Weak}; - -use rcdom::{Node, Handle, RcDom, NodeEnum, ElementEnum}; - #[macro_use] mod tag_sets; // "pub" is a workaround for rust#18241 (?) pub mod interface; @@ -486,6 +479,7 @@ mod test { pub struct LineCountingDOM { pub line_vec: Vec<(QualName, u64)>, + pub current_line: u64, pub rcdom: RcDom, } @@ -521,7 +515,7 @@ mod test { } fn create_element(&mut self, name: QualName, attrs: Vec) -> Handle { - self.line_vec.push((name.clone(), self.rcdom.current_line)); + self.line_vec.push((name.clone(), self.current_line)); self.rcdom.create_element(name, attrs) } @@ -567,29 +561,23 @@ mod test { } fn set_current_line(&mut self, line_number: u64) { - self.set_current_line(line_number); - self.rcdom.set_current_line(line_number); - } - } - - impl Default for LineCountingDOM { - fn default() -> LineCountingDOM { - LineCountingDOM { - line_vec: vec!(), - rcdom: RcDom::default(), - } + self.current_line = line_number; } } #[test] fn check_four_lines() { // Input - let sink = LineCountingDOM::default(); + let sink = LineCountingDOM { + line_vec: vec!(), + current_line: 1, + rcdom: RcDom::default(), + }; let opts = ParseOpts::default(); let mut resultTok = parse_document(sink, opts); - resultTok.process(StrTendril::from("")); - resultTok.process(StrTendril::from("")); - resultTok.process(StrTendril::from("")); + resultTok.process(StrTendril::from("\n")); + resultTok.process(StrTendril::from("\n")); + resultTok.process(StrTendril::from("\n")); resultTok.process(StrTendril::from("")); // Actual Output let actual = resultTok.finish(); @@ -598,9 +586,8 @@ mod test { (qualname!(html, "head"), 1), (qualname!(html, "body"), 1), (qualname!(html, "a"), 1), - (qualname!(html, "b"), 1)]; - let result = actual.line_vec.clone(); + (qualname!(html, "b"), 3)]; // Assertion - assert_eq!(result, expected); + assert_eq!(actual.line_vec, expected); } } \ No newline at end of file