diff --git a/examples/noop-tokenize.rs b/examples/noop-tokenize.rs
index 1310d54c..1eb5764a 100644
--- a/examples/noop-tokenize.rs
+++ b/examples/noop-tokenize.rs
@@ -22,7 +22,7 @@ use html5ever::tokenizer::{TokenSink, Token, Tokenizer};
 struct Sink(Vec<Token>);
 
 impl TokenSink for Sink {
-    fn process_token(&mut self, token: Token) {
+    fn process_token(&mut self, token: Token, _line_number: u64) {
         // Don't use the token, but make sure we don't get
         // optimized out entirely.
         self.0.push(token);
diff --git a/examples/tokenize.rs b/examples/tokenize.rs
index 08dd1ae6..2d2653ca 100644
--- a/examples/tokenize.rs
+++ b/examples/tokenize.rs
@@ -40,7 +40,7 @@ impl TokenPrinter {
 }
 
 impl TokenSink for TokenPrinter {
-    fn process_token(&mut self, token: Token) {
+    fn process_token(&mut self, token: Token, _line_number: u64) {
         match token {
             CharacterTokens(b) => {
                 for c in b.chars() {
diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs
index d552b28d..07401a46 100644
--- a/src/tokenizer/interface.rs
+++ b/src/tokenizer/interface.rs
@@ -100,7 +100,7 @@ unsafe impl Send for Token { }
 /// Types which can receive tokens from the tokenizer.
 pub trait TokenSink {
     /// Process a token.
-    fn process_token(&mut self, token: Token);
+    fn process_token(&mut self, token: Token, line_number: u64);
 
     /// Used in the markup declaration open state. By default, this always
     /// returns false and thus all CDATA sections are tokenized as bogus
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index ea23e490..94678d00 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -160,6 +160,9 @@ pub struct Tokenizer<Sink> {
     /// Record of how many ns we spent in the token sink.
     time_in_sink: u64,
+
+    /// Current line number of the input, starting at 1.
+    current_line: u64,
 }
 
 impl<Sink: TokenSink> Tokenizer<Sink> {
@@ -192,6 +195,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             temp_buf: StrTendril::new(),
             state_profile: BTreeMap::new(),
             time_in_sink: 0,
+            current_line: 1,
         }
     }
@@ -227,10 +231,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
     fn process_token(&mut self, token: Token) {
         if self.opts.profile {
-            let (_, dt) = time!(self.sink.process_token(token));
+            let (_, dt) = time!(self.sink.process_token(token, self.current_line));
             self.time_in_sink += dt;
         } else {
-            self.sink.process_token(token);
+            self.sink.process_token(token, self.current_line);
         }
     }
@@ -248,6 +252,12 @@
         if c == '\r' {
             self.ignore_lf = true;
             c = '\n';
         }
+        // Count every newline exactly once: '\r' has just been normalized to
+        // '\n' above, and the '\n' of a "\r\n" pair is skipped via `ignore_lf`,
+        // so bare '\n', bare '\r', and "\r\n" each advance the line counter.
+        if c == '\n' {
+            self.current_line += 1;
+        }
         if self.opts.exact_errors && match c as u32 {
@@ -1336,12 +1342,91 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
     }
 }
 
+/// A `TokenSink` for the tests below: collects the tokens it is fed and
+/// records the line number passed with each `process_token` call.
+#[cfg(test)]
+struct TokenMatch {
+    tokens: Vec<Token>,
+    current_str: StrTendril,
+    exact_errors: bool,
+    lines: Vec<u64>,
+}
+
+#[cfg(test)]
+impl TokenMatch {
+    fn new(exact_errors: bool) -> TokenMatch {
+        TokenMatch {
+            tokens: vec!(),
+            current_str: StrTendril::new(),
+            exact_errors: exact_errors,
+            lines: vec!(),
+        }
+    }
+
+    // Push anything other than character tokens.
+    fn push(&mut self, token: Token) {
+        self.finish_str();
+        self.tokens.push(token);
+    }
+
+    fn finish_str(&mut self) {
+        if self.current_str.len() > 0 {
+            let s = replace(&mut self.current_str, StrTendril::new());
+            self.tokens.push(CharacterTokens(s));
+        }
+    }
+}
+
+#[cfg(test)]
+impl TokenSink for TokenMatch {
+    fn process_token(&mut self, token: Token, line_number: u64) {
+        // Record the line number of every token, including EOF.
+        self.lines.push(line_number);
+        match token {
+            CharacterTokens(b) => {
+                self.current_str.push_slice(&b);
+            }
+
+            NullCharacterToken => {
+                self.current_str.push_char('\0');
+            }
+
+            ParseError(_) => if self.exact_errors {
+                self.push(ParseError(Borrowed("")));
+            },
+
+            TagToken(mut t) => {
+                // The spec seems to indicate that one can emit
+                // erroneous end tags with attrs, but the test
+                // cases don't contain them.
+                match t.kind {
+                    EndTag => {
+                        t.self_closing = false;
+                        t.attrs = vec!();
+                    }
+                    _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
+                }
+                self.push(TagToken(t));
+            }
+
+            EOFToken => (),
+
+            _ => self.push(token),
+        }
+    }
+}
+
 #[cfg(test)]
 #[allow(non_snake_case)]
 mod test {
     use super::option_push; // private items
     use tendril::{StrTendril, SliceExt};
 
+    use super::{TokenMatch, Tokenizer, TokenizerOpts};
+
     #[test]
     fn push_to_None_gives_singleton() {
         let mut s: Option<StrTendril> = None;
@@ -1362,4 +1447,60 @@ mod test {
         option_push(&mut s, 'x');
         assert_eq!(s, Some("yx".to_tendril()));
     }
+
+    fn default_opts() -> TokenizerOpts {
+        TokenizerOpts {
+            exact_errors: false,
+            discard_bom: true,
+            profile: false,
+            initial_state: None,
+            last_start_tag_name: None,
+        }
+    }
+
+    fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<u64> {
+        let sink = TokenMatch::new(opts.exact_errors);
+        let mut tok = Tokenizer::new(sink, opts);
+        for chunk in input.into_iter() {
+            tok.feed(chunk);
+        }
+        tok.end();
+        tok.sink.lines
+    }
+
+    #[test]
+    fn check_four_lines() {
+        // Each '\r' is normalized to '\n' and advances the line counter
+        // before the newline token is emitted, so every newline token is
+        // reported on the line it opens, and EOF on the final line.
+        let vector = vec![StrTendril::from("a\r"), StrTendril::from("b\r"),
+                          StrTendril::from("c\r"), StrTendril::from("d\r")];
+        let expected: Vec<u64> = vec![1, 2, 2, 3, 3, 4, 4, 5, 5];
+
+        assert_eq!(tokenize(vector, default_opts()), expected);
+    }
+
+    #[test]
+    fn check_one_line_tag() {
+        // A single tag on one line: one TagToken plus the EOF token,
+        // both reported on line 1.
+        let vector = vec![StrTendril::from("<html>")];
+        let expected: Vec<u64> = vec![1, 1];
+
+        assert_eq!(tokenize(vector, default_opts()), expected);
+    }
+
+    #[test]
+    fn check_tags_on_same_line() {
+        // Start tag, one character run, end tag, and EOF: four tokens,
+        // all reported on line 1.
+        let vector = vec![StrTendril::from("<a>"),
+                          StrTendril::from("www.google.com"),
+                          StrTendril::from("</a>")];
+        let expected: Vec<u64> = vec![1, 1, 1, 1];
+
+        assert_eq!(tokenize(vector, default_opts()), expected);
+    }
 }
diff --git a/src/tree_builder/mod.rs b/src/tree_builder/mod.rs
index 5bbc1027..7dc6c549 100644
--- a/src/tree_builder/mod.rs
+++ b/src/tree_builder/mod.rs
@@ -361,7 +361,7 @@ impl<Handle, Sink> TokenSink for TreeBuilder<Handle, Sink>
     where Handle: Clone,
           Sink: TreeSink<Handle=Handle>,
 {
-    fn process_token(&mut self, token: tokenizer::Token) {
+    fn process_token(&mut self, token: tokenizer::Token, _line_number: u64) {
         let ignore_lf = replace(&mut self.ignore_lf, false);
 
         // Handle `ParseError` and `DoctypeToken`; convert everything else to the local `Token` type.
diff --git a/tests/tokenizer.rs b/tests/tokenizer.rs
index 2faebfe2..445f83de 100644
--- a/tests/tokenizer.rs
+++ b/tests/tokenizer.rs
@@ -96,7 +96,7 @@ impl TokenLogger {
 }
 
 impl TokenSink for TokenLogger {
-    fn process_token(&mut self, token: Token) {
+    fn process_token(&mut self, token: Token, _line_number: u64) {
         match token {
             CharacterTokens(b) => {
                 self.current_str.push_slice(&b);
@@ -254,8 +254,10 @@ fn json_to_tokens(js: &Json, exact_errors: bool) -> Vec<Token> {
     for tok in js.get_list().iter() {
+        // The JSON fixtures carry no position information, so pass a dummy
+        // line number of 0.
         match *tok {
             Json::String(ref s)
                 if &s[..] == "ParseError" => sink.process_token(ParseError(Borrowed("")), 0),
-            _ => sink.process_token(json_to_token(tok)),
+            _ => sink.process_token(json_to_token(tok), 0),
         }
     }
     sink.get_tokens()
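For downstream sinks the migration is mechanical: accept the extra `u64` argument, then use or ignore it. Below is a minimal sketch of a sink that consumes the new argument; the `LineReporter` name and the sample input are invented for illustration, construction and feeding mirror the `tokenize` test helper in the patch, and `TokenizerOpts` is assumed to provide its usual `Default` impl.

    use html5ever::tokenizer::{Token, TokenSink, Tokenizer, TokenizerOpts};
    use tendril::StrTendril;

    /// Hypothetical sink that reports the line on which each token arrives.
    struct LineReporter;

    impl TokenSink for LineReporter {
        fn process_token(&mut self, token: Token, line_number: u64) {
            // Only the line number is used here; the token itself is dropped.
            let _ = token;
            println!("token on line {}", line_number);
        }
    }

    fn main() {
        let mut tok = Tokenizer::new(LineReporter, TokenizerOpts::default());
        tok.feed(StrTendril::from("<p>line one\nline two</p>"));
        tok.end();
    }

One caveat of the counting strategy: only newlines seen in the raw input ('\n', '\r', or "\r\n") advance `current_line`. A newline produced by decoding a character reference such as "&#10;" is emitted through `emit_char` without passing through input preprocessing, so it does not advance the counter.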