From 43f09846cb6e44a5e0a3d7716ac29fbbe092ad1e Mon Sep 17 00:00:00 2001 From: Anthony Ramine <n.oxyde@gmail.com> Date: Mon, 24 Oct 2016 10:08:15 +0200 Subject: [PATCH 1/5] Silence two warnings in examples --- examples/print-rcdom.rs | 2 +- examples/tokenize.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/print-rcdom.rs b/examples/print-rcdom.rs index 016a9595..7b9a8738 100644 --- a/examples/print-rcdom.rs +++ b/examples/print-rcdom.rs @@ -13,7 +13,7 @@ extern crate html5ever; extern crate string_cache; extern crate tendril; -use std::io::{self, Read}; +use std::io; use std::iter::repeat; use std::default::Default; use std::string::String; diff --git a/examples/tokenize.rs b/examples/tokenize.rs index 08dd1ae6..02be7c55 100644 --- a/examples/tokenize.rs +++ b/examples/tokenize.rs @@ -10,7 +10,7 @@ extern crate tendril; extern crate html5ever; -use std::io::{self, Read}; +use std::io; use std::default::Default; use tendril::{ByteTendril, ReadExt}; From b502a3ed5ef3862e5913e04fe615bb65b31ad60b Mon Sep 17 00:00:00 2001 From: Anthony Ramine <n.oxyde@gmail.com> Date: Mon, 24 Oct 2016 10:08:30 +0200 Subject: [PATCH 2/5] Use a saner, smaller enum for query_state_change --- src/tokenizer/interface.rs | 8 +++++++- src/tokenizer/mod.rs | 31 +++++++++++++++++-------------- src/tokenizer/states.rs | 1 - src/tree_builder/actions.rs | 4 ++-- src/tree_builder/mod.rs | 6 +++--- src/tree_builder/rules.rs | 8 ++++---- 6 files changed, 33 insertions(+), 25 deletions(-) diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs index d552b28d..ebb37b93 100644 --- a/src/tokenizer/interface.rs +++ b/src/tokenizer/interface.rs @@ -97,6 +97,12 @@ pub enum Token { // FIXME: rust-lang/rust#22629 unsafe impl Send for Token { } +pub enum StateChangeQuery { + Plaintext, + Quiescent, + RawData(states::RawKind), +} + /// Types which can receive tokens from the tokenizer. pub trait TokenSink { /// Process a token. 
@@ -113,7 +119,7 @@ pub trait TokenSink { /// The tokenizer will call this after emitting any tag. /// This allows the tree builder to change the tokenizer's state. /// By default no state changes occur. - fn query_state_change(&mut self) -> Option<states::State> { + fn query_state_change(&mut self) -> Option<StateChangeQuery> { None } } diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 18a90ae6..09a86cf6 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -14,7 +14,7 @@ pub use self::interface::{Doctype, Attribute, TagKind, StartTag, EndTag, Tag}; pub use self::interface::{Token, DoctypeToken, TagToken, CommentToken}; pub use self::interface::{CharacterTokens, NullCharacterToken, EOFToken, ParseError}; -pub use self::interface::TokenSink; +pub use self::interface::{StateChangeQuery, TokenSink}; use self::states::{RawLessThanSign, RawEndTagOpen, RawEndTagName}; use self::states::{Rcdata, Rawtext, ScriptData, ScriptDataEscaped}; @@ -364,7 +364,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { self.process_token(CharacterTokens(b)); } - fn emit_current_tag(&mut self) { + fn emit_current_tag(&mut self) -> bool { self.finish_attribute(); let name = Atom::from(&*self.current_tag_name); @@ -392,8 +392,19 @@ impl<Sink: TokenSink> Tokenizer<Sink> { self.process_token(token); match self.sink.query_state_change() { - None => (), - Some(s) => self.state = s, + None => true, + Some(StateChangeQuery::Plaintext) => { + self.state = states::Plaintext; + true + }, + Some(StateChangeQuery::Quiescent) => { + self.state = states::Data; + false + }, + Some(StateChangeQuery::RawData(kind)) => { + self.state = states::RawData(kind); + true + } } } @@ -587,8 +598,7 @@ macro_rules! go ( // We have a default next state after emitting a tag, but the sink can override. 
( $me:ident : emit_tag $s:ident ) => ({ $me.state = states::$s; - $me.emit_current_tag(); - return true; + return $me.emit_current_tag(); }); ( $me:ident : eof ) => ({ $me.emit_eof(); return false; }); @@ -640,13 +650,6 @@ impl Tokenizer { debug!("processing in state {:?}", self.state); match self.state { - // Reachable only through `query_state_change`. The tree builder wants - // the tokenizer to suspend processing. - states::Quiescent => { - self.state = states::Data; - return false; - } - //§ data-state states::Data => loop { match pop_except_from!(self, small_char_set!('\r' '\0' '&' '<')) { @@ -1278,7 +1281,7 @@ impl Tokenizer { debug!("processing EOF in state {:?}", self.state); match self.state { states::Data | states::RawData(Rcdata) | states::RawData(Rawtext) - | states::RawData(ScriptData) | states::Plaintext | states::Quiescent + | states::RawData(ScriptData) | states::Plaintext => go!(self: eof), states::TagName | states::RawData(ScriptDataEscaped(_)) diff --git a/src/tokenizer/states.rs b/src/tokenizer/states.rs index 4b0da1a0..4df67569 100644 --- a/src/tokenizer/states.rs +++ b/src/tokenizer/states.rs @@ -90,5 +90,4 @@ pub enum State { CdataSection, CdataSectionBracket, CdataSectionEnd, - Quiescent, } diff --git a/src/tree_builder/actions.rs b/src/tree_builder/actions.rs index 8f304c6b..edf49d49 100644 --- a/src/tree_builder/actions.rs +++ b/src/tree_builder/actions.rs @@ -17,7 +17,7 @@ use tree_builder::tag_sets::*; use tree_builder::interface::{TreeSink, QuirksMode, NodeOrText, AppendNode, AppendText}; use tree_builder::rules::TreeBuilderStep; -use tokenizer::{Attribute, Tag, StartTag, EndTag}; +use tokenizer::{Attribute, Tag, StartTag, StateChangeQuery, EndTag}; use tokenizer::states::{RawData, RawKind}; use util::str::to_escaped_string; @@ -178,7 +178,7 @@ impl TreeBuilderActions // `process_token` of a start tag returns! 
fn to_raw_text_mode(&mut self, k: RawKind) { assert!(self.next_tokenizer_state.is_none()); - self.next_tokenizer_state = Some(RawData(k)); + self.next_tokenizer_state = Some(StateChangeQuery::RawData(k)); self.orig_mode = Some(self.mode); self.mode = Text; } diff --git a/src/tree_builder/mod.rs b/src/tree_builder/mod.rs index 469a61ca..0272ba16 100644 --- a/src/tree_builder/mod.rs +++ b/src/tree_builder/mod.rs @@ -23,7 +23,7 @@ use string_cache::QualName; use tendril::StrTendril; use tokenizer; -use tokenizer::{Doctype, StartTag, Tag, TokenSink}; +use tokenizer::{Doctype, StartTag, StateChangeQuery, Tag, TokenSink}; use tokenizer::states as tok_state; use util::str::is_ascii_whitespace; @@ -124,7 +124,7 @@ pub struct TreeBuilder<Handle, Sink> { //§ END /// Next state change for the tokenizer, if any. - next_tokenizer_state: Option<tokenizer::states::State>, + next_tokenizer_state: Option<StateChangeQuery>, /// Frameset-ok flag. frameset_ok: bool, @@ -428,7 +428,7 @@ impl<Handle, Sink> TokenSink self.sink.elem_name(self.adjusted_current_node()).ns != ns!(html) } - fn query_state_change(&mut self) -> Option<tokenizer::states::State> { + fn query_state_change(&mut self) -> Option<StateChangeQuery> { self.next_tokenizer_state.take() } } diff --git a/src/tree_builder/rules.rs b/src/tree_builder/rules.rs index 796d1aa0..acfa2a2a 100644 --- a/src/tree_builder/rules.rs +++ b/src/tree_builder/rules.rs @@ -14,8 +14,8 @@ use tree_builder::tag_sets::*; use tree_builder::actions::{NoPush, Push, TreeBuilderActions}; use tree_builder::interface::{TreeSink, Quirks, AppendNode, NextParserState}; -use tokenizer::{Attribute, EndTag, StartTag, Tag}; -use tokenizer::states::{Rcdata, Rawtext, ScriptData, Plaintext, Quiescent}; +use tokenizer::{Attribute, EndTag, StartTag, StateChangeQuery, Tag}; +use tokenizer::states::{Rcdata, Rawtext, ScriptData, Plaintext}; use util::str::is_ascii_whitespace; @@ -438,7 +438,7 @@ impl<Handle, Sink> TreeBuilderStep tag @ <plaintext> => { self.close_p_element_in_button_scope(); self.insert_element_for(tag); - self.next_tokenizer_state = Some(Plaintext); + self.next_tokenizer_state = 
Some(StateChangeQuery::Plaintext); Done } @@ -772,7 +772,7 @@ impl<Handle, Sink> TreeBuilderStep if tag.name == atom!("script") { warn!("FIXME: </script> not fully implemented"); if self.sink.complete_script(node) == NextParserState::Suspend { - self.next_tokenizer_state = Some(Quiescent); + self.next_tokenizer_state = Some(StateChangeQuery::Quiescent); } } self.mode = self.orig_mode.take().unwrap(); From 856fcb26c96e615b97c6be6cda065f180771fccf Mon Sep 17 00:00:00 2001 From: Anthony Ramine <n.oxyde@gmail.com> Date: Tue, 25 Oct 2016 14:06:00 +0200 Subject: [PATCH 3/5] Remove a useless arm in BeforeAttributeValue --- src/tokenizer/mod.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 09a86cf6..4e169520 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -928,7 +928,6 @@ impl<Sink: TokenSink> Tokenizer<Sink> { states::BeforeAttributeValue => loop { match peek!(self) { '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char), '"' => go!(self: discard_char; to AttributeValue DoubleQuoted), - '&' => go!(self: to AttributeValue Unquoted), '\'' => go!(self: discard_char; to AttributeValue SingleQuoted), '\0' => go!(self: discard_char; error; push_value '\u{fffd}'; to AttributeValue Unquoted), '>' => go!(self: discard_char; error; emit_tag Data), From 7fec1da43d430a8d014a4c08b63f325656b99f70 Mon Sep 17 00:00:00 2001 From: Anthony Ramine <n.oxyde@gmail.com> Date: Wed, 26 Oct 2016 10:20:18 +0200 Subject: [PATCH 4/5] Make tokenizer not own the input stream This is the first step towards supporting document.write. 
--- examples/noop-tokenize.rs | 11 +- examples/tokenize.rs | 11 +- src/driver.rs | 10 +- src/lib.rs | 2 +- src/tokenizer/buffer_queue.rs | 7 +- src/tokenizer/char_ref/mod.rs | 114 ++++++++++------ src/tokenizer/mod.rs | 247 ++++++++++++++++++---------------- tests/tokenizer.rs | 6 +- 8 files changed, 238 insertions(+), 170 deletions(-) diff --git a/examples/noop-tokenize.rs b/examples/noop-tokenize.rs index 1310d54c..476e0ef3 100644 --- a/examples/noop-tokenize.rs +++ b/examples/noop-tokenize.rs @@ -18,6 +18,7 @@ use std::default::Default; use tendril::{ByteTendril, ReadExt}; use html5ever::tokenizer::{TokenSink, Token, Tokenizer}; +use html5ever::tokenizer::buffer_queue::BufferQueue; struct Sink(Vec<Token>); @@ -30,11 +31,13 @@ impl TokenSink for Sink { } fn main() { - let mut input = ByteTendril::new(); - io::stdin().read_to_tendril(&mut input).unwrap(); - let input = input.try_reinterpret().unwrap(); + let mut chunk = ByteTendril::new(); + io::stdin().read_to_tendril(&mut chunk).unwrap(); + let mut input = BufferQueue::new(); + input.push_back(chunk.try_reinterpret().unwrap()); let mut tok = Tokenizer::new(Sink(Vec::new()), Default::default()); - tok.feed(input); + tok.feed(&mut input); + assert!(input.is_empty()); tok.end(); } diff --git a/examples/tokenize.rs b/examples/tokenize.rs index 02be7c55..f2d4df9c 100644 --- a/examples/tokenize.rs +++ b/examples/tokenize.rs @@ -17,6 +17,7 @@ use tendril::{ByteTendril, ReadExt}; use html5ever::tokenizer::{TokenSink, Tokenizer, Token, TokenizerOpts, ParseError}; use html5ever::tokenizer::{CharacterTokens, NullCharacterToken, TagToken, StartTag, EndTag}; +use html5ever::tokenizer::buffer_queue::BufferQueue; #[derive(Copy, Clone)] struct TokenPrinter { @@ -80,15 +81,17 @@ fn main() { let mut sink = TokenPrinter { in_char_run: false, }; - let mut input = ByteTendril::new(); - io::stdin().read_to_tendril(&mut input).unwrap(); - let input = input.try_reinterpret().unwrap(); + let mut chunk = ByteTendril::new(); + 
io::stdin().read_to_tendril(&mut chunk).unwrap(); + let mut input = BufferQueue::new(); + input.push_back(chunk.try_reinterpret().unwrap()); let mut tok = Tokenizer::new(sink, TokenizerOpts { profile: true, .. Default::default() }); - tok.feed(input); + tok.feed(&mut input); + assert!(input.is_empty()); tok.end(); sink.is_char(false); } diff --git a/src/driver.rs b/src/driver.rs index 4e9b5b6a..981b70bd 100644 --- a/src/driver.rs +++ b/src/driver.rs @@ -10,6 +10,7 @@ //! High-level interface to the parser. use tokenizer::{Attribute, Tokenizer, TokenizerOpts}; +use tokenizer::buffer_queue::BufferQueue; use tree_builder::{TreeBuilderOpts, TreeBuilder, TreeSink}; use std::borrow::Cow; @@ -41,7 +42,7 @@ pub struct ParseOpts { pub fn parse_document<Sink>(sink: Sink, opts: ParseOpts) -> Parser<Sink> where Sink: TreeSink { let tb = TreeBuilder::new(sink, opts.tree_builder); let tok = Tokenizer::new(tb, opts.tokenizer); - Parser { tokenizer: tok } + Parser { tokenizer: tok, input_buffer: BufferQueue::new() } } /// Parse an HTML fragment @@ -72,18 +73,20 @@ pub fn parse_fragment_for_element<Sink>(sink: Sink, opts: ParseOpts, .. opts.tokenizer }; let tok = Tokenizer::new(tb, tok_opts); - Parser { tokenizer: tok } + Parser { tokenizer: tok, input_buffer: BufferQueue::new() } } /// An HTML parser, /// ready to recieve Unicode input through the `tendril::TendrilSink` trait’s methods. pub struct Parser<Sink> where Sink: TreeSink { pub tokenizer: Tokenizer<TreeBuilder<Sink::Handle, Sink>>, + pub input_buffer: BufferQueue, } impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for Parser<Sink> { fn process(&mut self, t: StrTendril) { - self.tokenizer.feed(t) + self.input_buffer.push_front(t); + self.tokenizer.feed(&mut self.input_buffer) } // FIXME: Is it too noisy to report every character decoding error? 
@@ -94,6 +97,7 @@ impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for Parser<Sink> { type Output = Sink::Output; fn finish(mut self) -> Self::Output { + self.tokenizer.feed(&mut self.input_buffer); self.tokenizer.end(); self.tokenizer.unwrap().unwrap().finish() } diff --git a/src/lib.rs b/src/lib.rs index f394953c..bbf3041f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -43,9 +43,9 @@ mod util { #[macro_use] pub mod smallcharset; } +pub mod serialize; pub mod tokenizer; pub mod tree_builder; -pub mod serialize; pub mod driver; pub mod rcdom; diff --git a/src/tokenizer/buffer_queue.rs b/src/tokenizer/buffer_queue.rs index 9e98516c..e32e39a2 100644 --- a/src/tokenizer/buffer_queue.rs +++ b/src/tokenizer/buffer_queue.rs @@ -38,6 +38,11 @@ impl BufferQueue { } } + /// Returns whether the queue is empty. + pub fn is_empty(&self) -> bool { + self.buffers.is_empty() + } + /// Add a buffer to the beginning of the queue. pub fn push_front(&mut self, buf: StrTendril) { if buf.len32() == 0 { @@ -55,7 +60,7 @@ impl BufferQueue { } /// Look at the next available character, if any. - pub fn peek(&mut self) -> Option<char> { + pub fn peek(&self) -> Option<char> { // Invariant: all buffers in the queue are non-empty. self.buffers.front().map(|b| b.chars().next().unwrap()) } diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs index d1302065..19f925c3 100644 --- a/src/tokenizer/char_ref/mod.rs +++ b/src/tokenizer/char_ref/mod.rs @@ -8,6 +8,7 @@ // except according to those terms. 
use super::{Tokenizer, TokenSink}; +use super::buffer_queue::BufferQueue; use util::str::{is_ascii_alnum}; @@ -113,31 +114,39 @@ impl CharRefTokenizer { } impl CharRefTokenizer { - pub fn step<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status { + pub fn step<Sink: TokenSink>( + &mut self, + tokenizer: &mut Tokenizer<Sink>, + input: &mut BufferQueue) + -> Status { if self.result.is_some() { return Done; } debug!("char ref tokenizer stepping in state {:?}", self.state); match self.state { - Begin => self.do_begin(tokenizer), - Octothorpe => self.do_octothorpe(tokenizer), - Numeric(base) => self.do_numeric(tokenizer, base), - NumericSemicolon => self.do_numeric_semicolon(tokenizer), - Named => self.do_named(tokenizer), - BogusName => self.do_bogus_name(tokenizer), + Begin => self.do_begin(tokenizer, input), + Octothorpe => self.do_octothorpe(tokenizer, input), + Numeric(base) => self.do_numeric(tokenizer, input, base), + NumericSemicolon => self.do_numeric_semicolon(tokenizer, input), + Named => self.do_named(tokenizer, input), + BogusName => self.do_bogus_name(tokenizer, input), } } - fn do_begin<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status { - match unwrap_or_return!(tokenizer.peek(), Stuck) { + fn do_begin<Sink: TokenSink>( + &mut self, + tokenizer: &mut Tokenizer<Sink>, + input: &mut BufferQueue) + -> Status { + match unwrap_or_return!(tokenizer.peek(input), Stuck) { '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(), c if Some(c) == self.addnl_allowed => self.finish_none(), '#' => { - tokenizer.discard_char(); + tokenizer.discard_char(input); self.state = Octothorpe; Progress } @@ -150,11 +159,15 @@ impl CharRefTokenizer { } } - fn do_octothorpe<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status { - let c = unwrap_or_return!(tokenizer.peek(), Stuck); + fn do_octothorpe<Sink: TokenSink>( + &mut self, + tokenizer: &mut Tokenizer<Sink>, + input: &mut BufferQueue) + -> Status { + let c = 
unwrap_or_return!(tokenizer.peek(input), Stuck); match c { 'x' | 'X' => { - tokenizer.discard_char(); + tokenizer.discard_char(input); self.hex_marker = Some(c); self.state = Numeric(16); } @@ -167,11 +180,16 @@ impl CharRefTokenizer { Progress } - fn do_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>, base: u32) -> Status { - let c = unwrap_or_return!(tokenizer.peek(), Stuck); + fn do_numeric<Sink: TokenSink>( + &mut self, + tokenizer: &mut Tokenizer<Sink>, + input: &mut BufferQueue, + base: u32) + -> Status { + let c = unwrap_or_return!(tokenizer.peek(input), Stuck); match c.to_digit(base) { Some(n) => { - tokenizer.discard_char(); + tokenizer.discard_char(input); self.num = self.num.wrapping_mul(base); if self.num > 0x10FFFF { // We might overflow, and the character is definitely invalid. @@ -183,7 +201,7 @@ impl CharRefTokenizer { Progress } - None if !self.seen_digit => self.unconsume_numeric(tokenizer), + None if !self.seen_digit => self.unconsume_numeric(tokenizer, input), None => { self.state = NumericSemicolon; @@ -192,22 +210,30 @@ impl CharRefTokenizer { } } - fn do_numeric_semicolon<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status { - match unwrap_or_return!(tokenizer.peek(), Stuck) { - ';' => tokenizer.discard_char(), + fn do_numeric_semicolon<Sink: TokenSink>( + &mut self, + tokenizer: &mut Tokenizer<Sink>, + input: &mut BufferQueue) + -> Status { + match unwrap_or_return!(tokenizer.peek(input), Stuck) { + ';' => tokenizer.discard_char(input), _ => tokenizer.emit_error(Borrowed("Semicolon missing after numeric character reference")), }; self.finish_numeric(tokenizer) } - fn unconsume_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status { + fn unconsume_numeric<Sink: TokenSink>( + &mut self, + tokenizer: &mut Tokenizer<Sink>, + input: &mut BufferQueue) + -> Status { let mut unconsume = StrTendril::from_char('#'); match self.hex_marker { Some(c) => unconsume.push_char(c), None => (), } 
- tokenizer.unconsume(unconsume); + input.push_front(unconsume); tokenizer.emit_error(Borrowed("Numeric character reference without digits")); self.finish_none() } @@ -245,8 +271,12 @@ impl CharRefTokenizer { self.finish_one(c) } - fn do_named<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status { - let c = unwrap_or_return!(tokenizer.get_char(), Stuck); + fn do_named<Sink: TokenSink>( + &mut self, + tokenizer: &mut Tokenizer<Sink>, + input: &mut BufferQueue) + -> Status { + let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); self.name_buf_mut().push_char(c); match data::NAMED_ENTITIES.get(&self.name_buf()[..]) { // We have either a full match or a prefix of one. @@ -261,7 +291,7 @@ impl CharRefTokenizer { } // Can't continue the match. - None => self.finish_named(tokenizer, Some(c)), + None => self.finish_named(tokenizer, input, Some(c)), } } @@ -272,12 +302,13 @@ impl CharRefTokenizer { tokenizer.emit_error(msg); } - fn unconsume_name<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) { - tokenizer.unconsume(self.name_buf_opt.take().unwrap()); + fn unconsume_name(&mut self, input: &mut BufferQueue) { + input.push_front(self.name_buf_opt.take().unwrap()); } fn finish_named<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>, + input: &mut BufferQueue, end_char: Option<char>) -> Status { match self.name_match { None => { @@ -295,7 +326,7 @@ impl CharRefTokenizer { _ => (), } - self.unconsume_name(tokenizer); + self.unconsume_name(input); self.finish_none() } @@ -344,10 +375,10 @@ impl CharRefTokenizer { }; if unconsume_all { - self.unconsume_name(tokenizer); + self.unconsume_name(input); self.finish_none() } else { - tokenizer.unconsume(StrTendril::from_slice(&self.name_buf()[name_len..])); + input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..])); self.result = Some(CharRef { chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()], num_chars: if c2 == 0 { 1 } else { 2 }, @@ -358,40 +389,47 @@ impl 
CharRefTokenizer { } } - fn do_bogus_name<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status { - let c = unwrap_or_return!(tokenizer.get_char(), Stuck); + fn do_bogus_name<Sink: TokenSink>( + &mut self, + tokenizer: &mut Tokenizer<Sink>, + input: &mut BufferQueue) + -> Status { + let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); self.name_buf_mut().push_char(c); match c { _ if is_ascii_alnum(c) => return Progress, ';' => self.emit_name_error(tokenizer), _ => () } - self.unconsume_name(tokenizer); + self.unconsume_name(input); self.finish_none() } - pub fn end_of_file<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) { + pub fn end_of_file<Sink: TokenSink>( + &mut self, + tokenizer: &mut Tokenizer<Sink>, + input: &mut BufferQueue) { while self.result.is_none() { match self.state { Begin => drop(self.finish_none()), Numeric(_) if !self.seen_digit - => drop(self.unconsume_numeric(tokenizer)), + => drop(self.unconsume_numeric(tokenizer, input)), Numeric(_) | NumericSemicolon => { tokenizer.emit_error(Borrowed("EOF in numeric character reference")); self.finish_numeric(tokenizer); } - Named => drop(self.finish_named(tokenizer, None)), + Named => drop(self.finish_named(tokenizer, input, None)), BogusName => { - self.unconsume_name(tokenizer); + self.unconsume_name(input); self.finish_none(); } Octothorpe => { - tokenizer.unconsume(StrTendril::from_slice("#")); + input.push_front(StrTendril::from_slice("#")); tokenizer.emit_error(Borrowed("EOF after '#' in character reference")); self.finish_none(); } diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 4e169520..fdb7e267 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -38,10 +38,10 @@ use std::collections::BTreeMap; use string_cache::{Atom, QualName}; use tendril::StrTendril; +pub mod buffer_queue; pub mod states; mod interface; mod char_ref; -mod buffer_queue; fn option_push(opt_str: &mut Option<StrTendril>, c: char) { match *opt_str { @@ -100,9 +100,6 
@@ pub struct Tokenizer<Sink> { /// The abstract machine state as described in the spec. state: states::State, - /// Input ready to be tokenized. - input_buffers: BufferQueue, - /// Are we at the end of the file, once buffers have been processed /// completely? This affects whether we will wait for lookahead or not. at_eof: bool, @@ -174,7 +171,6 @@ impl<Sink: TokenSink> Tokenizer<Sink> { sink: sink, state: state, char_ref_tokenizer: None, - input_buffers: BufferQueue::new(), at_eof: false, current_char: '\0', reconsume: false, @@ -208,17 +204,22 @@ impl<Sink: TokenSink> Tokenizer<Sink> { } /// Feed an input string into the tokenizer. - pub fn feed(&mut self, mut input: StrTendril) { + pub fn feed(&mut self, input: &mut BufferQueue) { if input.is_empty() { return; } - if self.discard_bom && input.starts_with("\u{feff}") { - input.pop_front(3); + if self.discard_bom { + if let Some(c) = input.peek() { + if c == '\u{feff}' { + input.next(); + } + } else { + return; + } }; - self.input_buffers.push_back(input); - self.run(); + self.run(input) } pub fn set_plaintext_state(&mut self) { @@ -237,11 +238,15 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ preprocessing-the-input-stream // Get the next input character, which might be the character // 'c' that we already consumed from the buffers. - fn get_preprocessed_char(&mut self, mut c: char) -> Option<char> { + fn get_preprocessed_char( + &mut self, + mut c: char, + input: &mut BufferQueue) + -> Option<char> { if self.ignore_lf { self.ignore_lf = false; if c == '\n' { - c = unwrap_or_return!(self.input_buffers.next(), None); + c = unwrap_or_return!(input.next(), None); } } @@ -266,29 +271,28 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ tokenization // Get the next input character, if one is available. 
- fn get_char(&mut self) -> Option<char> { + fn get_char(&mut self, input: &mut BufferQueue) -> Option<char> { if self.reconsume { self.reconsume = false; Some(self.current_char) } else { - self.input_buffers.next() - .and_then(|c| self.get_preprocessed_char(c)) + input.next().and_then(|c| self.get_preprocessed_char(c, input)) } } - fn pop_except_from(&mut self, set: SmallCharSet) -> Option<SetResult> { + fn pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult> { // Bail to the slow path for various corner cases. // This means that `FromSet` can contain characters not in the set! // It shouldn't matter because the fallback `FromSet` case should // always do the same thing as the `NotFromSet` case. if self.opts.exact_errors || self.reconsume || self.ignore_lf { - return self.get_char().map(|x| FromSet(x)); + return self.get_char(input).map(|x| FromSet(x)); } - let d = self.input_buffers.pop_except_from(set); + let d = input.pop_except_from(set); debug!("got characters {:?}", d); match d { - Some(FromSet(c)) => self.get_preprocessed_char(c).map(|x| FromSet(x)), + Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(|x| FromSet(x)), // NB: We don't set self.current_char for a run of characters not // in the set. It shouldn't matter for the codepaths that use @@ -302,20 +306,32 @@ impl<Sink: TokenSink> Tokenizer<Sink> { // // NB: this doesn't do input stream preprocessing or set the current input // character. 
- fn eat(&mut self, pat: &str, eq: fn(&u8, &u8) -> bool) -> Option<bool> { - match self.input_buffers.eat(pat, eq) { + fn eat( + &mut self, + input: &mut BufferQueue, + pat: &str, + eq: fn(&u8, &u8) -> bool) + -> Option<bool> { + input.push_front(replace(&mut self.temp_buf, StrTendril::new())); + match input.eat(pat, eq) { None if self.at_eof => Some(false), - r => r, + None => { + while let Some(c) = input.next() { + self.temp_buf.push_char(c); + } + None + }, + Some(matched) => Some(matched), } } /// Run the state machine for as long as we can. - pub fn run(&mut self) { + pub fn run(&mut self, input: &mut BufferQueue) { if self.opts.profile { loop { let state = self.state; let old_sink = self.time_in_sink; - let (run, mut dt) = time!(self.step()); + let (run, mut dt) = time!(self.step(input)); dt -= (self.time_in_sink - old_sink); let new = match self.state_profile.get_mut(&state) { Some(x) => { @@ -331,8 +347,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { if !run { break; } } } else { - while self.step() { - } + while self.step(input) {} } } @@ -510,23 +525,19 @@ impl<Sink: TokenSink> Tokenizer<Sink> { self.process_token(EOFToken); } - fn peek(&mut self) -> Option<char> { + fn peek(&mut self, input: &BufferQueue) -> Option<char> { if self.reconsume { Some(self.current_char) } else { - self.input_buffers.peek() + input.peek() } } - fn discard_char(&mut self) { - let c = self.get_char(); + fn discard_char(&mut self, input: &mut BufferQueue) { + let c = self.get_char(input); assert!(c.is_some()); } - fn unconsume(&mut self, buf: StrTendril) { - self.input_buffers.push_front(buf); - } - fn emit_error(&mut self, error: Cow<'static, str>) { self.process_token(ParseError(error)); } @@ -539,7 +550,7 @@ macro_rules! 
shorthand ( ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c); ); ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.push_char($c); ); ( $me:ident : discard_tag ) => ( $me.discard_tag(); ); - ( $me:ident : discard_char ) => ( $me.discard_char(); ); + ( $me:ident : discard_char $input:expr ) => ( $me.discard_char($input); ); ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.push_char($c); ); ( $me:ident : emit_temp ) => ( $me.emit_temp_buf(); ); ( $me:ident : clear_temp ) => ( $me.clear_temp_buf(); ); @@ -619,40 +630,40 @@ macro_rules! go_match ( ( $me:ident : $x:expr, $($pats:pat),+ => $($cmds:tt)* ) // This is a macro because it can cause early return // from the function where it is used. -macro_rules! get_char ( ($me:expr) => ( - unwrap_or_return!($me.get_char(), false) +macro_rules! get_char ( ($me:expr, $input:expr) => ( + unwrap_or_return!($me.get_char($input), false) )); -macro_rules! peek ( ($me:expr) => ( - unwrap_or_return!($me.peek(), false) +macro_rules! peek ( ($me:expr, $input:expr) => ( + unwrap_or_return!($me.peek($input), false) )); -macro_rules! pop_except_from ( ($me:expr, $set:expr) => ( - unwrap_or_return!($me.pop_except_from($set), false) +macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => ( + unwrap_or_return!($me.pop_except_from($input, $set), false) )); -macro_rules! eat ( ($me:expr, $pat:expr) => ( - unwrap_or_return!($me.eat($pat, u8::eq_ignore_ascii_case), false) +macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => ( + unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), false) )); -macro_rules! eat_exact ( ($me:expr, $pat:expr) => ( - unwrap_or_return!($me.eat($pat, u8::eq), false) +macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => ( + unwrap_or_return!($me.eat($input, $pat, u8::eq), false) )); impl<Sink: TokenSink> Tokenizer<Sink> { // Run the state machine for a while. 
// Return true if we should be immediately re-invoked // (this just simplifies control flow vs. break / continue). - fn step(&mut self) -> bool { + fn step(&mut self, input: &mut BufferQueue) -> bool { if self.char_ref_tokenizer.is_some() { - return self.step_char_ref_tokenizer(); + return self.step_char_ref_tokenizer(input); } debug!("processing in state {:?}", self.state); match self.state { //§ data-state states::Data => loop { - match pop_except_from!(self, small_char_set!('\r' '\0' '&' '<')) { + match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<')) { FromSet('\0') => go!(self: error; emit '\0'), FromSet('&') => go!(self: consume_char_ref), FromSet('<') => go!(self: to TagOpen), @@ -663,7 +674,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ rcdata-state states::RawData(Rcdata) => loop { - match pop_except_from!(self, small_char_set!('\r' '\0' '&' '<')) { + match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<')) { FromSet('\0') => go!(self: error; emit '\u{fffd}'), FromSet('&') => go!(self: consume_char_ref), FromSet('<') => go!(self: to RawLessThanSign Rcdata), @@ -674,7 +685,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ rawtext-state states::RawData(Rawtext) => loop { - match pop_except_from!(self, small_char_set!('\r' '\0' '<')) { + match pop_except_from!(self, input, small_char_set!('\r' '\0' '<')) { FromSet('\0') => go!(self: error; emit '\u{fffd}'), FromSet('<') => go!(self: to RawLessThanSign Rawtext), FromSet(c) => go!(self: emit c), @@ -684,7 +695,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ script-data-state states::RawData(ScriptData) => loop { - match pop_except_from!(self, small_char_set!('\r' '\0' '<')) { + match pop_except_from!(self, input, small_char_set!('\r' '\0' '<')) { FromSet('\0') => go!(self: error; emit '\u{fffd}'), FromSet('<') => go!(self: to RawLessThanSign ScriptData), FromSet(c) => go!(self: emit c), @@ -694,7 +705,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ script-data-escaped-state 
states::RawData(ScriptDataEscaped(Escaped)) => loop { - match pop_except_from!(self, small_char_set!('\r' '\0' '-' '<')) { + match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<')) { FromSet('\0') => go!(self: error; emit '\u{fffd}'), FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash Escaped), FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped), @@ -705,7 +716,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ script-data-double-escaped-state states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop { - match pop_except_from!(self, small_char_set!('\r' '\0' '-' '<')) { + match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<')) { FromSet('\0') => go!(self: error; emit '\u{fffd}'), FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped), FromSet('<') => go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped), @@ -716,7 +727,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ plaintext-state states::Plaintext => loop { - match pop_except_from!(self, small_char_set!('\r' '\0')) { + match pop_except_from!(self, input, small_char_set!('\r' '\0')) { FromSet('\0') => go!(self: error; emit '\u{fffd}'), FromSet(c) => go!(self: emit c), NotFromSet(b) => self.emit_chars(b), @@ -724,8 +735,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }, //§ tag-open-state - states::TagOpen => loop { match get_char!(self) { - '!' => go!(self: to MarkupDeclarationOpen), + states::TagOpen => loop { match get_char!(self, input) { + '!' => go!(self: clear_temp; to MarkupDeclarationOpen), '/' => go!(self: to EndTagOpen), '?' 
=> go!(self: error; clear_comment; push_comment '?'; to BogusComment), c => match lower_ascii_letter(c) { @@ -735,7 +746,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }}, //§ end-tag-open-state - states::EndTagOpen => loop { match get_char!(self) { + states::EndTagOpen => loop { match get_char!(self, input) { '>' => go!(self: error; to Data), '\0' => go!(self: error; clear_comment; push_comment '\u{fffd}'; to BogusComment), c => match lower_ascii_letter(c) { @@ -745,7 +756,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }}, //§ tag-name-state - states::TagName => loop { match get_char!(self) { + states::TagName => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName), '/' => go!(self: to SelfClosingStartTag), @@ -755,7 +766,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }}, //§ script-data-escaped-less-than-sign-state - states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop { match get_char!(self) { + states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop { match get_char!(self, input) { '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped), c => match lower_ascii_letter(c) { Some(cl) => go!(self: clear_temp; push_temp cl; emit '<'; emit c; @@ -765,14 +776,14 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }}, //§ script-data-double-escaped-less-than-sign-state - states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop { match get_char!(self) { + states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop { match get_char!(self, input) { '/' => go!(self: clear_temp; emit '/'; to ScriptDataDoubleEscapeEnd), _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped), }}, //§ rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state // otherwise - states::RawLessThanSign(kind) => loop { match get_char!(self) { + states::RawLessThanSign(kind) => loop { match get_char!(self, input) { '/' => go!(self: clear_temp; to RawEndTagOpen kind), '!' 
if kind == ScriptData => go!(self: emit '<'; emit '!'; to ScriptDataEscapeStart Escaped), _ => go!(self: emit '<'; reconsume RawData kind), @@ -780,7 +791,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state states::RawEndTagOpen(kind) => loop { - let c = get_char!(self); + let c = get_char!(self, input); match lower_ascii_letter(c) { Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind), None => go!(self: emit '<'; emit '/'; reconsume RawData kind), @@ -789,7 +800,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state states::RawEndTagName(kind) => loop { - let c = get_char!(self); + let c = get_char!(self, input); if self.have_appropriate_end_tag() { match c { '\t' | '\n' | '\x0C' | ' ' @@ -808,7 +819,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ script-data-double-escape-start-state states::ScriptDataEscapeStart(DoubleEscaped) => loop { - let c = get_char!(self); + let c = get_char!(self, input); match c { '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => { let esc = if &*self.temp_buf == "script" { DoubleEscaped } else { Escaped }; @@ -822,19 +833,19 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }, //§ script-data-escape-start-state - states::ScriptDataEscapeStart(Escaped) => loop { match get_char!(self) { + states::ScriptDataEscapeStart(Escaped) => loop { match get_char!(self, input) { '-' => go!(self: emit '-'; to ScriptDataEscapeStartDash), _ => go!(self: reconsume RawData ScriptData), }}, //§ script-data-escape-start-dash-state - states::ScriptDataEscapeStartDash => loop { match get_char!(self) { + states::ScriptDataEscapeStartDash => loop { match get_char!(self, input) { '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash Escaped), _ => go!(self: reconsume RawData ScriptData), }}, //§ 
script-data-escaped-dash-state script-data-double-escaped-dash-state - states::ScriptDataEscapedDash(kind) => loop { match get_char!(self) { + states::ScriptDataEscapedDash(kind) => loop { match get_char!(self, input) { '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash kind), '<' => { if kind == DoubleEscaped { go!(self: emit '<'); } @@ -845,7 +856,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }}, //§ script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state - states::ScriptDataEscapedDashDash(kind) => loop { match get_char!(self) { + states::ScriptDataEscapedDashDash(kind) => loop { match get_char!(self, input) { '-' => go!(self: emit '-'), '<' => { if kind == DoubleEscaped { go!(self: emit '<'); } @@ -858,7 +869,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ script-data-double-escape-end-state states::ScriptDataDoubleEscapeEnd => loop { - let c = get_char!(self); + let c = get_char!(self, input); match c { '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => { let esc = if &*self.temp_buf == "script" { Escaped } else { DoubleEscaped }; @@ -872,7 +883,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }, //§ before-attribute-name-state - states::BeforeAttributeName => loop { match get_char!(self) { + states::BeforeAttributeName => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => (), '/' => go!(self: to SelfClosingStartTag), '>' => go!(self: emit_tag Data), @@ -888,7 +899,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }}, //§ attribute-name-state - states::AttributeName => loop { match get_char!(self) { + states::AttributeName => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName), '/' => go!(self: to SelfClosingStartTag), @@ -906,7 +917,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }}, //§ after-attribute-name-state - states::AfterAttributeName => loop { match get_char!(self) { + states::AfterAttributeName => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => (), '/' => 
go!(self: to SelfClosingStartTag), '=' => go!(self: to BeforeAttributeValue), @@ -925,18 +936,18 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ before-attribute-value-state // Use peek so we can handle the first attr character along with the rest, // hopefully in the same zero-copy buffer. - states::BeforeAttributeValue => loop { match peek!(self) { - '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char), - '"' => go!(self: discard_char; to AttributeValue DoubleQuoted), - '\'' => go!(self: discard_char; to AttributeValue SingleQuoted), - '\0' => go!(self: discard_char; error; push_value '\u{fffd}'; to AttributeValue Unquoted), - '>' => go!(self: discard_char; error; emit_tag Data), + states::BeforeAttributeValue => loop { match peek!(self, input) { + '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input), + '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted), + '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted), + '\0' => go!(self: discard_char input; error; push_value '\u{fffd}'; to AttributeValue Unquoted), + '>' => go!(self: discard_char input; error; emit_tag Data), _ => go!(self: to AttributeValue Unquoted), }}, //§ attribute-value-(double-quoted)-state states::AttributeValue(DoubleQuoted) => loop { - match pop_except_from!(self, small_char_set!('\r' '"' '&' '\0')) { + match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0')) { FromSet('"') => go!(self: to AfterAttributeValueQuoted), FromSet('&') => go!(self: consume_char_ref '"'), FromSet('\0') => go!(self: error; push_value '\u{fffd}'), @@ -947,7 +958,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ attribute-value-(single-quoted)-state states::AttributeValue(SingleQuoted) => loop { - match pop_except_from!(self, small_char_set!('\r' '\'' '&' '\0')) { + match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0')) { FromSet('\'') => go!(self: to AfterAttributeValueQuoted), FromSet('&') => go!(self: consume_char_ref '\''), 
FromSet('\0') => go!(self: error; push_value '\u{fffd}'), @@ -958,7 +969,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ attribute-value-(unquoted)-state states::AttributeValue(Unquoted) => loop { - match pop_except_from!(self, small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')) { + match pop_except_from!(self, input, small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')) { FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => go!(self: to BeforeAttributeName), FromSet('&') => go!(self: consume_char_ref '>'), @@ -974,7 +985,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }, //§ after-attribute-value-(quoted)-state - states::AfterAttributeValueQuoted => loop { match get_char!(self) { + states::AfterAttributeValueQuoted => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName), '/' => go!(self: to SelfClosingStartTag), @@ -983,7 +994,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }}, //§ self-closing-start-tag-state - states::SelfClosingStartTag => loop { match get_char!(self) { + states::SelfClosingStartTag => loop { match get_char!(self, input) { '>' => { self.current_tag_self_closing = true; go!(self: emit_tag Data); @@ -992,7 +1003,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }}, //§ comment-start-state - states::CommentStart => loop { match get_char!(self) { + states::CommentStart => loop { match get_char!(self, input) { '-' => go!(self: to CommentStartDash), '\0' => go!(self: error; push_comment '\u{fffd}'; to Comment), '>' => go!(self: error; emit_comment; to Data), @@ -1000,7 +1011,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }}, //§ comment-start-dash-state - states::CommentStartDash => loop { match get_char!(self) { + states::CommentStartDash => loop { match get_char!(self, input) { '-' => go!(self: to CommentEnd), '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment), '>' => go!(self: error; emit_comment; to Data), @@ -1008,21 +1019,21 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }}, 
//§ comment-state - states::Comment => loop { match get_char!(self) { + states::Comment => loop { match get_char!(self, input) { '-' => go!(self: to CommentEndDash), '\0' => go!(self: error; push_comment '\u{fffd}'), c => go!(self: push_comment c), }}, //§ comment-end-dash-state - states::CommentEndDash => loop { match get_char!(self) { + states::CommentEndDash => loop { match get_char!(self, input) { '-' => go!(self: to CommentEnd), '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment), c => go!(self: push_comment '-'; push_comment c; to Comment), }}, //§ comment-end-state - states::CommentEnd => loop { match get_char!(self) { + states::CommentEnd => loop { match get_char!(self, input) { '>' => go!(self: emit_comment; to Data), '\0' => go!(self: error; append_comment "--\u{fffd}"; to Comment), '!' => go!(self: error; to CommentEndBang), @@ -1031,7 +1042,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }}, //§ comment-end-bang-state - states::CommentEndBang => loop { match get_char!(self) { + states::CommentEndBang => loop { match get_char!(self, input) { '-' => go!(self: append_comment "--!"; to CommentEndDash), '>' => go!(self: emit_comment; to Data), '\0' => go!(self: error; append_comment "--!\u{fffd}"; to Comment), @@ -1039,14 +1050,14 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }}, //§ doctype-state - states::Doctype => loop { match get_char!(self) { + states::Doctype => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName), _ => go!(self: error; reconsume BeforeDoctypeName), }}, //§ before-doctype-name-state - states::BeforeDoctypeName => loop { match get_char!(self) { + states::BeforeDoctypeName => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => (), '\0' => go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName), '>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data), @@ -1055,9 +1066,9 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }}, //§ 
doctype-name-state - states::DoctypeName => loop { match get_char!(self) { + states::DoctypeName => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' - => go!(self: to AfterDoctypeName), + => go!(self: clear_temp; to AfterDoctypeName), '>' => go!(self: emit_doctype; to Data), '\0' => go!(self: error; push_doctype_name '\u{fffd}'), c => go!(self: push_doctype_name (c.to_ascii_lowercase())), @@ -1065,12 +1076,12 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ after-doctype-name-state states::AfterDoctypeName => loop { - if eat!(self, "public") { + if eat!(self, input, "public") { go!(self: to AfterDoctypeKeyword Public); - } else if eat!(self, "system") { + } else if eat!(self, input, "system") { go!(self: to AfterDoctypeKeyword System); } else { - match get_char!(self) { + match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => (), '>' => go!(self: emit_doctype; to Data), _ => go!(self: error; force_quirks; to BogusDoctype), @@ -1079,7 +1090,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }, //§ after-doctype-public-keyword-state after-doctype-system-keyword-state - states::AfterDoctypeKeyword(kind) => loop { match get_char!(self) { + states::AfterDoctypeKeyword(kind) => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind), '"' => go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind), @@ -1089,7 +1100,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }}, //§ before-doctype-public-identifier-state before-doctype-system-identifier-state - states::BeforeDoctypeIdentifier(kind) => loop { match get_char!(self) { + states::BeforeDoctypeIdentifier(kind) => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => (), '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind), '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind), @@ -1098,7 +1109,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }}, //§ 
doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state - states::DoctypeIdentifierDoubleQuoted(kind) => loop { match get_char!(self) { + states::DoctypeIdentifierDoubleQuoted(kind) => loop { match get_char!(self, input) { '"' => go!(self: to AfterDoctypeIdentifier kind), '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'), '>' => go!(self: error; force_quirks; emit_doctype; to Data), @@ -1106,7 +1117,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }}, //§ doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state - states::DoctypeIdentifierSingleQuoted(kind) => loop { match get_char!(self) { + states::DoctypeIdentifierSingleQuoted(kind) => loop { match get_char!(self, input) { '\'' => go!(self: to AfterDoctypeIdentifier kind), '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'), '>' => go!(self: error; force_quirks; emit_doctype; to Data), @@ -1114,7 +1125,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }}, //§ after-doctype-public-identifier-state - states::AfterDoctypeIdentifier(Public) => loop { match get_char!(self) { + states::AfterDoctypeIdentifier(Public) => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => go!(self: to BetweenDoctypePublicAndSystemIdentifiers), '>' => go!(self: emit_doctype; to Data), @@ -1124,14 +1135,14 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }}, //§ after-doctype-system-identifier-state - states::AfterDoctypeIdentifier(System) => loop { match get_char!(self) { + states::AfterDoctypeIdentifier(System) => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => (), '>' => go!(self: emit_doctype; to Data), _ => go!(self: error; to BogusDoctype), }}, //§ between-doctype-public-and-system-identifiers-state - states::BetweenDoctypePublicAndSystemIdentifiers => loop { match get_char!(self) { + states::BetweenDoctypePublicAndSystemIdentifiers => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => (), '>' => 
go!(self: emit_doctype; to Data), '"' => go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System), @@ -1140,13 +1151,13 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }}, //§ bogus-doctype-state - states::BogusDoctype => loop { match get_char!(self) { + states::BogusDoctype => loop { match get_char!(self, input) { '>' => go!(self: emit_doctype; to Data), _ => (), }}, //§ bogus-comment-state - states::BogusComment => loop { match get_char!(self) { + states::BogusComment => loop { match get_char!(self, input) { '>' => go!(self: emit_comment; to Data), '\0' => go!(self: push_comment '\u{fffd}'), c => go!(self: push_comment c), @@ -1154,13 +1165,13 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ markup-declaration-open-state states::MarkupDeclarationOpen => loop { - if eat_exact!(self, "--") { + if eat_exact!(self, input, "--") { go!(self: clear_comment; to CommentStart); - } else if eat!(self, "doctype") { + } else if eat!(self, input, "doctype") { go!(self: to Doctype); } else { if self.sink.adjusted_current_node_present_but_not_in_html_namespace() { - if eat_exact!(self, "[CDATA[") { + if eat_exact!(self, input, "[CDATA[") { go!(self: clear_temp; to CdataSection); } } @@ -1169,20 +1180,20 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }, //§ cdata-section-state - states::CdataSection => loop { match get_char!(self) { + states::CdataSection => loop { match get_char!(self, input) { ']' => go!(self: to CdataSectionBracket), '\0' => go!(self: emit_temp; emit '\0'), c => go!(self: push_temp c), }}, //§ cdata-section-bracket - states::CdataSectionBracket => match get_char!(self) { + states::CdataSectionBracket => match get_char!(self, input) { ']' => go!(self: to CdataSectionEnd), _ => go!(self: push_temp ']'; reconsume CdataSection), }, //§ cdata-section-end - states::CdataSectionEnd => loop { match get_char!(self) { + states::CdataSectionEnd => loop { match get_char!(self, input) { ']' => go!(self: push_temp ']'), '>' => go!(self: emit_temp; to Data), _ => 
go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection), @@ -1192,11 +1203,11 @@ impl<Sink: TokenSink> Tokenizer<Sink> { } } - fn step_char_ref_tokenizer(&mut self) -> bool { + fn step_char_ref_tokenizer(&mut self, input: &mut BufferQueue) -> bool { // FIXME HACK: Take and replace the tokenizer so we don't // double-mut-borrow self. This is why it's boxed. let mut tok = self.char_ref_tokenizer.take().unwrap(); - let outcome = tok.step(self); + let outcome = tok.step(self, input); let progress = match outcome { char_ref::Done => { @@ -1238,10 +1249,11 @@ impl<Sink: TokenSink> Tokenizer<Sink> { pub fn end(&mut self) { // Handle EOF in the char ref sub-tokenizer, if there is one. // Do this first because it might un-consume stuff. + let mut input = BufferQueue::new(); match self.char_ref_tokenizer.take() { None => (), Some(mut tok) => { - tok.end_of_file(self); + tok.end_of_file(self, &mut input); self.process_char_ref(tok.get_result()); } } @@ -1249,11 +1261,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> { // Process all remaining buffered input. // If we're waiting for lookahead, we're not gonna get it. 
self.at_eof = true; - self.run(); + self.run(&mut input); + assert!(input.is_empty()); - while self.eof_step() { - // loop - } + while self.eof_step() {} if self.opts.profile { self.dump_profile(); diff --git a/tests/tokenizer.rs b/tests/tokenizer.rs index 2faebfe2..93f48549 100644 --- a/tests/tokenizer.rs +++ b/tests/tokenizer.rs @@ -32,6 +32,7 @@ use html5ever::tokenizer::{Doctype, Attribute, StartTag, EndTag, Tag}; use html5ever::tokenizer::{Token, DoctypeToken, TagToken, CommentToken}; use html5ever::tokenizer::{CharacterTokens, NullCharacterToken, EOFToken, ParseError}; use html5ever::tokenizer::{TokenSink, Tokenizer, TokenizerOpts}; +use html5ever::tokenizer::buffer_queue::BufferQueue; use html5ever::tokenizer::states::{Plaintext, RawData, Rcdata, Rawtext}; use string_cache::{Atom, QualName}; @@ -134,9 +135,12 @@ impl TokenSink for TokenLogger { fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<Token> { let sink = TokenLogger::new(opts.exact_errors); let mut tok = Tokenizer::new(sink, opts); + let mut buffer = BufferQueue::new(); for chunk in input.into_iter() { - tok.feed(chunk); + buffer.push_back(chunk); + let _ = tok.feed(&mut buffer); } + tok.feed(&mut buffer); tok.end(); tok.unwrap().get_tokens() } From ceb1bd3200b26f1c6cf905e9d9b98314f3c4d326 Mon Sep 17 00:00:00 2001 From: Anthony Ramine <n.oxyde@gmail.com> Date: Wed, 26 Oct 2016 10:20:51 +0200 Subject: [PATCH 5/5] Bump version to 0.8.0 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 19a25ceb..33eefc03 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "html5ever" -version = "0.7.0" +version = "0.8.0" authors = [ "The html5ever Project Developers" ] license = "MIT / Apache-2.0" repository = "https://github.com/servo/html5ever"