From c74e3df84cd7b085976ecd82ed5e570baba5a568 Mon Sep 17 00:00:00 2001
From: Anthony Ramine
Date: Wed, 19 Aug 2015 01:57:09 +0200
Subject: [PATCH 1/2] Introduce eat_exact!()

For case-sensitive comparisons.
---
 src/tokenizer/buffer_queue.rs | 11 ++++++-----
 src/tokenizer/mod.rs          | 12 ++++++++----
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/src/tokenizer/buffer_queue.rs b/src/tokenizer/buffer_queue.rs
index 19a63bad..9e98516c 100644
--- a/src/tokenizer/buffer_queue.rs
+++ b/src/tokenizer/buffer_queue.rs
@@ -114,7 +114,7 @@ impl BufferQueue {
     // If so, consume them and return Some(true).
     // If they do not match, return Some(false).
     // If not enough characters are available to know, return None.
-    pub fn eat(&mut self, pat: &str) -> Option<bool> {
+    pub fn eat<F: Fn(&u8, &u8) -> bool>(&mut self, pat: &str, eq: F) -> Option<bool> {
        let mut buffers_exhausted = 0;
        let mut consumed_from_last = 0;
        if self.buffers.front().is_none() {
@@ -127,7 +127,7 @@
            }
            let ref buf = self.buffers[buffers_exhausted];
 
-            if !buf.as_bytes()[consumed_from_last].eq_ignore_ascii_case(&pattern_byte) {
+            if !eq(&buf.as_bytes()[consumed_from_last], &pattern_byte) {
                return Some(false)
            }
 
@@ -155,6 +155,7 @@
 #[cfg(test)]
 #[allow(non_snake_case)]
 mod test {
+    use std::ascii::AsciiExt;
     use tendril::{StrTendril, SliceExt};
     use super::{BufferQueue, FromSet, NotFromSet};
 
@@ -209,9 +210,9 @@ mod test {
         let mut bq = BufferQueue::new();
         bq.push_back("a".to_tendril());
         bq.push_back("bc".to_tendril());
-        assert_eq!(bq.eat("abcd"), None);
-        assert_eq!(bq.eat("ax"), Some(false));
-        assert_eq!(bq.eat("ab"), Some(true));
+        assert_eq!(bq.eat("abcd", u8::eq_ignore_ascii_case), None);
+        assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false));
+        assert_eq!(bq.eat("ab", u8::eq_ignore_ascii_case), Some(true));
         assert_eq!(bq.next(), Some('c'));
         assert_eq!(bq.next(), None);
     }
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 1ac6fc13..3ec11571 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -298,8 +298,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
     //
     // NB: this doesn't do input stream preprocessing or set the current input
     // character.
-    fn eat(&mut self, pat: &str) -> Option<bool> {
-        match self.input_buffers.eat(pat) {
+    fn eat(&mut self, pat: &str, eq: fn(&u8, &u8) -> bool) -> Option<bool> {
+        match self.input_buffers.eat(pat, eq) {
             None if self.at_eof => Some(false),
             r => r,
         }
@@ -618,7 +618,11 @@ macro_rules! pop_except_from ( ($me:expr, $set:expr) => (
 ));
 
 macro_rules! eat ( ($me:expr, $pat:expr) => (
-    unwrap_or_return!($me.eat($pat), false)
+    unwrap_or_return!($me.eat($pat, u8::eq_ignore_ascii_case), false)
+));
+
+macro_rules! eat_exact ( ($me:expr, $pat:expr) => (
+    unwrap_or_return!($me.eat($pat, u8::eq), false)
 ));
 
 impl<Sink: TokenSink> Tokenizer<Sink> {
@@ -1144,7 +1148,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
 
             //§ markup-declaration-open-state
             states::MarkupDeclarationOpen => loop {
-                if eat!(self, "--") {
+                if eat_exact!(self, "--") {
                     go!(self: clear_comment; to CommentStart);
                 } else if eat!(self, "doctype") {
                     go!(self: to Doctype);

From 435c9fbfce38bf114993da62f4ce6515545b391d Mon Sep 17 00:00:00 2001
From: Anthony Ramine
Date: Wed, 19 Aug 2015 01:57:59 +0200
Subject: [PATCH 2/2] Implement tokenization of CDATA sections (fixes #11)

---
 data/test/ignore           | 27 ---------------------------
 src/tokenizer/interface.rs |  8 ++++++++
 src/tokenizer/mod.rs       | 21 ++++++++++++++++-----
 src/tree_builder/mod.rs    |  5 +++++
 4 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/data/test/ignore b/data/test/ignore
index 5f84915c..6e828667 100644
--- a/data/test/ignore
+++ b/data/test/ignore
@@ -51,12 +51,8 @@ tb: foreign-fragment.dat-40
 tb: foreign-fragment.dat-41
 tb: foreign-fragment.dat-47
 tb: foreign-fragment.dat-48
-tb: domjs-unsafe.dat-0
-tb: domjs-unsafe.dat-1
-tb: domjs-unsafe.dat-2
 tb: domjs-unsafe.dat-46
 tb: domjs-unsafe.dat-47
-tb: plain-text-unsafe.dat-10
 tb: plain-text-unsafe.dat-13
 tb: plain-text-unsafe.dat-26
 tb: plain-text-unsafe.dat-27
@@ -80,26 +76,3 @@ tb: tests20.dat-34
 tb: tests20.dat-35
 tb: tests20.dat-36
 tb: tests20.dat-37
-tb: tests21.dat-0
-tb: tests21.dat-1
-tb: tests21.dat-10
-tb: tests21.dat-11
-tb: tests21.dat-12
-tb: tests21.dat-13
-tb: tests21.dat-14
-tb: tests21.dat-16
-tb: tests21.dat-17
-tb: tests21.dat-18
-tb: tests21.dat-19
-tb: tests21.dat-20
-tb: tests21.dat-21
-tb: tests21.dat-22
-tb: tests21.dat-23
-tb: tests21.dat-24
-tb: tests21.dat-3
-tb: tests21.dat-4
-tb: tests21.dat-5
-tb: tests21.dat-6
-tb: tests21.dat-7
-tb: tests21.dat-8
-tb: tests21.dat-9
diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs
index 8264ab4e..d552b28d 100644
--- a/src/tokenizer/interface.rs
+++ b/src/tokenizer/interface.rs
@@ -102,6 +102,14 @@ pub trait TokenSink {
     /// Process a token.
     fn process_token(&mut self, token: Token);
 
+    /// Used in the markup declaration open state. By default, this always
+    /// returns false and thus all CDATA sections are tokenized as bogus
+    /// comments.
+    /// https://html.spec.whatwg.org/multipage/#markup-declaration-open-state
+    fn adjusted_current_node_present_but_not_in_html_namespace(&self) -> bool {
+        false
+    }
+
     /// The tokenizer will call this after emitting any tag.
     /// This allows the tree builder to change the tokenizer's state.
     /// By default no state changes occur.
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 3ec11571..f8c54f79 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -1153,15 +1153,26 @@
                 } else if eat!(self, "doctype") {
                     go!(self: to Doctype);
                 } else {
-                    // FIXME: CDATA, requires "adjusted current node" from tree builder
-                    // FIXME: 'error' gives wrong message
+                    if self.sink.adjusted_current_node_present_but_not_in_html_namespace() {
+                        if eat_exact!(self, "[CDATA[") {
+                            go!(self: clear_temp; to CdataSection);
+                        }
+                    }
                     go!(self: error; to BogusComment);
                 }
             },
 
             //§ cdata-section-state
-            states::CdataSection
-                => panic!("FIXME: state {:?} not implemented", self.state),
+            states::CdataSection => loop {
+                if eat_exact!(self, "]]>") {
+                    go!(self: emit_temp; to Data);
+                } else {
+                    match get_char!(self) {
+                        '\0' => go!(self: emit_temp; emit '\0'),
+                        c => go!(self: push_temp c)
+                    }
+                }
+            }
             //§ END
         }
     }
@@ -1316,7 +1327,7 @@
                 => go!(self: error; to BogusComment),
 
             states::CdataSection
-                => panic!("FIXME: state {:?} not implemented in EOF", self.state),
+                => go!(self: emit_temp; to Data),
         }
     }
 
diff --git a/src/tree_builder/mod.rs b/src/tree_builder/mod.rs
index 060774a0..56a11c79 100644
--- a/src/tree_builder/mod.rs
+++ b/src/tree_builder/mod.rs
@@ -428,6 +428,11 @@ impl TokenSink
         self.process_to_completion(token);
     }
 
+    fn adjusted_current_node_present_but_not_in_html_namespace(&self) -> bool {
+        !self.open_elems.is_empty() &&
+            self.sink.elem_name(self.adjusted_current_node()).ns != ns!(HTML)
+    }
+
     fn query_state_change(&mut self) -> Option<states::State> {
         self.next_tokenizer_state.take()
     }
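
Note (not part of the patches): a minimal, standalone sketch of the comparator-parameterized lookahead that eat!() and eat_exact!() now share. lookahead_matches is a hypothetical stand-in for BufferQueue::eat; it ignores buffering and the not-enough-input (None) case and only illustrates how passing u8::eq_ignore_ascii_case versus u8::eq switches between case-insensitive matching (used for "doctype") and exact matching (used for "--", "[CDATA[" and "]]>").

// Standalone sketch; `lookahead_matches` is hypothetical, not html5ever API.
fn lookahead_matches<F: Fn(&u8, &u8) -> bool>(input: &str, pat: &str, eq: F) -> bool {
    // Compare the pattern byte-by-byte against the start of the input,
    // using whichever byte-equality predicate the caller supplies.
    input.len() >= pat.len() &&
        input.bytes().zip(pat.bytes()).all(|(i, p)| eq(&i, &p))
}

fn main() {
    // eat!-style matching: ASCII case-insensitive, as used for "doctype".
    assert!(lookahead_matches("DocTYPE html>", "doctype", u8::eq_ignore_ascii_case));
    // eat_exact!-style matching: byte-for-byte, as used for "--" and "[CDATA[".
    assert!(lookahead_matches("[CDATA[x]]>", "[CDATA[", u8::eq));
    assert!(!lookahead_matches("[cdata[x]]>", "[CDATA[", u8::eq));
}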