From 7bd9e8bb28a3f16c23ee287b86fdd99590ae103d Mon Sep 17 00:00:00 2001 From: Daniel Fath Date: Tue, 24 Mar 2015 22:52:35 +0100 Subject: [PATCH 1/6] Change elem_name in interface to borrow Changed elem_name in TreeSink to borrow instead of move. Rest of changes are caused by it. No change in behaviour detected. Change is prerequisite for XML5 parser; plus it avoids clones in a few places. --- dom_sink/src/owned_dom.rs | 2 +- dom_sink/src/rcdom.rs | 2 +- examples/noop-tree-builder.rs | 4 ++-- examples/print-tree-actions.rs | 4 ++-- src/tree_builder/actions.rs | 20 ++++++++++---------- src/tree_builder/interface.rs | 2 +- src/tree_builder/mod.rs | 8 ++++---- src/tree_builder/rules.rs | 4 ++-- 8 files changed, 23 insertions(+), 23 deletions(-) diff --git a/dom_sink/src/owned_dom.rs b/dom_sink/src/owned_dom.rs index 9b478165..737fdd4b 100644 --- a/dom_sink/src/owned_dom.rs +++ b/dom_sink/src/owned_dom.rs @@ -197,7 +197,7 @@ impl TreeSink for Sink { x == y } - fn elem_name(&self, target: Handle) -> QualName { + fn elem_name(&self, target: &Handle) -> QualName { match target.node { Element(ref name, _) => name.clone(), _ => panic!("not an element!"), diff --git a/dom_sink/src/rcdom.rs b/dom_sink/src/rcdom.rs index a09b2a88..5f0b56fa 100644 --- a/dom_sink/src/rcdom.rs +++ b/dom_sink/src/rcdom.rs @@ -147,7 +147,7 @@ impl TreeSink for RcDom { same_node(&x, &y) } - fn elem_name(&self, target: Handle) -> QualName { + fn elem_name(&self, target: &Handle) -> QualName { // FIXME: rust-lang/rust#22252 return match target.borrow().node { Element(ref name, _) => name.clone(), diff --git a/examples/noop-tree-builder.rs b/examples/noop-tree-builder.rs index ce1306b8..cb0d35c2 100644 --- a/examples/noop-tree-builder.rs +++ b/examples/noop-tree-builder.rs @@ -46,8 +46,8 @@ impl TreeSink for Sink { x == y } - fn elem_name(&self, target: usize) -> QualName { - self.names.get(&target).expect("not an element").clone() + fn elem_name(&self, target: &usize) -> QualName { + 
self.names.get(target).expect("not an element").clone() } fn create_element(&mut self, name: QualName, _attrs: Vec) -> usize { diff --git a/examples/print-tree-actions.rs b/examples/print-tree-actions.rs index 0da21099..26958f0a 100644 --- a/examples/print-tree-actions.rs +++ b/examples/print-tree-actions.rs @@ -56,8 +56,8 @@ impl TreeSink for Sink { x == y } - fn elem_name(&self, target: usize) -> QualName { - self.names.get(&target).expect("not an element").clone() + fn elem_name(&self, target: &usize) -> QualName { + self.names.get(target).expect("not an element").clone() } fn create_element(&mut self, name: QualName, _attrs: Vec) -> usize { diff --git a/src/tree_builder/actions.rs b/src/tree_builder/actions.rs index 25bc99ca..6ecc2367 100644 --- a/src/tree_builder/actions.rs +++ b/src/tree_builder/actions.rs @@ -202,7 +202,7 @@ impl TreeBuilderActions fn current_node_in(&self, set: TagSet) -> bool where TagSet: Fn(QualName) -> bool { - set(self.sink.elem_name(self.current_node())) + set(self.sink.elem_name(&self.current_node())) } // Insert at the "appropriate place for inserting a node". 
@@ -522,7 +522,7 @@ impl TreeBuilderActions thead tr body html); for elem in self.open_elems.iter() { - let name = self.sink.elem_name(elem.clone()); + let name = self.sink.elem_name(&elem); if !body_end_ok(name.clone()) { self.sink.parse_error(format_if!(self.opts.exact_errors, "Unexpected open tag at end of body", @@ -541,7 +541,7 @@ impl TreeBuilderActions if pred(node.clone()) { return true; } - if scope(self.sink.elem_name(node.clone())) { + if scope(self.sink.elem_name(&node)) { return false; } } @@ -554,11 +554,11 @@ impl TreeBuilderActions fn elem_in(&self, elem: Handle, set: TagSet) -> bool where TagSet: Fn(QualName) -> bool { - set(self.sink.elem_name(elem)) + set(self.sink.elem_name(&elem)) } fn html_elem_named(&self, elem: Handle, name: Atom) -> bool { - self.sink.elem_name(elem) == QualName::new(ns!(HTML), name) + self.sink.elem_name(&elem) == QualName::new(ns!(HTML), name) } fn current_node_named(&self, name: Atom) -> bool { @@ -578,7 +578,7 @@ impl TreeBuilderActions { loop { let elem = unwrap_or_return!(self.open_elems.last(), ()).clone(); - let nsname = self.sink.elem_name(elem); + let nsname = self.sink.elem_name(&elem); if !set(nsname) { return; } self.pop(); } @@ -614,7 +614,7 @@ impl TreeBuilderActions n += 1; match self.open_elems.pop() { None => break, - Some(elem) => if pred(self.sink.elem_name(elem)) { break; }, + Some(ref elem) => if pred(self.sink.elem_name(elem)) { break; }, } } n @@ -684,7 +684,7 @@ impl TreeBuilderActions if let (true, Some(ctx)) = (last, self.context_elem.as_ref()) { node = ctx; } - let name = match self.sink.elem_name(node.clone()) { + let name = match self.sink.elem_name(&node) { QualName { ns: ns!(HTML), local } => local, _ => continue, }; @@ -871,7 +871,7 @@ impl TreeBuilderActions return false; } - let name = self.sink.elem_name(self.adjusted_current_node()); + let name = self.sink.elem_name(&self.adjusted_current_node()); if let ns!(HTML) = name.ns { return false; } @@ -1064,7 +1064,7 @@ impl TreeBuilderActions 
} fn foreign_start_tag(&mut self, mut tag: Tag) -> ProcessResult { - let cur = self.sink.elem_name(self.adjusted_current_node()); + let cur = self.sink.elem_name(&self.adjusted_current_node()); match cur.ns { ns!(MathML) => self.adjust_mathml_attributes(&mut tag), ns!(SVG) => { diff --git a/src/tree_builder/interface.rs b/src/tree_builder/interface.rs index a6086af5..c3fdd82e 100644 --- a/src/tree_builder/interface.rs +++ b/src/tree_builder/interface.rs @@ -65,7 +65,7 @@ pub trait TreeSink { /// /// Should never be called on a non-element node; /// feel free to `panic!`. - fn elem_name(&self, target: Self::Handle) -> QualName; + fn elem_name(&self, target: &Self::Handle) -> QualName; /// Set the document's quirks mode. fn set_quirks_mode(&mut self, mode: QuirksMode); diff --git a/src/tree_builder/mod.rs b/src/tree_builder/mod.rs index 3d9e88b5..0f910880 100644 --- a/src/tree_builder/mod.rs +++ b/src/tree_builder/mod.rs @@ -185,7 +185,7 @@ impl TreeBuilder opts: TreeBuilderOpts) -> TreeBuilder { let doc_handle = sink.get_document(); let context_is_template = - sink.elem_name(context_elem.clone()) == qualname!(HTML, template); + sink.elem_name(&context_elem) == qualname!(HTML, template); let mut tb = TreeBuilder { opts: opts, sink: sink, @@ -221,7 +221,7 @@ impl TreeBuilder // Step 4. 
Set the state of the HTML parser's tokenization stage as follows: pub fn tokenizer_state_for_context_elem(&self) -> tok_state::State { let elem = self.context_elem.clone().expect("no context element"); - let name = match self.sink.elem_name(elem) { + let name = match self.sink.elem_name(&elem) { QualName { ns: ns!(HTML), local } => local, _ => return tok_state::Data }; @@ -282,7 +282,7 @@ impl TreeBuilder println!("dump_state on {}", label); print!(" open_elems:"); for node in self.open_elems.iter() { - let QualName { ns, local } = self.sink.elem_name(node.clone()); + let QualName { ns, local } = self.sink.elem_name(&node); match ns { ns!(HTML) => print!(" {}", &local[..]), _ => panic!(), @@ -294,7 +294,7 @@ impl TreeBuilder match entry { &Marker => print!(" Marker"), &Element(ref h, _) => { - let QualName { ns, local } = self.sink.elem_name(h.clone()); + let QualName { ns, local } = self.sink.elem_name(&h); match ns { ns!(HTML) => print!(" {}", &local[..]), _ => panic!(), diff --git a/src/tree_builder/rules.rs b/src/tree_builder/rules.rs index ce01bfab..10ee1f48 100644 --- a/src/tree_builder/rules.rs +++ b/src/tree_builder/rules.rs @@ -395,7 +395,7 @@ impl TreeBuilderStep let mut to_close = None; for node in self.open_elems.iter().rev() { - let name = self.sink.elem_name(node.clone()); + let name = self.sink.elem_name(&node); if can_close(name.clone()) { to_close = Some(name.local); break; @@ -1360,7 +1360,7 @@ impl TreeBuilderStep } let node = self.open_elems[stack_idx].clone(); - let node_name = self.sink.elem_name(node); + let node_name = self.sink.elem_name(&node); if !first && node_name.ns == ns!(HTML) { let mode = self.mode; return self.step(mode, TagToken(tag)); From bc67caeeff880d5aa4ba996baaa8621d7f8b7f0c Mon Sep 17 00:00:00 2001 From: Daniel Fath Date: Mon, 13 Apr 2015 15:30:31 +0200 Subject: [PATCH 2/6] Add XML5 parsing to html5ever Adds XML5 parser based on [spec](https://github.com/annevk/xml5) by . 
Currently working draft resides on https://github.com/Ygg01/xml5_draft and is rendered using [Bikeshed](https://github.com/tabatkins/bikeshed). This patch is only concerned with making it work right, according to spec. Things to be done: 1. Add some support for doctype as suggested by annevk here: Ygg01/xml5_draft#2 2. Finish references in xml5 - basically, use all entity replacements html5 uses and add test for those. 3. Add Namespace support. 4. Unify two parsers using associated types. 5. Add C API for xml5 parser. --- src/driver.rs | 70 ++ src/lib.rs | 1 + src/tokenizer/char_ref/mod.rs | 267 +++++++ src/tokenizer/interface.rs | 72 ++ src/tokenizer/mod.rs | 919 +++++++++++++++++++++- src/tokenizer/states.rs | 40 + src/tree_builder/actions.rs | 162 ++++ src/tree_builder/mod.rs | 146 ++++ src/tree_builder/rules.rs | 137 +++- src/tree_builder/types.rs | 27 + src/util/str.rs | 17 + xml5lib-tests/AUTHORS.rst | 9 + xml5lib-tests/LICENSE | 21 + xml5lib-tests/tokenizer/README.md | 104 +++ xml5lib-tests/tokenizer/eof.test | 113 +++ xml5lib-tests/tokenizer/test1.test | 149 ++++ xml5lib-tests/tokenizer/test2.test | 64 ++ xml5lib-tests/tree-construction/README.md | 104 +++ xml5lib-tests/tree-construction/test1.dat | 102 +++ 19 files changed, 2499 insertions(+), 25 deletions(-) create mode 100644 xml5lib-tests/AUTHORS.rst create mode 100644 xml5lib-tests/LICENSE create mode 100644 xml5lib-tests/tokenizer/README.md create mode 100644 xml5lib-tests/tokenizer/eof.test create mode 100644 xml5lib-tests/tokenizer/test1.test create mode 100644 xml5lib-tests/tokenizer/test2.test create mode 100644 xml5lib-tests/tree-construction/README.md create mode 100644 xml5lib-tests/tree-construction/test1.dat diff --git a/src/driver.rs b/src/driver.rs index 6fb359de..354509be 100644 --- a/src/driver.rs +++ b/src/driver.rs @@ -15,6 +15,11 @@ use tree_builder::{TreeBuilderOpts, TreeBuilder, TreeSink}; use std::option; use std::default::Default; +use tokenizer::{XmlTokenizerOpts, XmlTokenizer, 
XTokenSink}; +use tree_builder::{ XmlTreeBuilder}; + +use collections::string::String; + use string_cache::{Atom, QualName}; /// Convenience function to turn a single `String` into an iterator. @@ -46,6 +51,30 @@ pub fn tokenize_to< tok.unwrap() } +/// Tokenize and send results to a `XTokenSink`. +/// +/// ## Example +/// +/// ```ignore +/// let mut sink = MySink; +/// tokenize_xml_to(&mut sink, one_input(my_str), Default::default()); +/// ``` +pub fn tokenize_xml_to< + Sink: XTokenSink, + It: Iterator + >( + sink: Sink, + input: It, + opts: XmlTokenizerOpts) -> Sink { + + let mut tok = XmlTokenizer::new(sink, opts); + for s in input { + tok.feed(s); + } + tok.end(); + tok.unwrap() +} + /// All-encompassing options struct for the parser. #[derive(Clone, Default)] pub struct ParseOpts { @@ -81,6 +110,31 @@ pub fn parse_to< tok.unwrap().unwrap() } +/// Parse and send results to a `TreeSink`. +/// +/// ## Example +/// +/// ```ignore +/// let mut sink = MySink; +/// parse_xml_to(&mut sink, one_input(my_str), Default::default()); +/// ``` +pub fn parse_xml_to< + Sink:TreeSink, + It: Iterator + >( + sink: Sink, + input: It, + opts: XmlTokenizerOpts) -> Sink { + + let tb = XmlTreeBuilder::new(sink); + let mut tok = XmlTokenizer::new(tb, opts); + for s in input { + tok.feed(s); + } + tok.end(); + tok.unwrap().unwrap() +} + /// Parse an HTML fragment and send results to a `TreeSink`. /// /// ## Example @@ -137,6 +191,22 @@ pub fn parse(input: It, opts: ParseOpts) -> Output ParseResult::get_result(sink) } +/// Parse into a type which implements `ParseResult`. +/// +/// ## Example +/// +/// ```ignore +/// let dom: RcDom = parse_xml(one_input(my_str), Default::default()); +/// ``` +pub fn parse_xml(input: It, opts: XmlTokenizerOpts) -> Output + where Output: ParseResult, + It: Iterator, +{ + let sink = parse_xml_to(Default::default(), input, opts); + ParseResult::get_result(sink) +} + + /// Parse an HTML fragment into a type which implements `ParseResult`. 
/// /// ## Example diff --git a/src/lib.rs b/src/lib.rs index a7ac0f21..5d22a4d0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -33,6 +33,7 @@ extern crate time; pub use tokenizer::Attribute; pub use driver::{one_input, ParseOpts, parse_to, parse_fragment_to, parse, parse_fragment}; +pub use driver::{parse_xml, parse_xml_to, tokenize_xml_to}; pub use serialize::serialize; diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs index 5141de4f..85a34b95 100644 --- a/src/tokenizer/char_ref/mod.rs +++ b/src/tokenizer/char_ref/mod.rs @@ -10,6 +10,7 @@ use super::{Tokenizer, TokenSink}; use util::str::{is_ascii_alnum, empty_str}; +use util::str::{is_xml_namechar}; use std::char::from_u32; use std::borrow::Cow::Borrowed; @@ -17,6 +18,12 @@ use std::borrow::Cow::Borrowed; pub use self::Status::*; use self::State::*; +use super::{XTokenSink, XmlTokenizer}; +pub use self::XRef::*; +use self::XState::*; + +use collections::string::ToString; + mod data; //§ tokenizing-character-references @@ -397,3 +404,263 @@ impl CharRefTokenizer { } } } +#[derive(Debug)] +enum XState { + XBegin, + XNumeric(u32), + XNumericSemicolon, + XReference, + XOctothorpe, +} + +pub enum XRef { + NamedXRef(String), + CharXData(String), + NoReturn, +} + +pub struct XCharRefTokenizer { + state: XState, + result: Option, + hex_marker: Option, + + num: u32, + num_too_big: bool, + + seen_digit: bool, + name_buf_opt: Option, +} +impl XCharRefTokenizer { + + // NB: We assume that we have an additional allowed character iff we're + // tokenizing in an attribute value. + pub fn new() -> XCharRefTokenizer { + XCharRefTokenizer { + state: XState::XBegin, + result: None, + hex_marker: None, + num: 0, + num_too_big: false, + seen_digit: false, + name_buf_opt: None, + } + } + + // A CharRefTokenizer can only tokenize one character reference, + // so this method consumes the tokenizer. 
+ pub fn get_result(self) -> XRef { + self.result.expect("get_result called before done") + } + + fn name_buf_mut<'t>(&'t mut self) -> &'t mut String { + self.name_buf_opt.as_mut() + .expect("name_buf missing in named character reference") + } + + fn name_buf<'t>(&'t self) -> &'t String { + self.name_buf_opt.as_ref() + .expect("name_buf missing in named character reference") + } +} + + + +impl XCharRefTokenizer { + + pub fn step( + &mut self, + tokenizer: &mut XmlTokenizer + ) -> Status { + + if self.result.is_some() { + return Done; + } + + h5e_debug!("Xml char ref tokenizer stepping in state {:?}", self.state); + match self.state { + XBegin => self.do_begin(tokenizer), + XNumeric(base) => self.do_numeric(tokenizer, base), + XNumericSemicolon => self.do_numeric_semicolon(tokenizer), + XReference => self.do_reference(tokenizer), + XOctothorpe => self.do_octothorpe(tokenizer), + } + } + + fn do_begin(&mut self, + tokenizer: &mut XmlTokenizer) -> Status { + match unwrap_or_return!(tokenizer.peek(), Stuck) { + '\t' | '\n' | '\x0C' | ' ' | '<' | '&' | '%' + => self.finish_none(), + + '#' => { + tokenizer.discard_char(); + self.state = XState::XOctothorpe; + Progress + } + + _ => { + self.state = XState::XReference; + self.name_buf_opt = Some(empty_str()); + Progress + } + } + } + + fn do_octothorpe(&mut self, + tokenizer: &mut XmlTokenizer) -> Status { + let c = unwrap_or_return!(tokenizer.peek(), Stuck); + match c { + 'x' | 'X' => { + tokenizer.discard_char(); + self.hex_marker = Some(c); + self.state = XNumeric(16); + } + + _ => { + self.hex_marker = None; + self.state = XNumeric(10); + } + } + Progress + } + + + fn do_numeric(&mut self, + tokenizer: &mut XmlTokenizer, base: u32) -> Status { + let c = unwrap_or_return!(tokenizer.peek(), Stuck); + match c.to_digit(base as u32) { + Some(n) => { + tokenizer.discard_char(); + self.num = self.num.wrapping_mul(base); + if self.num > 0x10FFFF { + // The character is definitely invalid; wrapping arithmetic keeps later digits from overflowing.
+ // We still parse digits and semicolon, but don't use the result. + self.num_too_big = true; + } + self.num = self.num.wrapping_add(n as u32); + self.seen_digit = true; + Progress + } + + None if !self.seen_digit => self.unconsume_numeric(tokenizer), + + None => { + self.state = XNumericSemicolon; + Progress + } + } + } + + fn do_numeric_semicolon(&mut self, + tokenizer: &mut XmlTokenizer) -> Status { + match unwrap_or_return!(tokenizer.peek(), Stuck) { + ';' => tokenizer.discard_char(), + _ => tokenizer.emit_error(Borrowed("Semicolon missing after numeric character reference")), + }; + self.finish_numeric(tokenizer) + } + + fn do_reference(&mut self, + tokenizer: &mut XmlTokenizer) -> Status { + let c = unwrap_or_return!(tokenizer.get_char(), Stuck); + if is_xml_namechar(&c) { + self.name_buf_mut().push(c); + Progress + } else if c == ';' { + self.finish_reference(tokenizer) + } else { + tokenizer.unconsume(c.to_string()); + let temp = self.name_buf().clone(); + self.finish_text(temp) + } + + } + + pub fn end_of_file(&mut self, + tokenizer: &mut XmlTokenizer) { + + + while self.result.is_none() { + match self.state { + XBegin => { self.finish_none(); }, + + XNumeric(_) if !self.seen_digit + => { self.unconsume_numeric(tokenizer); }, + + XNumeric(_) | XState::XNumericSemicolon => { + tokenizer.emit_error(Borrowed("EOF in numeric character reference")); + self.finish_numeric(tokenizer); + }, + + XReference => { + tokenizer.emit_error(Borrowed("EOF in reference")); + self.finish_reference(tokenizer); + }, + + XOctothorpe => { + tokenizer.emit_error(Borrowed("EOF after '#' in character reference")); + self.finish_text("#".to_string()); + }, + } + } + } + + fn finish_none(&mut self) -> Status { + self.result = Some(NoReturn); + Done + } + + fn finish_text(&mut self, text: String) -> Status { + self.result = Some(CharXData(text)); + Done + } + + fn finish_reference(&mut self, + tokenizer: &mut XmlTokenizer) -> Status { + + use core::mem::replace; + + match self.name_buf_opt { + Some(ref 
mut c) if c.len() > 0 => { + self.result = Some(NamedXRef(replace(c, String::new()))); + }, + _ => { + tokenizer.emit_error(Borrowed("empty reference")); + self.result = Some(NoReturn); + } + }; + Done + } + + fn finish_numeric(&mut self, tokenizer: &mut XmlTokenizer) -> Status { + fn conv(n: u32) -> char { + from_u32(n).expect("invalid char missed by error handling cases") + } + + let (c, error) = match self.num { + n if (n > 0x10FFFF) || self.num_too_big || (0xD800 <= n && n <= 0xDFFF) => ('\u{fffd}', true), + n => (conv(n), false), + }; + + if error { + let msg = format_if!(tokenizer.opts.exact_errors, + "Invalid numeric character reference", + "Invalid numeric character reference value 0x{:06X}", self.num); + tokenizer.emit_error(msg); + } + self.result = Some(CharXData(c.to_string())); + Done + } + + fn unconsume_numeric(&mut self, tokenizer: &mut XmlTokenizer) -> Status { + let mut unconsume = String::from_str("#"); + match self.hex_marker { + Some(c) => unconsume.push(c), + None => (), + } + + + tokenizer.emit_error(Borrowed("Numeric character reference without digits")); + self.finish_text(unconsume) + } +} diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs index 28123a67..bfd26911 100644 --- a/src/tokenizer/interface.rs +++ b/src/tokenizer/interface.rs @@ -17,6 +17,11 @@ pub use self::TagKind::{StartTag, EndTag}; pub use self::Token::{DoctypeToken, TagToken, CommentToken, CharacterTokens}; pub use self::Token::{NullCharacterToken, EOFToken, ParseError}; +pub use self::XTagKind::{StartXTag, EndXTag, EmptyXTag, ShortXTag}; +pub use self::XToken::{DoctypeXToken, XTagToken, PIToken, CommentXToken}; +pub use self::XToken::{CharacterXTokens, EOFXToken, XParseError, NullCharacterXToken}; + + /// A `DOCTYPE` token.
// FIXME: already exists in Servo DOM #[derive(PartialEq, Eq, Clone, Debug)] @@ -108,3 +113,70 @@ pub trait TokenSink { None } } + + +#[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)] +pub enum XTagKind { + StartXTag, + EndXTag, + EmptyXTag, + ShortXTag, +} + +/// XML 5 Tag Token +// FIXME: Possibly unify with Tag? +#[derive(PartialEq, Eq, Debug, Clone)] +pub struct XTag { + pub kind: XTagKind, + pub name: Atom, + pub attrs: Vec +} + +impl XTag { + pub fn equiv_modulo_attr_order(&self, other: &XTag) -> bool { + if (self.kind != other.kind) || (self.name != other.name) { + return false; + } + + let mut self_attrs = self.attrs.clone(); + let mut other_attrs = other.attrs.clone(); + self_attrs.sort(); + other_attrs.sort(); + + self_attrs == other_attrs + } +} + +// FIXME: rust-lang/rust#22629 +unsafe impl Send for XToken { } + +#[derive(PartialEq, Eq, Clone, Debug)] +pub struct XPi { + pub target: String, + pub data: String, +} + +#[derive(PartialEq, Eq, Debug)] +pub enum XToken { + DoctypeXToken(Doctype), + XTagToken(XTag), + PIToken(XPi), + CommentXToken(String), + CharacterXTokens(String), + EOFXToken, + NullCharacterXToken, + XParseError(Cow<'static, str>), +} + +/// Types which can receive tokens from the tokenizer. +pub trait XTokenSink { + /// Process a token. + fn process_token(&mut self, token: XToken); + + /// The tokenizer will call this after emitting any start tag. + /// This allows the tree builder to change the tokenizer's state. + /// By default no state changes occur. 
+ fn query_state_change(&mut self) -> Option { + None + } +} diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 476db92e..01fe9844 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -22,7 +22,18 @@ use self::states::{Escaped, DoubleEscaped}; use self::states::{Unquoted, SingleQuoted, DoubleQuoted}; use self::states::{DoctypeIdKind, Public, System}; +pub use self::interface::{StartXTag, EndXTag, EmptyXTag, ShortXTag}; +pub use self::interface::{DoctypeXToken, XTagToken, PIToken, CommentXToken}; +pub use self::interface::{CharacterXTokens, EOFXToken, NullCharacterXToken}; +pub use self::interface::{XTokenSink, XParseError, XTagKind, XToken, XTag}; + +pub use self::interface::XPi; + +use self::states::{XData, XTagState}; +use self::states::XmlState; + use self::char_ref::{CharRef, CharRefTokenizer}; +use self::char_ref::{XCharRefTokenizer, XRef}; use self::buffer_queue::{BufferQueue, SetResult, FromSet, NotFromSet}; @@ -470,34 +481,454 @@ impl Tokenizer { } } - fn emit_current_doctype(&mut self) { - let doctype = replace(&mut self.current_doctype, Doctype::new()); - self.process_token(DoctypeToken(doctype)); - } + fn emit_current_doctype(&mut self) { + let doctype = replace(&mut self.current_doctype, Doctype::new()); + self.process_token(DoctypeToken(doctype)); + } + + fn doctype_id<'a>(&'a mut self, kind: DoctypeIdKind) -> &'a mut Option { + match kind { + Public => &mut self.current_doctype.public_id, + System => &mut self.current_doctype.system_id, + } + } + + fn clear_doctype_id(&mut self, kind: DoctypeIdKind) { + let id = self.doctype_id(kind); + match *id { + Some(ref mut s) => s.truncate(0), + None => *id = Some(empty_str()), + } + } + + fn consume_char_ref(&mut self, addnl_allowed: Option) { + // NB: The char ref tokenizer assumes we have an additional allowed + // character iff we're tokenizing in an attribute value. 
+ self.char_ref_tokenizer = Some(box CharRefTokenizer::new(addnl_allowed)); + } + + fn emit_eof(&mut self) { + self.process_token(EOFToken); + } + + fn peek(&mut self) -> Option { + if self.reconsume { + Some(self.current_char) + } else { + self.input_buffers.peek() + } + } + + fn discard_char(&mut self) { + let c = self.get_char(); + assert!(c.is_some()); + } + + fn unconsume(&mut self, buf: String) { + self.input_buffers.push_front(buf); + } + + fn emit_error(&mut self, error: Cow<'static, str>) { + self.process_token(ParseError(error)); + } +} +//§ END + +/// Copy of Tokenizer options, with an impl for `Default`. +/// FIXME: Unite this with TokenizerOpt +#[derive(Copy, Clone)] +pub struct XmlTokenizerOpts { + /// Report all parse errors described in the spec, at some + /// performance penalty? Default: false + pub exact_errors: bool, + + /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning + /// of the stream? Default: true + pub discard_bom: bool, + + /// Keep a record of how long we spent in each state? Printed + /// when `end()` is called. Default: false + pub profile: bool, + + /// Initial state override. Only the test runner should use + /// a non-`None` value! + pub initial_state: Option, + + /// Mod determining if the entity expansion is allowed + /// TODO: Upgrade to a struct with more options. + pub safe_mod: bool, + +} + +impl Default for XmlTokenizerOpts { + fn default() -> XmlTokenizerOpts { + XmlTokenizerOpts { + exact_errors: false, + discard_bom: true, + profile: false, + initial_state: None, + safe_mod: true, + } + } +} + +/// The Xml tokenizer. +pub struct XmlTokenizer { + /// Options controlling the behavior of the tokenizer. + opts: XmlTokenizerOpts, + + /// Destination for tokens we emit. + sink: Sink, + + /// The abstract machine state as described in the spec. + state: states::XmlState, + + /// Input ready to be tokenized. 
+ input_buffers: BufferQueue, + + /// Are we at the end of the file, once buffers have been processed + /// completely? This affects whether we will wait for lookahead or not. + at_eof: bool, + + /// Tokenizer for character references, if we're tokenizing + /// one at the moment. + char_ref_tokenizer: Option>, + + /// Current input character. Just consumed, may reconsume. + current_char: char, + + /// Should we reconsume the current input character? + reconsume: bool, + + /// Did we just consume \r, translating it to \n? In that case we need + /// to ignore the next character if it's \n. + ignore_lf: bool, + + /// Discard a U+FEFF BYTE ORDER MARK if we see one? Only done at the + /// beginning of the stream. + discard_bom: bool, + + /// Current tag kind. + current_tag_kind: XTagKind, + + /// Current tag name. + current_tag_name: String, + + /// Current tag attributes. + current_tag_attrs: Vec, + + /// Current attribute name. + current_attr_name: String, + + /// Current attribute value. + current_attr_value: String, + + /// Current comment. + current_comment: String, + + /// Current processing instruction target. + current_pi_target: String, + + /// Current processing instruction value. + current_pi_data: String, + + /// Record of how many ns we spent in each state, if profiling is enabled. + state_profile: BTreeMap, + + /// Record of how many ns we spent in the token sink. + time_in_sink: u64, +} + +impl XmlTokenizer { + /// Create a new tokenizer which feeds tokens to a particular `TokenSink`. 
+ pub fn new(sink: Sink, opts: XmlTokenizerOpts) -> XmlTokenizer { + if opts.profile && cfg!(for_c) { + panic!("Can't profile tokenizer when built as a C library"); + } + + let state = *opts.initial_state.as_ref().unwrap_or(&states::XData); + let discard_bom = opts.discard_bom; + XmlTokenizer { + opts: opts, + sink: sink, + state: state, + char_ref_tokenizer: None, + input_buffers: BufferQueue::new(), + at_eof: false, + current_char: '\0', + reconsume: false, + ignore_lf: false, + discard_bom: discard_bom, + current_tag_kind: StartXTag, + current_tag_name: empty_str(), + current_tag_attrs: vec!(), + current_attr_name: empty_str(), + current_attr_value: empty_str(), + current_comment: empty_str(), + current_pi_data: empty_str(), + current_pi_target: empty_str(), + state_profile: BTreeMap::new(), + time_in_sink: 0, + } + } + pub fn unwrap(self) -> Sink { + self.sink + } + + pub fn sink<'a>(&'a self) -> &'a Sink { + &self.sink + } + + pub fn sink_mut<'a>(&'a mut self) -> &'a mut Sink { + &mut self.sink + } + + /// Feed an input string into the tokenizer, skipping a leading U+FEFF byte order mark when `discard_bom` is set. + pub fn feed(&mut self, input: String) { + if input.len() == 0 { + return; + } + + let pos = if self.discard_bom && input.char_at(0) == '\u{FEFF}' { + self.discard_bom = false; + 3 // length of BOM in UTF-8 + } else { + 0 + }; + + self.input_buffers.push_back(input, pos); + self.run(); + } + + fn process_token(&mut self, token: XToken) { + if self.opts.profile { + let (_, dt) = time!(self.sink.process_token(token)); + self.time_in_sink += dt; + } else { + self.sink.process_token(token); + } + } + + // Get the next input character, which might be the character + // 'c' that we already consumed from the buffers.
+ fn get_preprocessed_char(&mut self, mut c: char) -> Option { + if self.ignore_lf { + self.ignore_lf = false; + if c == '\n' { + c = unwrap_or_return!(self.input_buffers.next(), None); + } + } + + if c == '\r' { + self.ignore_lf = true; + c = '\n'; + } + + // Normalize \x00 into \uFFFD + if c == '\x00' { + c = '\u{FFFD}' + } + + h5e_debug!("got character {}", c); + self.current_char = c; + Some(c) + } + + fn bad_eof_error(&mut self) { + let msg = format_if!( + self.opts.exact_errors, + "Unexpected EOF", + "Saw EOF in state {:?}", self.state); + self.emit_error(msg); + } + + fn pop_except_from(&mut self, set: SmallCharSet) -> Option { + // Bail to the slow path for various corner cases. + // This means that `FromSet` can contain characters not in the set! + // It shouldn't matter because the fallback `FromSet` case should + // always do the same thing as the `NotFromSet` case. + if self.opts.exact_errors || self.reconsume || self.ignore_lf { + return self.get_char().map(|x| FromSet(x)); + } + + let d = self.input_buffers.pop_except_from(set); + h5e_debug!("got characters {:?}", d); + match d { + Some(FromSet(c)) => self.get_preprocessed_char(c).map(|x| FromSet(x)), + + // NB: We don't set self.current_char for a run of characters not + // in the set. It shouldn't matter for the codepaths that use + // this. + _ => d + } + } + + // Check if the next characters are an ASCII case-insensitive match. See + // BufferQueue::eat. + // + // NB: this doesn't do input stream preprocessing or set the current input + // character. + fn eat(&mut self, pat: &str) -> Option { + match self.input_buffers.eat(pat) { + None if self.at_eof => Some(false), + r => r, + } + } + + // Run the state machine for as long as we can. 
+ fn run(&mut self) { + if self.opts.profile { + loop { + let state = self.state; + let old_sink = self.time_in_sink; + let (run, mut dt) = time!(self.step()); + dt -= (self.time_in_sink - old_sink); + let new = match self.state_profile.get_mut(&state) { + Some(x) => { + *x += dt; + false + } + None => true, + }; + if new { + // do this here because of borrow shenanigans + self.state_profile.insert(state, dt); + } + if !run { break; } + } + } else { + while self.step() { + } + } + } + + //§ tokenization + // Get the next input character, if one is available. + fn get_char(&mut self) -> Option { + if self.reconsume { + self.reconsume = false; + Some(self.current_char) + } else { + self.input_buffers.next() + .and_then(|c| self.get_preprocessed_char(c)) + } + } + + fn bad_char_error(&mut self) { + let msg = format_if!( + self.opts.exact_errors, + "Bad character", + "Saw {} in state {:?}", self.current_char, self.state); + self.emit_error(msg); + } + + fn discard_tag(&mut self) { + self.current_tag_name = String::new(); + self.current_tag_attrs = vec!(); + } + + fn create_tag(&mut self, kind: XTagKind, c: char) { + self.discard_tag(); + self.current_tag_name.push(c); + self.current_tag_kind = kind; + } + + // This method creates a PI token and + // sets its target to given char + fn create_pi(&mut self, c: char) { + self.current_pi_target = String::new(); + self.current_pi_data = String::new(); + self.current_pi_target.push(c); + } + + fn emit_char(&mut self, c: char) { + self.process_token(match c { + '\0' => CharacterXTokens('\u{FFFD}'.to_string()), + c => CharacterXTokens(c.to_string()), + }); + } + + fn emit_short_tag(&mut self) { + self.current_tag_kind = ShortXTag; + self.current_tag_name = String::new(); + self.emit_current_tag(); + } + + fn emit_empty_tag(&mut self) { + self.current_tag_kind = EmptyXTag; + self.emit_current_tag(); + } + + fn set_empty_tag(&mut self) { + self.current_tag_kind = EmptyXTag; + } + + fn emit_start_tag(&mut self) { + 
self.current_tag_kind = StartXTag; + self.emit_current_tag(); + } + + fn emit_current_tag(&mut self) { + self.finish_attribute(); + + let name = replace(&mut self.current_tag_name, String::new()); + let name = Atom::from_slice(&name); + + match self.current_tag_kind { + StartXTag | EmptyXTag => {}, + EndXTag => { + if !self.current_tag_attrs.is_empty() { + self.emit_error(Borrowed("Attributes on an end tag")); + } + }, + ShortXTag => { + if !self.current_tag_attrs.is_empty() { + self.emit_error(Borrowed("Attributes on a short tag")); + } + }, + } + + let token = XTagToken(XTag { kind: self.current_tag_kind, + name: name, + attrs: replace(&mut self.current_tag_attrs, vec!()), + }); + self.process_token(token); + - fn doctype_id<'a>(&'a mut self, kind: DoctypeIdKind) -> &'a mut Option { - match kind { - Public => &mut self.current_doctype.public_id, - System => &mut self.current_doctype.system_id, + if self.current_tag_kind == StartXTag { + match self.sink.query_state_change() { + None => (), + Some(s) => self.state = s, + } } } - fn clear_doctype_id(&mut self, kind: DoctypeIdKind) { - let id = self.doctype_id(kind); - match *id { - Some(ref mut s) => s.truncate(0), - None => *id = Some(empty_str()), - } + // The string must not contain '\0'! + fn emit_chars(&mut self, b: String) { + self.process_token(CharacterXTokens(b)); } - fn consume_char_ref(&mut self, addnl_allowed: Option) { + // Emits the current Processing Instruction + fn emit_pi(&mut self) { + let token = PIToken(XPi { + target: replace(&mut self.current_pi_target, String::new()), + data: replace(&mut self.current_pi_data, String::new()), + }); + self.process_token(token); + } + + fn consume_char_ref(&mut self) { // NB: The char ref tokenizer assumes we have an additional allowed // character iff we're tokenizing in an attribute value. 
- self.char_ref_tokenizer = Some(box CharRefTokenizer::new(addnl_allowed)); + self.char_ref_tokenizer = Some(box XCharRefTokenizer::new()); } fn emit_eof(&mut self) { - self.process_token(EOFToken); + self.process_token(EOFXToken); + } + + fn emit_error(&mut self, error: Cow<'static, str>) { + self.process_token(XParseError(error)); } fn peek(&mut self) -> Option { @@ -516,13 +947,7 @@ impl Tokenizer { fn unconsume(&mut self, buf: String) { self.input_buffers.push_front(buf); } - - fn emit_error(&mut self, error: Cow<'static, str>) { - self.process_token(ParseError(error)); - } } -//§ END - // Shorthand for common state machine behaviors. macro_rules! shorthand ( ( $me:ident : emit $c:expr ) => ( $me.emit_char($c); ); @@ -548,6 +973,10 @@ macro_rules! shorthand ( ( $me:ident : emit_doctype ) => ( $me.emit_current_doctype(); ); ( $me:ident : error ) => ( $me.bad_char_error(); ); ( $me:ident : error_eof ) => ( $me.bad_eof_error(); ); + ( $me:ident : create_pi $c:expr ) => ( $me.create_pi($c); ); + ( $me:ident : push_pi_target $c:expr ) => ( $me.current_pi_target.push($c); ); + ( $me:ident : push_pi_data $c:expr ) => ( $me.current_pi_data.push($c); ); + ( $me:ident : set_empty_tag ) => ( $me.set_empty_tag(); ); ); // Tracing of tokenizer actions. This adds significant bloat and compile time, @@ -583,6 +1012,7 @@ macro_rules! go ( ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(None); return true; }); ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return true; }); + ( $me:ident : consume_xchar_ref ) => ({ $me.consume_char_ref(); return true; }); // We have a default next state after emitting a tag, but the sink can override. ( $me:ident : emit_tag $s:ident ) => ({ @@ -591,6 +1021,31 @@ macro_rules! 
go ( return true; }); + // Empty and short tags need special handling in XML + ( $me:ident : emit_short_tag $s:ident ) => ({ + $me.state = states::$s; + $me.emit_short_tag(); + return true; + }); + + ( $me:ident : emit_empty_tag $s:ident ) => ({ + $me.state = states::$s; + $me.emit_empty_tag(); + return true; + }); + + ( $me:ident : emit_start_tag $s:ident ) => ({ + $me.state = states::$s; + $me.emit_start_tag(); + return true; + }); + + ( $me:ident : emit_pi $s:ident ) => ({ + $me.state = states::$s; + $me.emit_pi(); + return true; + }); + ( $me:ident : eof ) => ({ $me.emit_eof(); return false; }); // If nothing else matched, it's a single command @@ -1318,6 +1773,424 @@ impl Tokenizer { } } +impl XmlTokenizer { + + // Run the state machine for a while. + // Return true if we should be immediately re-invoked + // (this just simplifies control flow vs. break / continue). + fn step(&mut self) -> bool { + if self.char_ref_tokenizer.is_some() { + return self.step_char_ref_tokenizer(); + } + + println!("processing in state {:?}", self.state); + match self.state { + //§ data-state + XmlState::XData => loop { + match pop_except_from!(self, small_char_set!('\r' '&' '<')) { + FromSet('&') => go!(self: consume_xchar_ref), + FromSet('<') => go!(self: to XTagState), + FromSet(c) => go!(self: emit c), + NotFromSet(b) => self.emit_chars(b), + } + }, + //§ tag-state + XmlState::XTagState => loop { match get_char!(self) { + '!' => go!(self: to MarkupDecl), + '/' => go!(self: to EndXTagState), + '?' 
=> go!(self: to Pi), + '\t'| '\n' | ' '| + ':' | '<' | '>' => go!(self: error; emit '<'; reconsume XData), + cl => go!(self: create_tag StartXTag cl; to XTagName), + } + }, + //§ end-tag-state + XmlState::EndXTagState => loop { match get_char!(self) { + '>' => go!(self: emit_short_tag XData), + '\t' | '\n' | ' '| + '<' | ':' => go!(self: error; emit '<'; emit '/'; reconsume XData), + cl => go!(self: create_tag EndXTag cl; to EndXTagName) + } + }, + //§ end-tag-name-state + XmlState::EndXTagName => loop { match get_char!(self) { + '\t' | '\n' + | ' ' => go!(self: to EndXTagNameAfter), + '/' => go!(self: error; to EndXTagNameAfter), + '>' => go!(self: emit_tag XData), + cl => go!(self: push_tag cl), + } + }, + //§ end-tag-name-after-state + XmlState::EndXTagNameAfter => loop {match get_char!(self) { + '>' => go!(self: emit_tag XData), + '\t' | '\n' + | ' ' => (), + _ => self.emit_error(Borrowed("Unexpected element in tag name")), + } + }, + //§ pi-state + XmlState::Pi => loop { match get_char!(self) { + '\t' | '\n' + | ' ' => go!(self: error; reconsume BogusXComment), + cl => go!(self: create_pi cl; to PiTarget), + } + }, + //§ pi-target-state + XmlState::PiTarget => loop { match get_char!(self) { + '\t' | '\n' + | ' ' => go!(self: to PiTargetAfter), + '?' => go!(self: to PiAfter), + cl => go!(self: push_pi_target cl), + } + }, + //§ pi-target-after-state + XmlState::PiTargetAfter => loop { match get_char!(self) { + '\t' | '\n' | ' ' => (), + _ => go!(self: reconsume PiData), + } + }, + //§ pi-data-state + XmlState::PiData => loop { match get_char!(self) { + '?' => go!(self: to PiAfter), + cl => go!(self: push_pi_data cl), + } + }, + //§ pi-after-state + XmlState::PiAfter => loop { match get_char!(self) { + '>' => go!(self: emit_pi XData), + '?' 
=> go!(self: to PiAfter), + cl => go!(self: push_pi_data cl), + } + }, + //§ markup-declaration-state + XmlState::MarkupDecl => loop { + if eat!(self, "--") { + go!(self: clear_comment; to XComment); + } else if eat!(self, "[CDATA[") { + go!(self: to Cdata); + } else if eat!(self, "DOCTYPE") { + go!(self: error; to XDoctype); + } else { + // FIXME: See with kmc for this! + // FIXME: CDATA, requires "adjusted current node" from tree builder + // FIXME: 'error' gives wrong message + go!(self: error; to BogusXComment); + } + }, + //§ comment-state + XmlState::XComment => loop { match get_char!(self) { + '-' => go!(self: to XCommentDash), + '>' => go!(self: error; emit_comment; to XData), + c => go!(self: push_comment c; to XComment), + } + }, + //§ comment-dash-state + XmlState::XCommentDash => loop { match get_char!(self) { + '-' => go!(self: to XCommentEnd), + c => go!(self: push_comment c), + } + }, + //§ comment-end-state + XmlState::XCommentEnd => loop { match get_char!(self) { + '>' => go!(self: emit_comment; to XData), + '-' => go!(self: push_comment '-'), + c => go!(self: append_comment "--"; push_comment c; to XComment), + } + }, + //§ cdata-state + XmlState::Cdata => loop { match get_char!(self) { + ']' => go!(self: to CdataBracket), + cl => go!(self: emit cl), + } + }, + //§ cdata-bracket-state + XmlState::CdataBracket => loop { match get_char!(self) { + ']' => go!(self: to CdataEnd), + cl => go!(self: emit ']'; emit cl), + } + }, + //§ cdata-end-state + XmlState::CdataEnd => loop { match get_char!(self) { + '>' => go!(self: to XData), + ']' => go!(self: emit ']'), + cl => go!(self: emit ']'; emit ']'; emit cl; to Cdata), + } + }, + //§ tag-name-state + XmlState::XTagName => loop { match get_char!(self) { + '\t' | '\n' + | ' ' => go!(self: to TagAttrNameBefore), + '>' => go!(self: emit_tag XData), + '/' => go!(self: set_empty_tag; to XTagEmpty), + cl => go!(self: push_tag cl), + } + }, + //§ empty-tag-state + XmlState::XTagEmpty => loop { match 
get_char!(self) { + '>' => go!(self: emit_empty_tag XData), + _ => go!(self: reconsume TagAttrValueBefore), + } + }, + //§ tag-attribute-name-before-state + XmlState::TagAttrNameBefore => loop { match get_char!(self) { + '\t' | '\n' + | ' ' => (), + '>' => go!(self: emit_tag XData), + '/' => go!(self: set_empty_tag; to XTagEmpty), + ':' => go!(self: error ), + cl => go!(self: create_attr cl; to TagAttrName), + } + }, + //§ tag-attribute-name-state + XmlState::TagAttrName => loop { match get_char!(self) { + '=' => go!(self: to TagAttrValueBefore), + '>' => go!(self: emit_tag XData), + '\t' | '\n' + | ' ' => go!(self: to TagAttrNameAfter), + '/' => go!(self: set_empty_tag; to XTagEmpty), + cl => go!(self: push_name cl), + } + }, + //§ tag-attribute-name-after-state + XmlState::TagAttrNameAfter => loop { match get_char!(self) { + '\t' | '\n' + | ' ' => (), + '=' => go!(self: to TagAttrValueBefore), + '>' => go!(self: emit_tag XData), + '/' => go!(self: set_empty_tag; to XTagEmpty), + cl => go!(self: create_attr cl; to TagAttrName), + } + }, + //§ tag-attribute-value-before-state + XmlState::TagAttrValueBefore => loop { match get_char!(self) { + '\t' | '\n' + | ' ' => (), + '"' => go!(self: to TagAttrValue(DoubleQuoted)), + '\'' => go!(self: to TagAttrValue(SingleQuoted)), + '&' => go!(self: reconsume TagAttrValue(Unquoted)), + '>' => go!(self: emit_tag XData), + cl => go!(self: push_value cl; to TagAttrValue(Unquoted)), + } + }, + //§ tag-attribute-value-double-quoted-state + XmlState::TagAttrValue(DoubleQuoted) => loop { + match pop_except_from!(self, small_char_set!('\n' '"' '&')) { + FromSet('"') => go!(self: to TagAttrNameBefore), + FromSet('&') => go!(self: consume_xchar_ref ), + FromSet(c) => go!(self: push_value c), + NotFromSet(b) => go!(self: append_value b), + } + }, + //§ tag-attribute-value-single-quoted-state + XmlState::TagAttrValue(SingleQuoted) => loop { + match pop_except_from!(self, small_char_set!('\n' '\'' '&')) { + FromSet('\'') => go!(self: to 
TagAttrNameBefore), + FromSet('&') => go!(self: consume_xchar_ref ), + FromSet(c) => go!(self: push_value c), + NotFromSet(b) => go!(self: append_value b), + } + }, + //§ tag-attribute-value-unquoted-state + XmlState::TagAttrValue(Unquoted) => loop { + match pop_except_from!(self, small_char_set!('\n' '\t' ' ' '&' '>')) { + FromSet('\t') | FromSet('\n') | FromSet(' ') + => go!(self: to TagAttrNameBefore), + FromSet('&') => go!(self: consume_xchar_ref ), + FromSet('>') => go!(self: emit_tag XData), + FromSet(c) => go!(self: push_value c), + NotFromSet(b) => go!(self: append_value b), + } + }, + //§ bogus-comment-state + XmlState::BogusXComment => loop { match get_char!(self) { + '>' => go!(self: emit_comment; to XData), + c => go!(self: push_comment c), + } + }, + XmlState::XDoctype => {false}, + } + } + + /// Indicate that we have reached the end of the input. + // FIXME: Copy pasta review carefully + pub fn end(&mut self) { + // Handle EOF in the char ref sub-tokenizer, if there is one. + // Do this first because it might un-consume stuff. + match self.char_ref_tokenizer.take() { + None => (), + Some(mut tok) => { + tok.end_of_file(self); + self.process_char_ref(tok.get_result()); + } + } + + // Process all remaining buffered input. + // If we're waiting for lookahead, we're not gonna get it. 
+ self.at_eof = true; + self.run(); + + while self.eof_step() { + // loop + } + + if self.opts.profile { + self.dump_profile(); + } + } + #[cfg(for_c)] + fn dump_profile(&self) { + unreachable!(); + } + + #[cfg(not(for_c))] + fn dump_profile(&self) { + let mut results: Vec<(states::XmlState, u64)> + = self.state_profile.iter().map(|(s, t)| (*s, *t)).collect(); + results.sort_by(|&(_, x), &(_, y)| y.cmp(&x)); + + let total: u64 = results.iter().map(|&(_, t)| t).sum(); + println!("\nTokenizer profile, in nanoseconds"); + println!("\n{:12} total in token sink", self.time_in_sink); + println!("\n{:12} total in tokenizer", total); + + for (k, v) in results.into_iter() { + let pct = 100.0 * (v as f64) / (total as f64); + println!("{:12} {:4.1}% {:?}", v, pct, k); + } + } + + + fn eof_step(&mut self) -> bool { + h5e_debug!("processing EOF in state {:?}", self.state); + match self.state { + XmlState::XData + => go!(self: eof), + XmlState::XTagState + => go!(self: error_eof; emit '<'; to XData), + XmlState::EndXTagState + => go!(self: error_eof; emit '<'; emit '/'; to XData), + XmlState::XTagEmpty + => go!(self: error_eof; to TagAttrNameBefore), + XmlState::Cdata + | XmlState::CdataBracket | XmlState::CdataEnd + | XmlState::XDoctype + => go!(self: error_eof; to XData), + XmlState::Pi + => go!(self: error_eof; to BogusXComment), + XmlState::PiTargetAfter | XmlState::PiAfter + => go!(self: reconsume PiData), + XmlState::MarkupDecl + => go!(self: error_eof; to BogusXComment), + XmlState::XComment | XmlState::XCommentDash + | XmlState::XCommentEnd + => go!(self: error_eof; emit_comment;to XData), + XmlState::XTagName | XmlState::TagAttrNameBefore + | XmlState::EndXTagName | XmlState::TagAttrNameAfter + | XmlState::EndXTagNameAfter | XmlState::TagAttrValueBefore + | XmlState::TagAttrValue(_) + => go!(self: error_eof; emit_tag XData), + XmlState::PiData | XmlState::PiTarget + => go!(self: error_eof; emit_pi XData), + XmlState::TagAttrName + => go!(self: error_eof; emit_start_tag 
XData), + XmlState::BogusXComment + => go!(self: emit_comment; to XData), + } + } + + + fn process_char_ref(&mut self, char_ref: XRef) { + match char_ref { + XRef::CharXData(cdata) => { + match self.state { + states::XData + => self.emit_chars(cdata), + + states::TagAttrValue(_) + => go!(self: append_value cdata), + + _ => panic!("state {:?} should not be reachable in process_char_ref", self.state), + } + + }, + XRef::NamedXRef(xref) => { + if !self.opts.safe_mod { + match self.state { + states::XData + => self.emit_chars(xref), // TODO entity replacement + + states::TagAttrValue(_) + => go!(self: append_value xref), // TODO entity replacement + + _ => panic!("state {:?} should not be eligible for entity expansion", + self.state), + } + } + }, + XRef::NoReturn => {}, + } + } + + fn step_char_ref_tokenizer(&mut self) -> bool { + let mut tok = self.char_ref_tokenizer.take().unwrap(); + let outcome = tok.step(self); + + let progress = match outcome { + char_ref::Done => { + self.process_char_ref(tok.get_result()); + return true; + } + + char_ref::Stuck => false, + char_ref::Progress => true, + }; + + self.char_ref_tokenizer = Some(tok); + progress + } + + fn emit_current_comment(&mut self) { + let comment = replace(&mut self.current_comment, empty_str()); + self.process_token(CommentXToken(comment)); + } + + fn finish_attribute(&mut self) { + if self.current_attr_name.len() == 0 { + return; + } + + // Check for a duplicate attribute. + // FIXME: the spec says we should error as soon as the name is finished. + // FIXME: linear time search, do we care? 
+ let dup = { + let name = &self.current_attr_name[..]; + self.current_tag_attrs.iter().any(|a| a.name.local.as_slice() == name) + }; + + if dup { + self.emit_error(Borrowed("Duplicate attribute")); + self.current_attr_name.truncate(0); + self.current_attr_value.truncate(0); + } else { + let name = replace(&mut self.current_attr_name, String::new()); + self.current_tag_attrs.push(Attribute { + // The tree builder will adjust the namespace if necessary. + // This only happens in foreign elements. + name: QualName::new(ns!(""), Atom::from_slice(&name)), + value: replace(&mut self.current_attr_value, empty_str()), + }); + } + } + + fn create_attribute(&mut self, c: char) { + self.finish_attribute(); + + self.current_attr_name.push(c); + } + +} + #[cfg(test)] #[allow(non_snake_case)] mod test { diff --git a/src/tokenizer/states.rs b/src/tokenizer/states.rs index 1f04075c..17109aee 100644 --- a/src/tokenizer/states.rs +++ b/src/tokenizer/states.rs @@ -17,6 +17,7 @@ pub use self::DoctypeIdKind::*; pub use self::RawKind::*; pub use self::AttrValueKind::*; pub use self::State::*; +pub use self::XmlState::*; #[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)] pub enum ScriptEscapeKind { @@ -90,3 +91,42 @@ pub enum State { CdataSection, Quiescent, } +//FIXME remove these +#[allow(missing_copy_implementations)] +#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)] +pub enum QuoteKind { + SingleQuotes, + DoubleQuotes, +} + +//FIXME remove these +#[allow(missing_copy_implementations)] +#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)] +pub enum XmlState { + XData, + XTagState, + EndXTagState, + EndXTagName, + EndXTagNameAfter, + Pi, + PiTarget, + PiTargetAfter, + PiData, + PiAfter, + MarkupDecl, + XComment, + XCommentDash, + XCommentEnd, + Cdata, + CdataBracket, + CdataEnd, + XDoctype, + XTagName, + XTagEmpty, + TagAttrNameBefore, + TagAttrName, + TagAttrNameAfter, + TagAttrValueBefore, + TagAttrValue(AttrValueKind), + 
BogusXComment, +} diff --git a/src/tree_builder/actions.rs b/src/tree_builder/actions.rs index 6ecc2367..efdf8bbb 100644 --- a/src/tree_builder/actions.rs +++ b/src/tree_builder/actions.rs @@ -20,6 +20,8 @@ use tree_builder::rules::TreeBuilderStep; use tokenizer::{Attribute, Tag, StartTag, EndTag}; use tokenizer::states::{RawData, RawKind}; +use tokenizer::{XTag, XPi}; + use util::str::{AsciiExt, to_escaped_string}; use std::{slice, fmt}; @@ -1084,3 +1086,163 @@ impl TreeBuilderActions } } } + +pub trait XmlTreeBuilderActions { + fn current_node(&self) -> Handle; + fn insert_appropriately(&mut self, child: NodeOrText); + fn insert_tag(&mut self, tag: XTag) -> XmlProcessResult; + fn append_tag(&mut self, tag: XTag) -> XmlProcessResult; + fn append_tag_to_doc(&mut self, tag: XTag) -> Handle; + fn add_to_open_elems(&mut self, el: Handle) -> XmlProcessResult; + fn append_comment_to_doc(&mut self, comment: String) -> XmlProcessResult; + fn append_comment_to_tag(&mut self, text: String) -> XmlProcessResult; + fn append_pi_to_doc(&mut self, pi: XPi) -> XmlProcessResult; + fn append_pi_to_tag(&mut self, pi: XPi) -> XmlProcessResult; + fn append_text(&mut self, chars: String) -> XmlProcessResult; + fn tag_in_open_elems(&self, tag: &XTag) -> bool; + fn pop_until(&mut self, pred: TagSet) where TagSet: Fn(QualName) -> bool; + fn current_node_in(&self, set: TagSet) -> bool where TagSet: Fn(QualName) -> bool; + fn close_tag(&mut self, tag: XTag) -> XmlProcessResult; + fn no_open_elems(&self) -> bool; + fn pop(&mut self) -> Handle ; + fn stop_parsing(&mut self) -> XmlProcessResult; +} + +#[doc(hidden)] +impl XmlTreeBuilderActions + for super::XmlTreeBuilder + where Handle: Clone, + Sink: TreeSink, +{ + + fn current_node(&self) -> Handle { + self.open_elems.last().expect("no current element").clone() + } + + fn insert_appropriately(&mut self, child: NodeOrText){ + let target = self.current_node(); + self.sink.append(target, child); + } + + fn insert_tag(&mut self, tag: XTag) -> 
XmlProcessResult { + let child = self.sink.create_element(QualName::new(ns!(HTML), + tag.name), tag.attrs); + self.insert_appropriately(AppendNode(child.clone())); + self.add_to_open_elems(child) + } + + fn append_tag(&mut self, tag: XTag) -> XmlProcessResult { + let child = self.sink.create_element(QualName::new(ns!(HTML), + tag.name), tag.attrs); + self.insert_appropriately(AppendNode(child)); + XDone + } + + fn append_tag_to_doc(&mut self, tag: XTag) -> Handle { + let root = self.doc_handle.clone(); + let child = self.sink.create_element(QualName::new(ns!(HTML), + tag.name), tag.attrs); + + self.sink.append(root, AppendNode(child.clone())); + child + } + + fn add_to_open_elems(&mut self, el: Handle) -> XmlProcessResult { + self.open_elems.push(el); + + //FIXME remove this on final commit + println!("After add to open elems there are {} open elems", self.open_elems.len()); + XDone + } + + fn append_comment_to_doc(&mut self, text: String) -> XmlProcessResult { + let target = self.doc_handle.clone(); + let comment = self.sink.create_comment(text); + self.sink.append(target, AppendNode(comment)); + XDone + } + + fn append_comment_to_tag(&mut self, text: String) -> XmlProcessResult { + let target = self.current_node(); + let comment = self.sink.create_comment(text); + self.sink.append(target, AppendNode(comment)); + XDone + } + + fn append_pi_to_doc(&mut self, pi: XPi) -> XmlProcessResult { + let target = self.doc_handle.clone(); + let pi = self.sink.create_pi(pi.target, pi.data); + self.sink.append(target, AppendNode(pi)); + XDone + } + + fn append_pi_to_tag(&mut self, pi: XPi) -> XmlProcessResult { + let target = self.current_node(); + let pi = self.sink.create_pi(pi.target, pi.data); + self.sink.append(target, AppendNode(pi)); + XDone + } + + + fn append_text(&mut self, chars: String) + -> XmlProcessResult { + self.insert_appropriately(AppendText(chars)); + XDone + } + + fn tag_in_open_elems(&self, tag: &XTag) -> bool { + self.open_elems + .iter() + .any(|a| 
self.sink.elem_name(a) == QualName::new(ns!(HTML), tag.name.clone())) + } + + // Pop elements until the current node's name satisfies `pred`. The matching + // element itself is left on the stack; nothing is returned. + fn pop_until

(&mut self, pred: P) + where P: Fn(QualName) -> bool + { + loop { + if self.current_node_in(|x| pred(x)) { + break; + } + self.open_elems.pop(); + } + } + + fn current_node_in(&self, set: TagSet) -> bool + where TagSet: Fn(QualName) -> bool + { + set(self.sink.elem_name(&self.current_node())) + } + + fn close_tag(&mut self, tag: XTag) -> XmlProcessResult { + println!("Close tag: current_node.name {:?} \n Current tag {:?}", + self.sink.elem_name(&self.current_node()), &tag.name); + if &self.sink.elem_name(&self.current_node()).local != &tag.name { + self.sink.parse_error(Borrowed("Current node doesn't match tag")); + } + // FIXME remove this part after debug + let is_closed = self.tag_in_open_elems(&tag); + println!("Close tag {:?}", is_closed); + + if(is_closed) { + // FIXME: Real namespace resolution + self.pop_until(|p| p == QualName::new(ns!(HTML), tag.name.clone())); + self.pop(); + } + XDone + } + + fn no_open_elems(&self) -> bool { + self.open_elems.is_empty() + } + + fn pop(&mut self) -> Handle { + self.open_elems.pop().expect("no current element") + } + + fn stop_parsing(&mut self) -> XmlProcessResult { + h5e_warn!("stop_parsing for XML5 not implemented, full speed ahead!"); + XDone + } +} diff --git a/src/tree_builder/mod.rs b/src/tree_builder/mod.rs index 0f910880..ee04fa55 100644 --- a/src/tree_builder/mod.rs +++ b/src/tree_builder/mod.rs @@ -16,6 +16,7 @@ pub use self::interface::{TreeSink, Tracer, NextParserState}; use self::types::*; use self::actions::TreeBuilderActions; use self::rules::TreeBuilderStep; +use self::rules::XmlTreeBuilderStep; use string_cache::QualName; @@ -23,6 +24,7 @@ use tokenizer; use tokenizer::{Doctype, Tag}; use tokenizer::TokenSink; use tokenizer::states as tok_state; +use tokenizer::XTokenSink; use util::str::{is_ascii_whitespace, char_run}; @@ -432,3 +434,147 @@ impl TokenSink self.next_tokenizer_state.take() } } + +// The XML tree builder. +pub struct XmlTreeBuilder { + /// Consumer of tree modifications. 
+ sink: Sink, + + /// The document node, which is created by the sink. + doc_handle: Handle, + + /// Next state change for the tokenizer, if any. + next_tokenizer_state: Option, + + /// Stack of open elements, most recently added at end. + open_elems: Vec, + + /// Current element pointer. + curr_elem: Option, + + /// Current tree builder phase. + phase: XmlPhase, +} +impl XmlTreeBuilder + where Handle: Clone, + Sink: TreeSink, +{ + /// Create a new tree builder which sends tree modifications to a particular `TreeSink`. + /// + /// The tree builder is also a `TokenSink`. + pub fn new(mut sink: Sink) -> XmlTreeBuilder { + let doc_handle = sink.get_document(); + XmlTreeBuilder { + sink: sink, + doc_handle: doc_handle, + next_tokenizer_state: None, + open_elems: vec!(), + curr_elem: None, + phase: StartPhase, + } + } + + pub fn unwrap(self) -> Sink { + self.sink + } + + pub fn sink<'a>(&'a self) -> &'a Sink { + &self.sink + } + + pub fn sink_mut<'a>(&'a mut self) -> &'a mut Sink { + &mut self.sink + } + + /// Call the `Tracer`'s `trace_handle` method on every `Handle` in the tree builder's + /// internal state. This is intended to support garbage-collected DOMs. 
+ pub fn trace_handles(&self, tracer: &Tracer) { + tracer.trace_handle(self.doc_handle.clone()); + for e in self.open_elems.iter() { + tracer.trace_handle(e.clone()); + } + self.curr_elem.as_ref().map(|h| tracer.trace_handle(h.clone())); + } + + // Debug helper + #[cfg(not(for_c))] + #[allow(dead_code)] + fn dump_state(&self, label: String) { + use string_cache::QualName; + + println!("dump_state on {}", label); + print!(" open_elems:"); + for node in self.open_elems.iter() { + let QualName { ns, local } = self.sink.elem_name(node); + print!(" {:?}:{:?}", ns,local); + + } + println!(""); + } + + #[cfg(for_c)] + fn debug_step(&self, _mode: XmlPhase, _token: &XToken) { + } + + #[cfg(not(for_c))] + fn debug_step(&self, mode: XmlPhase, token: &XToken) { + use util::str::to_escaped_string; + h5e_debug!("processing {} in insertion mode {:?}", to_escaped_string(token), mode); + } + + fn process_to_completion(&mut self, mut token: XToken) { + // Queue of additional tokens yet to be processed. + // This stays empty in the common case where we don't split whitespace. + let mut more_tokens = VecDeque::new(); + + loop { + let phase = self.phase; + match self.step(phase, token) { + XDone => { + token = unwrap_or_return!(more_tokens.pop_front(), ()); + } + XReprocess(m, t) => { + self.phase = m; + token = t; + } + + } + } + } +} + +impl XTokenSink + for XmlTreeBuilder + where Handle: Clone, + Sink: TreeSink, +{ + fn process_token(&mut self, token: tokenizer::XToken) { + //let ignore_lf = replace(&mut self.ignore_lf, false); + + // Handle `ParseError` and `DoctypeToken`; convert everything else to the local `Token` type. 
+ let token = match token { + tokenizer::XParseError(e) => { + self.sink.parse_error(e); + return; + } + + tokenizer::DoctypeXToken(_) => { + panic!("Doctype not implemented!!"); + } + + tokenizer::PIToken(x) => PIToken(x), + tokenizer::XTagToken(x) => XTagToken(x), + tokenizer::CommentXToken(x) => CommentXToken(x), + tokenizer::NullCharacterXToken => NullCharacterXToken, + tokenizer::EOFXToken => EOFXToken, + tokenizer::CharacterXTokens(x) => CharacterXTokens(x), + + }; + + self.process_to_completion(token); + } + + fn query_state_change(&mut self) -> Option { + self.next_tokenizer_state.take() + } +} diff --git a/src/tree_builder/rules.rs b/src/tree_builder/rules.rs index 10ee1f48..be5414f2 100644 --- a/src/tree_builder/rules.rs +++ b/src/tree_builder/rules.rs @@ -11,13 +11,14 @@ use tree_builder::types::*; use tree_builder::tag_sets::*; -use tree_builder::actions::TreeBuilderActions; +use tree_builder::actions::{TreeBuilderActions, XmlTreeBuilderActions}; use tree_builder::interface::{TreeSink, Quirks, AppendNode, NextParserState}; use tokenizer::{Tag, StartTag, EndTag}; use tokenizer::states::{Rcdata, Rawtext, ScriptData, Plaintext, Quiescent}; use util::str::{AsciiExt, is_ascii_whitespace}; +use tokenizer::{XTag, StartXTag, EndXTag, ShortXTag, EmptyXTag}; use std::mem::replace; use std::borrow::Cow::Borrowed; @@ -33,7 +34,6 @@ pub trait TreeBuilderStep { fn step(&mut self, mode: InsertionMode, token: Token) -> ProcessResult; fn step_foreign(&mut self, token: Token) -> ProcessResult; } - #[doc(hidden)] impl TreeBuilderStep for super::TreeBuilder @@ -1385,3 +1385,136 @@ impl TreeBuilderStep }) } } + +// FIXME: Merge with TreeBuilderStep +pub trait XmlTreeBuilderStep { + fn step(&mut self, mode: XmlPhase, token: XToken) -> XmlProcessResult; +} + +#[doc(hidden)] +impl XmlTreeBuilderStep + for super::XmlTreeBuilder + where Handle: Clone, + Sink: TreeSink, +{ + + fn step(&mut self, mode: XmlPhase, token: XToken) -> XmlProcessResult { + self.debug_step(mode, &token); 
+ + match mode { + StartPhase => match token { + XTagToken(XTag{kind: StartXTag, name, attrs}) => { + let tag = XTag { + kind: StartXTag, + name: name, + attrs: attrs + }; + self.phase = MainPhase; + let handle = self.append_tag_to_doc(tag); + self.add_to_open_elems(handle) + + }, + XTagToken(XTag{kind: EmptyXTag, name, attrs}) => { + let tag = XTag { + kind: StartXTag, + name: name, + attrs: attrs + }; + self.phase = EndPhase; + self.append_tag_to_doc(tag); + XDone + }, + CommentXToken(comment) => { + self.append_comment_to_doc(comment) + }, + PIToken(pi) => { + self.append_pi_to_doc(pi) + }, + CharacterXTokens(ref chars) + if !any_not_whitespace(chars) => { + XDone + }, + EOFXToken => { + self.sink.parse_error(Borrowed("Unexpected EOF in start phase")); + XReprocess(EndPhase, EOFXToken) + }, + _ => { + self.sink.parse_error(Borrowed("Unexpected element in start phase")); + XDone + }, + }, + MainPhase => match token { + CharacterXTokens(chs) => { + self.append_text(chs) + }, + XTagToken(XTag{kind: StartXTag, name, attrs}) => { + let tag = XTag { + kind: StartXTag, + name: name, + attrs: attrs + }; + + self.insert_tag(tag) + }, + XTagToken(XTag{kind: EmptyXTag, name, attrs}) => { + let tag = XTag { + kind: StartXTag, + name: name, + attrs: attrs + }; + self.append_tag(tag) + }, + XTagToken(XTag{kind: EndXTag, name, attrs}) => { + let tag = XTag { + kind: StartXTag, + name: name, + attrs: attrs + }; + println!("Enter EndXTag in MainPhase"); + let retval = self.close_tag(tag); + if self.no_open_elems() { + println!("No open elems, switch to EndPhase"); + self.phase = EndPhase; + } + retval + }, + XTagToken(XTag{kind: ShortXTag, ..}) => { + self.pop(); + if self.no_open_elems() { + self.phase = EndPhase; + } + XDone + }, + CommentXToken(comment) => { + self.append_comment_to_tag(comment) + }, + PIToken(pi) => { + self.append_pi_to_tag(pi) + }, + EOFXToken | NullCharacterXToken=> { + XReprocess(EndPhase, EOFXToken) + } + }, + EndPhase => match token { + 
CommentXToken(comment) => { + self.append_comment_to_doc(comment) + }, + PIToken(pi) => { + self.append_pi_to_doc(pi) + }, + CharacterXTokens(ref chars) + if !any_not_whitespace(chars) => { + XDone + }, + EOFXToken => { + self.stop_parsing() + } + _ => { + self.sink.parse_error(Borrowed("Unexpected element in end phase")); + XDone + } + }, + + } + } +} diff --git a/src/tree_builder/types.rs b/src/tree_builder/types.rs index 5ed8ab12..bb2a8a0a 100644 --- a/src/tree_builder/types.rs +++ b/src/tree_builder/types.rs @@ -10,11 +10,15 @@ //! Types used within the tree builder code. Not exported to users. use tokenizer::Tag; +use tokenizer::{XTag, XPi}; pub use self::InsertionMode::*; +pub use self::XmlPhase::*; pub use self::SplitStatus::*; pub use self::Token::*; +pub use self::XToken::*; pub use self::ProcessResult::*; +pub use self::XmlProcessResult::*; pub use self::FormatEntry::*; #[derive(PartialEq, Eq, Copy, Clone, Debug)] @@ -43,6 +47,12 @@ pub enum InsertionMode { AfterAfterBody, AfterAfterFrameset, } +#[derive(PartialEq, Eq, Copy, Clone, Debug)] +pub enum XmlPhase { + StartPhase, + MainPhase, + EndPhase, +} #[derive(PartialEq, Eq, Copy, Clone, Debug)] pub enum SplitStatus { @@ -62,6 +72,23 @@ pub enum Token { EOFToken, } +/// A subset/refinement of `tokenizer::XToken`. Everything else is handled +/// specially at the beginning of `process_token`. 
+#[derive(PartialEq, Eq, Clone, Debug)] +pub enum XToken { + XTagToken(XTag), + CommentXToken(String), + CharacterXTokens(String), + PIToken(XPi), + NullCharacterXToken, + EOFXToken, +} + +pub enum XmlProcessResult { + XDone, + XReprocess(XmlPhase, XToken), +} + pub enum ProcessResult { Done, DoneAckSelfClosing, diff --git a/src/util/str.rs b/src/util/str.rs index 70a5c19b..ca55cf6c 100644 --- a/src/util/str.rs +++ b/src/util/str.rs @@ -199,6 +199,23 @@ pub fn char_run(mut pred: Pred, buf: &str) -> Option<(usize, bool)> Some((buf.len(), matches)) } +/// Determines if the character is a valid name character +/// according to XML 1.1 spec +pub fn is_xml_namechar(c: &char) -> bool { + match *c { + 'A'...'Z' | 'a'...'z' | '0'...'9' + | ':' | '_' | '-' | '.' | '\u{B7}' | '\u{C0}'...'\u{D6}' + | '\u{D8}'...'\u{F6}' | '\u{370}'...'\u{37D}' + | '\u{37F}'...'\u{1FFF}' | '\u{200C}'...'\u{200D}' + | '\u{0300}'...'\u{036F}' | '\u{203F}'...'\u{2040}' + | '\u{2070}'...'\u{218F}' | '\u{2C00}'...'\u{2FEF}' + | '\u{3001}'...'\u{D7FF}' | '\u{F900}'...'\u{FDCF}' + | '\u{FDF0}'...'\u{FFFD}' | '\u{10000}'...'\u{EFFFF}' + => true, + _ => false, + } +} + #[cfg(test)] #[allow(non_snake_case)] mod test { diff --git a/xml5lib-tests/AUTHORS.rst b/xml5lib-tests/AUTHORS.rst new file mode 100644 index 00000000..cab486cf --- /dev/null +++ b/xml5lib-tests/AUTHORS.rst @@ -0,0 +1,9 @@ +Credits +======= + +The ``xml5lib`` test data is maintained by: + +- Daniel Fath + +Contributors +------------ diff --git a/xml5lib-tests/LICENSE b/xml5lib-tests/LICENSE new file mode 100644 index 00000000..b0fc926e --- /dev/null +++ b/xml5lib-tests/LICENSE @@ -0,0 +1,21 @@ +Copyright (c) 2006-2013 James Graham, Geoffrey Sneddon and +other contributors + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, 
publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/xml5lib-tests/tokenizer/README.md b/xml5lib-tests/tokenizer/README.md new file mode 100644 index 00000000..4218c26b --- /dev/null +++ b/xml5lib-tests/tokenizer/README.md @@ -0,0 +1,104 @@ +Tokenizer tests +=============== + +The test format is [JSON](http://www.json.org/). This has the advantage +that the syntax allows backward-compatible extensions to the tests and +the disadvantage that it is relatively verbose. + +Basic Structure +--------------- + + {"tests": [ +     {"description": "Test description", +     "input": "input_string", +     "output": [expected_output_tokens], +     "initialStates": [initial_states], +     "lastStartTag": last_start_tag, +     "ignoreErrorOrder": ignore_error_order +     } + ]} + +Multiple tests per file are allowed simply by adding more objects to the +"tests" list. + +`description`, `input` and `output` are always present. The other values +are optional. + +### Test set-up + +`test.input` is a string containing the characters to pass to the +tokenizer. 
Specifically, it represents the characters of the **input +stream**, and so implementations are expected to perform the processing +described in the spec's **Preprocessing the input stream** section +before feeding the result to the tokenizer. + +If `test.doubleEscaped` is present and `true`, then `test.input` is not +quite as described above. Instead, it must first be subjected to another +round of unescaping (i.e., in addition to any unescaping involved in the +JSON import), and the result of *that* represents the characters of the +input stream. Currently, the only unescaping required by this option is +to convert each sequence of the form \\uHHHH (where H is a hex digit) +into the corresponding Unicode code point. (Note that this option also +affects the interpretation of `test.output`.) + +`test.initialStates` is a list of strings, each being the name of a +tokenizer state. The test should be run once for each string, using it +to set the tokenizer's initial state for that run. If +`test.initialStates` is omitted, it defaults to `["data state"]`. + +`test.lastStartTag` is a lowercase string that should be used as "the +tag name of the last start tag to have been emitted from this +tokenizer", referenced in the spec's definition of **appropriate end tag +token**. If it is omitted, it is treated as if "no start tag has been +emitted from this tokenizer". + +### Test results + +`test.output` is a list of tokens, ordered with the first produced by +the tokenizer the first (leftmost) in the list. The list must match the +**complete** list of tokens that the tokenizer should produce. Valid +tokens are: + + ["DOCTYPE", name, public_id, system_id, correctness] + ["StartTag", name, {attributes}*, true*] + ["StartTag", name, {attributes}] + ["EndTag", name] + ["Comment", data] + ["Character", data] + "ParseError" + +`public_id` and `system_id` are either strings or `null`.
`correctness` +is either `true` or `false`; `true` corresponds to the force-quirks flag +being false, and vice-versa. + +When the self-closing flag is set, the `StartTag` array has `true` as +its fourth entry. When the flag is not set, the array has only three +entries for backwards compatibility. + +All adjacent character tokens are coalesced into a single +`["Character", data]` token. + +If `test.doubleEscaped` is present and `true`, then every string within +`test.output` must be further unescaped (as described above) before +comparing with the tokenizer's output. + +`test.ignoreErrorOrder` is a boolean value indicating that the order of +`ParseError` tokens relative to other tokens in the output stream is +unimportant, and implementations should ignore such differences between +their output and `expected_output_tokens`. (This is used for errors +emitted by the input stream preprocessing stage, since it is useful to +test that code but it is undefined when the errors occur). If it is +omitted, it defaults to `false`. + +xmlViolation tests +------------------ + +`tokenizer/xmlViolation.test` differs from the above in a couple of +ways: + +- The name of the single member of the top-level JSON object is + "xmlViolationTests" instead of "tests". +- Each test's expected output assumes that implementation is applying + the tweaks given in the spec's "Coercing an HTML DOM into an + infoset" section. 
+ diff --git a/xml5lib-tests/tokenizer/eof.test b/xml5lib-tests/tokenizer/eof.test new file mode 100644 index 00000000..0e436f6c --- /dev/null +++ b/xml5lib-tests/tokenizer/eof.test @@ -0,0 +1,113 @@ +{"tests": [ + +{"description":"Data state EOF", +"input":"", +"output":[] +}, + +{"description":"Tag state EOF", +"input":"<", +"output":["ParseError", ["Character", "<"]] +}, + +{"description":"End tag state premature EOF", +"input":"", +"output":[["StartTag", "z", {}], ["EndTag", "z"], ["ShortTag", ""], ["EmptyTag", "a", {}]] +}, + +{"description":"Test longer tags", +"input":"", +"output":[["StartTag", "az",{}],["EndTag", "xyz"]] +}, + +{"description":"Attributes DoubleQuoted", +"input":"", +"output":[["StartTag", "a", {"ax":"test"}]]}, + +{"description":"Attributes SingleQuoted", +"input":"", +"output":[["StartTag", "b", {"ay":"test"}]]}, + +{"description":"Attributes UnQuoted", +"input":"", +"output":[["StartTag", "c", {"az":"test"}]]}, + +{"description":"Start tag multiple attributes", +"input":"", +"output":[["StartTag", "c", {"a":"test1", "b":"test2", "c":"test3"}]]}, + +{"description":"Empty Tag multiple attributes", +"input":"", +"output":[["EmptyTag", "c", {"a":"test1", "b":"test2", "c":"test3"}]] +}, + +{"description":"Tag state Error >", +"input":"<>", +"output":["ParseError", ["Character", "<>"]] +}, + +{"description":"Tag state Error :", +"input":"<:", +"output":["ParseError", ["Character", "<:"] +]}, + +{"description":"Tag state Error <", +"input":"<<", +"output":["ParseError", ["Character", "<"], "ParseError", ["Character", "<"]] +}, + +{"description":"Tag state Error ' '", +"input":"< ", +"output":["ParseError", ["Character", "< "]] +}, + +{"description":"Tag state Error '\\t'", +"input":"<\t", +"output":["ParseError", ["Character", "<\t"]] +}, + +{"description":"Tag state Error '\\n'", +"input":"<\n", +"output":["ParseError", ["Character", "<\n"]] +}, + +{"description":"End tag state Error '\\t'", +"input":"", +"output":[["EndTag", "a"]] +}, + 
+{"description":"End tag name after state with attr", +"input":"", +"output":["ParseError", "ParseError", "ParseError", "ParseError", "ParseError", ["EndTag", "a"]] +}, + +{"description":"PI tag", +"input":"", +"output":[["PI", "xslt", "ma"]] +}, + +{"description":"PI tag", +"input":"", +"output":[["PI", "xslt", "m"]] +}, + +{"description":"PI tag with '?' in target", +"input":"", +"output":[["PI", "?xml", "m"]] +}, + +{"description":"Comment", +"input":"", +"output":[["Comment", "comment"]] +}, + +{"description":"Comment", +"input":"", +"output":[["Comment", "--comment "]] +}, + +{"description":"Comment", +"input":"", +"output":[["Comment", "--comment-"]] +}, + +{"description":"Tiny Bogus Comment", +"input":"", +"output":["ParseError", ["Comment", ""]] +}, + +{"description":"Short Bogus Comment", +"input":"", +"output":["ParseError", ["Comment", ""]] +}, + +{"description":"CDATA state", +"input":"", +"output":[["Character", "&ing"]] +}, + +{"description":"CDATA state", +"input":"", +"output":[["Character", "&ing ]"]] +}, + +{"description":"CDATA state", +"input":"", +"output":[["Character", "&ing]] "]] +} + +]} \ No newline at end of file diff --git a/xml5lib-tests/tokenizer/test2.test b/xml5lib-tests/tokenizer/test2.test new file mode 100644 index 00000000..1f4d570c --- /dev/null +++ b/xml5lib-tests/tokenizer/test2.test @@ -0,0 +1,64 @@ +{"tests": [ + +{"description":"EOF", +"input":"", +"output":[] +}, + +{"description":"End tag state premature EOF", +"input":"", +"output":["ParseError", ["Comment", "\txml m?"]] +}, + +{"description":"PI tag with '\\n' in target", +"input":"", +"output":["ParseError", ["Comment", "\nxml m?"]] +}, + +{"description":"PI tag with ' ' in target", +"input":"", +"output":["ParseError", ["Comment", " xml m?"]] +}, + +{"description":"Double end element", +"input":"
", +"output":["ParseError", ["EndTag", "br"]] +}, + +{"description":"Start tag double attributes, one unfinished", +"input":"", +"output":["ParseError", ["StartTag", "b", {"ay":"test"}]] +}, + +{"description":"Start tag double attributes", +"input":"", +"output":["ParseError", ["StartTag", "b", {"ay":"test"}]] +}, + +{"description":"Empty tag double attributes, one unfinished", +"input":"", +"output":["ParseError", ["EmptyTag", "b", {"ay":"test"}]] +}, + +{"description":"Empty tag double attributes", +"input":"", +"output":["ParseError", ["EmptyTag", "b", {"ay":"test"}]] +} + +]} \ No newline at end of file diff --git a/xml5lib-tests/tree-construction/README.md b/xml5lib-tests/tree-construction/README.md new file mode 100644 index 00000000..be41fa44 --- /dev/null +++ b/xml5lib-tests/tree-construction/README.md @@ -0,0 +1,104 @@ +Tree Construction Tests +======================= + +Each file containing tree construction tests consists of any number of +tests separated by two newlines (LF) and a single newline before the end +of the file. For instance: + + [TEST]LF + LF + [TEST]LF + LF + [TEST]LF + +Where [TEST] is the following format: + +Each test must begin with a string "\#data" followed by a newline (LF). +All subsequent lines until a line that says "\#errors" are the test data +and must be passed to the system being tested unchanged, except with the +final newline (on the last line) removed. + +Then there must be a line that says "\#errors". It must be followed by +one line per parse error that a conformant checker would return. It +doesn't matter what those lines are, although they can't be +"\#document-fragment", "\#document", "\#script-off", "\#script-on", or +empty, the only thing that matters is that there be the right number +of parse errors. + +Then there \*may\* be a line that says "\#document-fragment", which must +be followed by a newline (LF), followed by a string of characters that +indicates the context element, followed by a newline (LF). 
If the string +of characters starts with "svg ", the context element is in the SVG +namespace and the substring after "svg " is the local name. If the +string of characters starts with "math ", the context element is in the +MathML namespace and the substring after "math " is the local name. +Otherwise, the context element is in the HTML namespace and the string +is the local name. If this line is present the "\#data" must be parsed +using the HTML fragment parsing algorithm with the context element as +context. + +Then there \*may\* be a line that says "\#script-off" or +"\#script-on". If a line that says "\#script-off" is present, the +parser must set the scripting flag to disabled. If a line that says +"\#script-on" is present, it must set it to enabled. Otherwise, the +test should be run in both modes. + +Then there must be a line that says "\#document", which must be followed +by a dump of the tree of the parsed DOM. Each node must be represented +by a single line. Each line must start with "| ", followed by two spaces +per parent node that the node has before the root document node. + +- Element nodes must be represented by a "`<`" then the *tag name + string* "`>`", and all the attributes must be given, sorted + lexicographically by UTF-16 code unit according to their *attribute + name string*, on subsequent lines, as if they were children of the + element node. +- Attribute nodes must have the *attribute name string*, then an "=" + sign, then the attribute value in double quotes ("). +- Text nodes must be the string, in double quotes. Newlines aren't + escaped. +- Comments must be "`<`" then "`!-- `" then the data then "` -->`". +- DOCTYPEs must be "`<!DOCTYPE `" then the name then if either of the + system id or public id is non-empty a space, public id in + double-quotes, another space and the system id in double-quotes, and + then in any case "`>`". +- Processing instructions must be "`<?`", then the target, then a + space, then the data and then "`>`". (The HTML parser cannot emit + processing instructions, but scripts can, and the WebVTT to DOM + rules can emit them.) +- Template contents are represented by the string "content" with the + children below it.
+ +The *tag name string* is the local name prefixed by a namespace +designator. For the HTML namespace, the namespace designator is the +empty string, i.e. there's no prefix. For the SVG namespace, the +namespace designator is "svg ". For the MathML namespace, the namespace +designator is "math ". + +The *attribute name string* is the local name prefixed by a namespace +designator. For no namespace, the namespace designator is the empty +string, i.e. there's no prefix. For the XLink namespace, the namespace +designator is "xlink ". For the XML namespace, the namespace designator +is "xml ". For the XMLNS namespace, the namespace designator is "xmlns +". Note the difference between "xlink:href" which is an attribute in no +namespace with the local name "xlink:href" and "xlink href" which is an +attribute in the xlink namespace with the local name "href". + +If there is also a "\#document-fragment" the bit following "\#document" +must be a representation of the HTML fragment serialization for the +context element given by "\#document-fragment". + +For example: + + #data +

One

Two + #errors + 3: Missing document type declaration + #document + | + | + | + |

+ | "One" + |

+ | "Two" diff --git a/xml5lib-tests/tree-construction/test1.dat b/xml5lib-tests/tree-construction/test1.dat new file mode 100644 index 00000000..73f30c57 --- /dev/null +++ b/xml5lib-tests/tree-construction/test1.dat @@ -0,0 +1,102 @@ +#data + +#document +| + +#data + +#document +| +| + +#data + +#document +| +| + +#data + +#document +| +| + +#data + +#document +| + +#data +Text +#document +| +| "Text" + + +#data + +#document +| +| + +#data + +#document +| +| + +#data + +#document +| +| + +#data + +#document +| +| + +#data +Text +#document +| +| +| "Text" + +#data +Text +#document +| +| +| "Text" + +#data + +#document +| +| + +#data + +#document +| +| + +#data + +#document +| +| + +#data + +#document +| +| + +#data + +#document +| +| \ No newline at end of file From b0bd7762dd3caaa45fb34a029ad346556385fe43 Mon Sep 17 00:00:00 2001 From: Daniel Fath Date: Mon, 13 Apr 2015 14:06:23 +0200 Subject: [PATCH 3/6] Add Processing Instruction node type to tree builder - Add Processing Instruction as a separate type of Nodes. This is a prerequisite for proper XML support. --- dom_sink/src/common.rs | 5 ++++- dom_sink/src/owned_dom.rs | 9 ++++++++- dom_sink/src/rcdom.rs | 9 ++++++++- examples/noop-tree-builder.rs | 4 ++++ examples/print-rcdom.rs | 7 +++++-- examples/print-tree-actions.rs | 6 ++++++ src/tree_builder/interface.rs | 3 +++ tests/tree_builder.rs | 10 +++++++++- 8 files changed, 47 insertions(+), 6 deletions(-) diff --git a/dom_sink/src/common.rs b/dom_sink/src/common.rs index 7fb1c6fe..69725676 100644 --- a/dom_sink/src/common.rs +++ b/dom_sink/src/common.rs @@ -11,7 +11,7 @@ use html5ever::tokenizer::Attribute; use string_cache::QualName; -pub use self::NodeEnum::{Document, Doctype, Text, Comment, Element}; +pub use self::NodeEnum::{Document, Doctype, Text, Comment, Element, PI}; /// The different kinds of nodes in the DOM. #[derive(Debug)] @@ -30,4 +30,7 @@ pub enum NodeEnum { /// An element with attributes. 
Element(QualName, Vec), + + /// A Processing instruction. + PI(String, String), } diff --git a/dom_sink/src/owned_dom.rs b/dom_sink/src/owned_dom.rs index 737fdd4b..c5ff9540 100644 --- a/dom_sink/src/owned_dom.rs +++ b/dom_sink/src/owned_dom.rs @@ -18,7 +18,7 @@ //! been thoroughly audited, and the performance gains vs. RcDom //! have not been demonstrated. -use common::{NodeEnum, Document, Doctype, Text, Comment, Element}; +use common::{NodeEnum, Document, Doctype, Text, Comment, Element, PI}; use html5ever::tokenizer::Attribute; use html5ever::tree_builder::{TreeSink, QuirksMode, NodeOrText, AppendNode, AppendText}; @@ -212,6 +212,10 @@ impl TreeSink for Sink { self.new_node(Comment(text)) } + fn create_pi(&mut self, target: String, data: String) -> Handle { + self.new_node(PI(target, data)) + } + fn append(&mut self, parent: Handle, child: NodeOrText) { // Append to an existing Text node if we have one. match child { @@ -382,6 +386,9 @@ impl Serializable for Node { (IncludeNode, &Text(ref text)) => serializer.write_text(&text), (IncludeNode, &Comment(ref text)) => serializer.write_comment(&text), + (IncludeNode, &PI(ref target, ref data)) + => serializer.write_processing_instruction(&target, &data), + (IncludeNode, &Document) => panic!("Can't serialize Document node itself"), } } diff --git a/dom_sink/src/rcdom.rs b/dom_sink/src/rcdom.rs index 5f0b56fa..842306df 100644 --- a/dom_sink/src/rcdom.rs +++ b/dom_sink/src/rcdom.rs @@ -12,7 +12,7 @@ //! This is sufficient as a static parse tree, but don't build a //! web browser using it. 
:) -use common::{NodeEnum, Document, Doctype, Text, Comment, Element}; +use common::{NodeEnum, Document, Doctype, Text, Comment, Element,PI}; use html5ever::tokenizer::Attribute; use html5ever::tree_builder::{TreeSink, QuirksMode, NodeOrText, AppendNode, AppendText}; @@ -163,6 +163,10 @@ impl TreeSink for RcDom { new_node(Comment(text)) } + fn create_pi(&mut self, target: String, data: String) -> Handle { + new_node(PI(target, data)) + } + fn append(&mut self, parent: Handle, child: NodeOrText) { // Append to an existing Text node if we have one. match child { @@ -305,6 +309,9 @@ impl Serializable for Handle { (IncludeNode, &Text(ref text)) => serializer.write_text(&text), (IncludeNode, &Comment(ref text)) => serializer.write_comment(&text), + (IncludeNode, &PI(ref target, ref data)) + => serializer.write_processing_instruction(&target, &data), + (IncludeNode, &Document) => panic!("Can't serialize Document node itself"), } } diff --git a/examples/noop-tree-builder.rs b/examples/noop-tree-builder.rs index cb0d35c2..27e6b892 100644 --- a/examples/noop-tree-builder.rs +++ b/examples/noop-tree-builder.rs @@ -60,6 +60,10 @@ impl TreeSink for Sink { self.get_id() } + fn create_pi(&mut self, _target: String, _data: String) -> usize { + self.get_id() + } + fn append_before_sibling(&mut self, _sibling: usize, _new_node: NodeOrText) -> Result<(), NodeOrText> { diff --git a/examples/print-rcdom.rs b/examples/print-rcdom.rs index afd3952b..fe7b925e 100644 --- a/examples/print-rcdom.rs +++ b/examples/print-rcdom.rs @@ -22,7 +22,7 @@ use std::default::Default; use std::string::String; use html5ever::{parse, one_input}; -use html5ever_dom_sink::common::{Document, Doctype, Text, Comment, Element}; +use html5ever_dom_sink::common::{Document, Doctype, Text, Comment, Element, PI}; use html5ever_dom_sink::rcdom::{RcDom, Handle}; // This is not proper HTML serialization, of course. 
@@ -44,6 +44,9 @@ fn walk(indent: usize, handle: Handle) { Comment(ref text) => println!("", text.escape_default()), + PI(ref target, ref data) + => println!("", target, data), + Element(ref name, ref attrs) => { assert!(name.ns == ns!(html)); print!("<{}", name.local); @@ -63,7 +66,7 @@ fn walk(indent: usize, handle: Handle) { fn main() { let mut input = String::new(); io::stdin().read_to_string(&mut input).unwrap(); - let dom: RcDom = parse(one_input(input), Default::default()); + let dom: RcDom = parse_xml(one_input(input), Default::default()); walk(0, dom.document); if !dom.errors.is_empty() { diff --git a/examples/print-tree-actions.rs b/examples/print-tree-actions.rs index 26958f0a..09c5a31d 100644 --- a/examples/print-tree-actions.rs +++ b/examples/print-tree-actions.rs @@ -73,6 +73,12 @@ impl TreeSink for Sink { id } + fn create_pi(&mut self, target: String, data: String) -> usize { + let id = self.get_id(); + println!("Created Processing Instruction: {} {}", target, data); + id + } + fn append(&mut self, parent: usize, child: NodeOrText) { match child { AppendNode(n) diff --git a/src/tree_builder/interface.rs b/src/tree_builder/interface.rs index c3fdd82e..d04e358b 100644 --- a/src/tree_builder/interface.rs +++ b/src/tree_builder/interface.rs @@ -76,6 +76,9 @@ pub trait TreeSink { /// Create a comment node. fn create_comment(&mut self, text: String) -> Self::Handle; + /// Create a Processing Instruction node. + fn create_pi(&mut self, target: String, data: String) -> Self::Handle; + /// Append a node as the last child of the given node. If this would /// produce adjacent sibling text nodes, it should concatenate the text /// instead. 
diff --git a/tests/tree_builder.rs b/tests/tree_builder.rs index ddff71f9..b9784ee9 100644 --- a/tests/tree_builder.rs +++ b/tests/tree_builder.rs @@ -32,7 +32,7 @@ use test::{TestDesc, TestDescAndFn, DynTestName, DynTestFn}; use test::ShouldPanic::No; use html5ever::{parse, parse_fragment, one_input}; -use html5ever_dom_sink::common::{Document, Doctype, Text, Comment, Element}; +use html5ever_dom_sink::common::{Document, Doctype, Text, Comment, Element, PI}; use html5ever_dom_sink::rcdom::{RcDom, Handle}; use string_cache::Atom; @@ -104,6 +104,14 @@ fn serialize(buf: &mut String, indent: usize, handle: Handle) { buf.push_str("\"\n"); } + PI(ref target, ref data) => { + buf.push_str("\n"); + } + Comment(ref text) => { buf.push_str("