From 56e2ef2dae694773f326269501d354785ee15099 Mon Sep 17 00:00:00 2001 From: Keegan McAllister Date: Sat, 21 Mar 2015 15:23:30 -0700 Subject: [PATCH 1/5] Read stdin in noop-tokenize --- examples/noop-tokenize.rs | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/examples/noop-tokenize.rs b/examples/noop-tokenize.rs index 0e7ce85e..0f5725f2 100644 --- a/examples/noop-tokenize.rs +++ b/examples/noop-tokenize.rs @@ -14,7 +14,7 @@ extern crate test; extern crate html5ever; -use std::{fs, env}; +use std::io; use std::io::prelude::*; use std::default::Default; @@ -34,15 +34,10 @@ impl TokenSink for Sink { } fn main() { - let mut path = env::current_exe().unwrap(); - path.push("../data/bench/"); - path.push(env::args().nth(1).unwrap()); + let mut input = String::new(); + io::stdin().read_to_string(&mut input).unwrap(); - let mut file = fs::File::open(&path).unwrap(); - let mut file_input = String::new(); - file.read_to_string(&mut file_input).unwrap(); - - tokenize_to(Sink, one_input(file_input), TokenizerOpts { + tokenize_to(Sink, one_input(input), TokenizerOpts { profile: true, .. Default::default() }); From 180e924eefbea4badc3d21102408cbbe5239bb74 Mon Sep 17 00:00:00 2001 From: Keegan McAllister Date: Sat, 21 Mar 2015 17:39:51 -0700 Subject: [PATCH 2/5] Rework BeforeAttributeValue tokenizer state --- src/tokenizer/mod.rs | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 476db92e..4e70369b 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -529,6 +529,7 @@ macro_rules! shorthand ( ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c); ); ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.push($c); ); ( $me:ident : discard_tag ) => ( $me.discard_tag(); ); + ( $me:ident : discard_char ) => ( $me.discard_char(); ); ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.push($c); ); ( $me:ident : emit_temp ) => ( $me.emit_temp_buf(); ); ( $me:ident : clear_temp ) => ( $me.clear_temp_buf(); ); @@ -613,6 +614,10 @@ macro_rules! get_char ( ($me:expr) => ( unwrap_or_return!($me.get_char(), false) )); +macro_rules! peek ( ($me:expr) => ( + unwrap_or_return!($me.peek(), false) +)); + macro_rules! pop_except_from ( ($me:expr, $set:expr) => ( unwrap_or_return!($me.pop_except_from($set), false) )); @@ -912,18 +917,16 @@ impl Tokenizer { }}, //§ before-attribute-value-state - states::BeforeAttributeValue => loop { match get_char!(self) { - '\t' | '\n' | '\x0C' | ' ' => (), - '"' => go!(self: to AttributeValue DoubleQuoted), - '&' => go!(self: reconsume AttributeValue Unquoted), - '\'' => go!(self: to AttributeValue SingleQuoted), - '\0' => go!(self: error; push_value '\u{fffd}'; to AttributeValue Unquoted), - '>' => go!(self: error; emit_tag Data), - c => { - go_match!(self: c, - '<' , '=' , '`' => error); - go!(self: push_value c; to AttributeValue Unquoted); - } + // Use peek so we can handle the first attr character along with the rest, + // hopefully in the same zero-copy buffer. + states::BeforeAttributeValue => loop { match peek!(self) { + '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char), + '"' => go!(self: discard_char; to AttributeValue DoubleQuoted), + '&' => go!(self: to AttributeValue Unquoted), + '\'' => go!(self: discard_char; to AttributeValue SingleQuoted), + '\0' => go!(self: discard_char; error; push_value '\u{fffd}'; to AttributeValue Unquoted), + '>' => go!(self: discard_char; error; emit_tag Data), + _ => go!(self: to AttributeValue Unquoted), }}, //§ attribute-value-(double-quoted)-state From 1e1f5a1911ccfb7822661ad3c3865899f7592f0f Mon Sep 17 00:00:00 2001 From: Keegan McAllister Date: Wed, 29 Apr 2015 20:52:38 -0700 Subject: [PATCH 3/5] Disable profiling in noop-tokenize --- examples/noop-tokenize.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/examples/noop-tokenize.rs b/examples/noop-tokenize.rs index 0f5725f2..2f77f71e 100644 --- a/examples/noop-tokenize.rs +++ b/examples/noop-tokenize.rs @@ -20,7 +20,7 @@ use std::default::Default; use test::black_box; -use html5ever::tokenizer::{TokenSink, Token, TokenizerOpts}; +use html5ever::tokenizer::{TokenSink, Token}; use html5ever::driver::{tokenize_to, one_input}; struct Sink; @@ -37,8 +37,5 @@ fn main() { let mut input = String::new(); io::stdin().read_to_string(&mut input).unwrap(); - tokenize_to(Sink, one_input(input), TokenizerOpts { - profile: true, - .. Default::default() - }); + tokenize_to(Sink, one_input(input), Default::default()); } From 7be620c441ebc86e92826a445836d89a90709d46 Mon Sep 17 00:00:00 2001 From: Keegan McAllister Date: Sat, 21 Mar 2015 20:02:43 -0700 Subject: [PATCH 4/5] Implement zero-copy parsing Based on #60 and #114. Fixes #20. Fixes #115. --- Cargo.toml | 3 + benches/tokenizer.rs | 11 ++- capi/Cargo.toml | 3 + capi/src/lib.rs | 9 ++ capi/src/tokenizer.rs | 3 +- dom_sink/src/common.rs | 7 +- dom_sink/src/owned_dom.rs | 10 ++- dom_sink/src/rcdom.rs | 10 ++- examples/html2html.rs | 11 ++- examples/noop-tokenize.rs | 9 +- examples/noop-tree-builder.rs | 16 ++-- examples/print-rcdom.rs | 7 +- examples/print-tree-actions.rs | 19 ++-- examples/tokenize.rs | 8 +- src/driver.rs | 49 +++++------ src/lib.rs | 2 + src/tokenizer/buffer_queue.rs | 113 ++++++++++-------------- src/tokenizer/char_ref/mod.rs | 24 ++--- src/tokenizer/interface.rs | 13 +-- src/tokenizer/mod.rs | 155 ++++++++++++++------------------- src/tree_builder/actions.rs | 17 ++-- src/tree_builder/data.rs | 19 ++-- src/tree_builder/interface.rs | 10 ++- src/tree_builder/mod.rs | 36 ++++---- src/tree_builder/rules.rs | 6 +- src/tree_builder/types.rs | 8 +- src/util/smallcharset.rs | 10 +-- src/util/str.rs | 47 +--------- tests/serializer.rs | 10 ++- tests/tokenizer.rs | 48 ++++++---- tests/tree_builder.rs | 4 + 31 files changed, 344 insertions(+), 353 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ad0c633b..3ce84266 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,9 @@ git = "https://github.com/servo/string-cache" [dependencies.string_cache_plugin] git = "https://github.com/servo/string-cache" +[dependencies.tendril] +git = "https://github.com/kmcallister/tendril" + [dependencies.mac] git = "https://github.com/reem/rust-mac" diff --git a/benches/tokenizer.rs b/benches/tokenizer.rs index 8da759d4..36de0b7e 100644 --- a/benches/tokenizer.rs +++ b/benches/tokenizer.rs @@ -10,6 +10,7 @@ #![feature(box_syntax, std_misc, start, test)] extern crate test; +extern crate tendril; extern crate html5ever; use std::{fs, env, cmp, rt}; @@ -21,6 +22,7 @@ use test::{black_box, Bencher, TestDesc, TestDescAndFn}; use test::{DynTestName, DynBenchFn, TDynBenchFn}; use test::ShouldPanic::No; +use tendril::{ByteTendril, StrTendril, ReadExt, SliceExt}; use html5ever::tokenizer::{TokenSink, Token, Tokenizer, TokenizerOpts}; struct Sink; @@ -36,7 +38,7 @@ impl TokenSink for Sink { // This could almost be the TokenSink too, but it's not // mut within run(). struct Bench { - input: Vec, + input: Vec, clone_only: bool, opts: TokenizerOpts, } @@ -50,8 +52,9 @@ impl Bench { let mut file = fs::File::open(&path).ok().expect("can't open file"); // Read the file and treat it as an infinitely repeating sequence of characters. - let mut file_input = String::new(); - file.read_to_string(&mut file_input).ok().expect("can't read file"); + let mut file_input = ByteTendril::new(); + file.read_to_tendril(&mut file_input).ok().expect("can't read file"); + let file_input: StrTendril = file_input.try_reinterpret().unwrap(); let size = size.unwrap_or(file_input.len()); let mut stream = file_input.chars().cycle(); @@ -63,7 +66,7 @@ impl Bench { // The by_ref() call is important, otherwise we get wrong results! // See rust-lang/rust#18045. let sz = cmp::min(1024, size - total); - input.push(stream.by_ref().take(sz).collect()); + input.push(stream.by_ref().take(sz).collect::().to_tendril()); total += sz; } diff --git a/capi/Cargo.toml b/capi/Cargo.toml index d665d752..9cd2e9a8 100644 --- a/capi/Cargo.toml +++ b/capi/Cargo.toml @@ -18,3 +18,6 @@ path = "../" git = "https://github.com/servo/string-cache" [dependencies.string_cache_plugin] git = "https://github.com/servo/string-cache" + +[dependencies.tendril] +git = "https://github.com/kmcallister/tendril" diff --git a/capi/src/lib.rs b/capi/src/lib.rs index 2f1dbc62..5a84df3a 100644 --- a/capi/src/lib.rs +++ b/capi/src/lib.rs @@ -9,6 +9,7 @@ extern crate libc; extern crate string_cache; +extern crate tendril; extern crate html5ever; use std::{ptr, slice, str}; @@ -19,6 +20,8 @@ use libc::{size_t, c_int, c_char, strlen}; use string_cache::Atom; +use tendril::StrTendril; + #[repr(C)] pub struct h5e_buf { data: *const u8, @@ -86,6 +89,12 @@ impl AsLifetimeBuf for String { } } +impl AsLifetimeBuf for StrTendril { + fn as_lifetime_buf<'a>(&'a self) -> LifetimeBuf<'a> { + LifetimeBuf::from_str(self) + } +} + impl AsLifetimeBuf for Atom { fn as_lifetime_buf<'a>(&'a self) -> LifetimeBuf<'a> { LifetimeBuf::from_str(self) diff --git a/capi/src/tokenizer.rs b/capi/src/tokenizer.rs index d1fe7559..b718401b 100644 --- a/capi/src/tokenizer.rs +++ b/capi/src/tokenizer.rs @@ -14,6 +14,7 @@ use {LifetimeBuf, AsLifetimeBuf, h5e_buf, c_bool}; use html5ever::tokenizer::{TokenSink, Token, Doctype, Tag, ParseError, DoctypeToken}; use html5ever::tokenizer::{CommentToken, CharacterTokens, NullCharacterToken}; use html5ever::tokenizer::{TagToken, StartTag, EndTag, EOFToken, Tokenizer}; +use html5ever::Tendril; use std::mem; use std::default::Default; @@ -71,7 +72,7 @@ impl TokenSink for *mut h5e_token_sink { ($name:ident) => (call!($name,)); // bleh } - fn opt_str_to_buf<'a>(s: &'a Option) -> LifetimeBuf<'a> { + fn opt_str_to_buf<'a>(s: &'a Option) -> LifetimeBuf<'a> { match *s { None => LifetimeBuf::null(), Some(ref s) => s.as_lifetime_buf(), diff --git a/dom_sink/src/common.rs b/dom_sink/src/common.rs index 7fb1c6fe..9f021b0c 100644 --- a/dom_sink/src/common.rs +++ b/dom_sink/src/common.rs @@ -10,6 +10,7 @@ use html5ever::tokenizer::Attribute; use string_cache::QualName; +use tendril::StrTendril; pub use self::NodeEnum::{Document, Doctype, Text, Comment, Element}; @@ -20,13 +21,13 @@ pub enum NodeEnum { Document, /// A `DOCTYPE` with name, public id, and system id. - Doctype(String, String, String), + Doctype(StrTendril, StrTendril, StrTendril), /// A text node. - Text(String), + Text(StrTendril), /// A comment. - Comment(String), + Comment(StrTendril), /// An element with attributes. Element(QualName, Vec), diff --git a/dom_sink/src/owned_dom.rs b/dom_sink/src/owned_dom.rs index 9b478165..eef02808 100644 --- a/dom_sink/src/owned_dom.rs +++ b/dom_sink/src/owned_dom.rs @@ -38,6 +38,7 @@ use std::collections::HashSet; use std::ops::{Deref, DerefMut}; use string_cache::QualName; +use tendril::StrTendril; /// The internal type we use for nodes during parsing. pub struct SquishyNode { @@ -135,7 +136,7 @@ fn get_parent_and_index(child: Handle) -> Option<(Handle, usize)> { fn append_to_existing_text(mut prev: Handle, text: &str) -> bool { match prev.deref_mut().node { Text(ref mut existing) => { - existing.push_str(text); + existing.push_slice(text); true } _ => false, @@ -208,7 +209,7 @@ impl TreeSink for Sink { self.new_node(Element(name, attrs)) } - fn create_comment(&mut self, text: String) -> Handle { + fn create_comment(&mut self, text: StrTendril) -> Handle { self.new_node(Comment(text)) } @@ -262,7 +263,10 @@ impl TreeSink for Sink { Ok(()) } - fn append_doctype_to_document(&mut self, name: String, public_id: String, system_id: String) { + fn append_doctype_to_document(&mut self, + name: StrTendril, + public_id: StrTendril, + system_id: StrTendril) { append(self.document, self.new_node(Doctype(name, public_id, system_id))); } diff --git a/dom_sink/src/rcdom.rs b/dom_sink/src/rcdom.rs index a09b2a88..f5500d60 100644 --- a/dom_sink/src/rcdom.rs +++ b/dom_sink/src/rcdom.rs @@ -30,6 +30,7 @@ use std::io::{self, Write}; use std::ops::{Deref, DerefMut}; use string_cache::QualName; +use tendril::StrTendril; /// A DOM node. pub struct Node { @@ -99,7 +100,7 @@ fn get_parent_and_index(target: &Handle) -> Option<(Handle, usize)> { fn append_to_existing_text(prev: &Handle, text: &str) -> bool { match prev.borrow_mut().deref_mut().node { Text(ref mut existing) => { - existing.push_str(text); + existing.push_slice(text); true } _ => false, @@ -159,7 +160,7 @@ impl TreeSink for RcDom { new_node(Element(name, attrs)) } - fn create_comment(&mut self, text: String) -> Handle { + fn create_comment(&mut self, text: StrTendril) -> Handle { new_node(Comment(text)) } @@ -214,7 +215,10 @@ impl TreeSink for RcDom { Ok(()) } - fn append_doctype_to_document(&mut self, name: String, public_id: String, system_id: String) { + fn append_doctype_to_document(&mut self, + name: StrTendril, + public_id: StrTendril, + system_id: StrTendril) { append(&self.document, new_node(Doctype(name, public_id, system_id))); } diff --git a/examples/html2html.rs b/examples/html2html.rs index 2620ec46..69c18f76 100644 --- a/examples/html2html.rs +++ b/examples/html2html.rs @@ -15,20 +15,25 @@ //! //! where htmlparser-1.4.jar comes from http://about.validator.nu/htmlparser/ +extern crate tendril; extern crate html5ever; extern crate html5ever_dom_sink; -use std::io::{self, Read, Write}; +use std::io::{self, Write}; use std::default::Default; +use tendril::{ByteTendril, ReadExt}; + +use html5ever::sink::rcdom::RcDom; use html5ever::driver::ParseOpts; use html5ever_dom_sink::rcdom::RcDom; use html5ever::tree_builder::TreeBuilderOpts; use html5ever::{parse, one_input, serialize}; fn main() { - let mut input = String::new(); - io::stdin().read_to_string(&mut input).unwrap(); + let mut input = ByteTendril::new(); + io::stdin().read_to_tendril(&mut input).unwrap(); + let input = input.try_reinterpret().unwrap(); let dom: RcDom = parse(one_input(input), ParseOpts { tree_builder: TreeBuilderOpts { drop_doctype: true, diff --git a/examples/noop-tokenize.rs b/examples/noop-tokenize.rs index 2f77f71e..b6de3753 100644 --- a/examples/noop-tokenize.rs +++ b/examples/noop-tokenize.rs @@ -13,13 +13,15 @@ extern crate test; extern crate html5ever; +extern crate tendril; use std::io; -use std::io::prelude::*; use std::default::Default; use test::black_box; +use tendril::{ByteTendril, ReadExt}; + use html5ever::tokenizer::{TokenSink, Token}; use html5ever::driver::{tokenize_to, one_input}; @@ -34,8 +36,9 @@ impl TokenSink for Sink { } fn main() { - let mut input = String::new(); - io::stdin().read_to_string(&mut input).unwrap(); + let mut input = ByteTendril::new(); + io::stdin().read_to_tendril(&mut input).unwrap(); + let input = input.try_reinterpret().unwrap(); tokenize_to(Sink, one_input(input), Default::default()); } diff --git a/examples/noop-tree-builder.rs b/examples/noop-tree-builder.rs index ce1306b8..998c46b1 100644 --- a/examples/noop-tree-builder.rs +++ b/examples/noop-tree-builder.rs @@ -8,16 +8,17 @@ // except according to those terms. extern crate string_cache; - +extern crate tendril; extern crate html5ever; -use std::io::{self, Read}; +use std::io; use std::default::Default; -use std::string::String; use std::collections::HashMap; use std::borrow::Cow; use string_cache::QualName; +use tendril::{StrTendril, ByteTendril, ReadExt}; + use html5ever::{parse_to, one_input}; use html5ever::tokenizer::Attribute; use html5ever::tree_builder::{TreeSink, QuirksMode, NodeOrText}; @@ -56,7 +57,7 @@ impl TreeSink for Sink { id } - fn create_comment(&mut self, _text: String) -> usize { + fn create_comment(&mut self, _text: StrTendril) -> usize { self.get_id() } @@ -72,7 +73,7 @@ impl TreeSink for Sink { fn set_quirks_mode(&mut self, _mode: QuirksMode) { } fn append(&mut self, _parent: usize, _child: NodeOrText) { } - fn append_doctype_to_document(&mut self, _name: String, _public_id: String, _system_id: String) { } + fn append_doctype_to_document(&mut self, _: StrTendril, _: StrTendril, _: StrTendril) { } fn add_attrs_if_missing(&mut self, _target: usize, _attrs: Vec) { } fn remove_from_parent(&mut self, _target: usize) { } fn reparent_children(&mut self, _node: usize, _new_parent: usize) { } @@ -85,7 +86,8 @@ fn main() { names: HashMap::new(), }; - let mut input = String::new(); - io::stdin().read_to_string(&mut input).unwrap(); + let mut input = ByteTendril::new(); + io::stdin().read_to_tendril(&mut input).unwrap(); + let input = input.try_reinterpret().unwrap(); parse_to(sink, one_input(input), Default::default()); } diff --git a/examples/print-rcdom.rs b/examples/print-rcdom.rs index afd3952b..103759a5 100644 --- a/examples/print-rcdom.rs +++ b/examples/print-rcdom.rs @@ -15,12 +15,14 @@ extern crate html5ever_dom_sink; #[macro_use] extern crate string_cache; +extern crate tendril; use std::io::{self, Read}; use std::iter::repeat; use std::default::Default; use std::string::String; +use tendril::{ByteTendril, ReadExt}; use html5ever::{parse, one_input}; use html5ever_dom_sink::common::{Document, Doctype, Text, Comment, Element}; use html5ever_dom_sink::rcdom::{RcDom, Handle}; @@ -61,8 +63,9 @@ fn walk(indent: usize, handle: Handle) { } fn main() { - let mut input = String::new(); - io::stdin().read_to_string(&mut input).unwrap(); + let mut input = ByteTendril::new(); + io::stdin().read_to_tendril(&mut input).unwrap(); + let input = input.try_reinterpret().unwrap(); let dom: RcDom = parse(one_input(input), Default::default()); walk(0, dom.document); diff --git a/examples/print-tree-actions.rs b/examples/print-tree-actions.rs index 0da21099..82ac1487 100644 --- a/examples/print-tree-actions.rs +++ b/examples/print-tree-actions.rs @@ -10,16 +10,17 @@ #![feature(collections)] extern crate string_cache; - +extern crate tendril; extern crate html5ever; -use std::io::{self, Read}; +use std::io; use std::default::Default; -use std::string::String; use std::collections::HashMap; use std::borrow::Cow; use string_cache::QualName; +use tendril::{ByteTendril, StrTendril, ReadExt}; + use html5ever::{parse_to, one_input}; use html5ever::tokenizer::Attribute; use html5ever::tree_builder::{TreeSink, QuirksMode, NodeOrText, AppendNode, AppendText}; @@ -67,7 +68,7 @@ impl TreeSink for Sink { id } - fn create_comment(&mut self, text: String) -> usize { + fn create_comment(&mut self, text: StrTendril) -> usize { let id = self.get_id(); println!("Created comment \"{}\" as {}", text.escape_default(), id); id @@ -97,7 +98,10 @@ impl TreeSink for Sink { Ok(()) } - fn append_doctype_to_document(&mut self, name: String, public_id: String, system_id: String) { + fn append_doctype_to_document(&mut self, + name: StrTendril, + public_id: StrTendril, + system_id: StrTendril) { println!("Append doctype: {} {} {}", name, public_id, system_id); } @@ -127,7 +131,8 @@ fn main() { names: HashMap::new(), }; - let mut input = String::new(); - io::stdin().read_to_string(&mut input).unwrap(); + let mut input = ByteTendril::new(); + io::stdin().read_to_tendril(&mut input).unwrap(); + let input = input.try_reinterpret().unwrap(); parse_to(sink, one_input(input), Default::default()); } diff --git a/examples/tokenize.rs b/examples/tokenize.rs index 636a1905..c6aade45 100644 --- a/examples/tokenize.rs +++ b/examples/tokenize.rs @@ -9,11 +9,14 @@ #![feature(collections)] +extern crate tendril; extern crate html5ever; use std::io::{self, Read}; use std::default::Default; +use tendril::{ByteTendril, ReadExt}; + use html5ever::tokenizer::{TokenSink, Token, TokenizerOpts, ParseError}; use html5ever::tokenizer::{CharacterTokens, NullCharacterToken, TagToken, StartTag, EndTag}; use html5ever::driver::{tokenize_to, one_input}; @@ -80,8 +83,9 @@ fn main() { let mut sink = TokenPrinter { in_char_run: false, }; - let mut input = String::new(); - io::stdin().read_to_string(&mut input).unwrap(); + let mut input = ByteTendril::new(); + io::stdin().read_to_tendril(&mut input).unwrap(); + let input = input.try_reinterpret().unwrap(); tokenize_to(sink, one_input(input), TokenizerOpts { profile: true, .. Default::default() diff --git a/src/driver.rs b/src/driver.rs index 6fb359de..a488d59c 100644 --- a/src/driver.rs +++ b/src/driver.rs @@ -16,9 +16,10 @@ use std::option; use std::default::Default; use string_cache::{Atom, QualName}; +use tendril::StrTendril; -/// Convenience function to turn a single `String` into an iterator. -pub fn one_input(x: String) -> option::IntoIter { +/// Convenience function to turn a single value into an iterator. +pub fn one_input(x: T) -> option::IntoIter { Some(x).into_iter() } @@ -30,14 +31,10 @@ pub fn one_input(x: String) -> option::IntoIter { /// let mut sink = MySink; /// tokenize_to(&mut sink, one_input(my_str), Default::default()); /// ``` -pub fn tokenize_to< - Sink: TokenSink, - It: Iterator - >( - sink: Sink, - input: It, - opts: TokenizerOpts) -> Sink { - +pub fn tokenize_to(sink: Sink, input: It, opts: TokenizerOpts) -> Sink + where Sink: TokenSink, + It: Iterator, +{ let mut tok = Tokenizer::new(sink, opts); for s in input { tok.feed(s); @@ -64,14 +61,10 @@ pub struct ParseOpts { /// let mut sink = MySink; /// parse_to(&mut sink, one_input(my_str), Default::default()); /// ``` -pub fn parse_to< - Sink: TreeSink, - It: Iterator - >( - sink: Sink, - input: It, - opts: ParseOpts) -> Sink { - +pub fn parse_to(sink: Sink, input: It, opts: ParseOpts) -> Sink + where Sink: TreeSink, + It: Iterator, +{ let tb = TreeBuilder::new(sink, opts.tree_builder); let mut tok = Tokenizer::new(tb, opts.tokenizer); for s in input { @@ -89,15 +82,13 @@ pub fn parse_to< /// let mut sink = MySink; /// parse_fragment_to(&mut sink, one_input(my_str), context_token, Default::default()); /// ``` -pub fn parse_fragment_to< - Sink: TreeSink, - It: Iterator - >( - sink: Sink, - input: It, - context: Atom, - opts: ParseOpts) -> Sink { - +pub fn parse_fragment_to(sink: Sink, + input: It, + context: Atom, + opts: ParseOpts) -> Sink + where Sink: TreeSink, + It: Iterator +{ let mut sink = sink; let context_elem = sink.create_element(QualName::new(ns!(HTML), context), vec!()); let tb = TreeBuilder::new_for_fragment(sink, context_elem, None, opts.tree_builder); @@ -131,7 +122,7 @@ pub trait ParseResult { /// ``` pub fn parse(input: It, opts: ParseOpts) -> Output where Output: ParseResult, - It: Iterator, + It: Iterator, { let sink = parse_to(Default::default(), input, opts); ParseResult::get_result(sink) @@ -146,7 +137,7 @@ pub fn parse(input: It, opts: ParseOpts) -> Output /// ``` pub fn parse_fragment(input: It, context: Atom, opts: ParseOpts) -> Output where Output: ParseResult, - It: Iterator, + It: Iterator, { let sink = parse_fragment_to(Default::default(), input, context, opts); ParseResult::get_result(sink) diff --git a/src/lib.rs b/src/lib.rs index a7ac0f21..50b70db1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,6 +24,8 @@ extern crate log; #[macro_use] extern crate string_cache; +extern crate tendril; + #[macro_use] extern crate mac; diff --git a/src/tokenizer/buffer_queue.rs b/src/tokenizer/buffer_queue.rs index 791e99ae..c10463b1 100644 --- a/src/tokenizer/buffer_queue.rs +++ b/src/tokenizer/buffer_queue.rs @@ -13,77 +13,61 @@ use util::smallcharset::SmallCharSet; use std::str::CharRange; use std::collections::VecDeque; -pub use self::SetResult::{FromSet, NotFromSet}; +use tendril::StrTendril; -struct Buffer { - /// Byte position within the buffer. - pub pos: usize, - /// The buffer. - pub buf: String, -} +pub use self::SetResult::{FromSet, NotFromSet}; /// Result from `pop_except_from`. #[derive(PartialEq, Eq, Debug)] pub enum SetResult { FromSet(char), - NotFromSet(String), + NotFromSet(StrTendril), } /// A queue of owned string buffers, which supports incrementally /// consuming characters. pub struct BufferQueue { /// Buffers to process. - buffers: VecDeque, + buffers: VecDeque, } impl BufferQueue { /// Create an empty BufferQueue. pub fn new() -> BufferQueue { BufferQueue { - buffers: VecDeque::with_capacity(3), + buffers: VecDeque::with_capacity(16), } } /// Add a buffer to the beginning of the queue. - pub fn push_front(&mut self, buf: String) { - if buf.len() == 0 { + pub fn push_front(&mut self, buf: StrTendril) { + if buf.len32() == 0 { return; } - self.buffers.push_front(Buffer { - pos: 0, - buf: buf, - }); + self.buffers.push_front(buf); } /// Add a buffer to the end of the queue. - /// 'pos' can be non-zero to remove that many bytes - /// from the beginning. - pub fn push_back(&mut self, buf: String, pos: usize) { - if pos >= buf.len() { + pub fn push_back(&mut self, buf: StrTendril) { + if buf.len32() == 0 { return; } - self.buffers.push_back(Buffer { - pos: pos, - buf: buf, - }); + self.buffers.push_back(buf); } /// Look at the next available character, if any. pub fn peek(&mut self) -> Option { - match self.buffers.front() { - Some(&Buffer { pos, ref buf }) => Some(buf.char_at(pos)), - None => None, - } + // Invariant: all buffers in the queue are non-empty. + self.buffers.front().map(|b| b.char_at(0)) } /// Get the next character, if one is available. pub fn next(&mut self) -> Option { let (result, now_empty) = match self.buffers.front_mut() { None => (None, false), - Some(&mut Buffer { ref mut pos, ref buf }) => { - let CharRange { ch, next } = buf.char_range_at(*pos); - *pos = next; - (Some(ch), next >= buf.len()) + Some(buf) => { + let c = buf.pop_front_char().expect("empty buffer in queue"); + (Some(c), buf.is_empty()) } }; @@ -95,25 +79,26 @@ impl BufferQueue { } /// Pops and returns either a single character from the given set, or - /// a `String` of characters none of which are in the set. The set + /// a `StrTendril` of characters none of which are in the set. The set /// is represented as a bitmask and so can only contain the first 64 /// ASCII characters. pub fn pop_except_from(&mut self, set: SmallCharSet) -> Option { let (result, now_empty) = match self.buffers.front_mut() { - Some(&mut Buffer { ref mut pos, ref buf }) => { - let n = set.nonmember_prefix_len(&buf[*pos..]); + None => (None, false), + Some(buf) => { + let n = set.nonmember_prefix_len(&buf); if n > 0 { - let new_pos = *pos + n; - let out = String::from_str(&buf[*pos..new_pos]); - *pos = new_pos; - (Some(NotFromSet(out)), new_pos >= buf.len()) + let out; + unsafe { + out = buf.unsafe_subtendril(0, n); + buf.unsafe_pop_front(n); + } + (Some(NotFromSet(out)), buf.is_empty()) } else { - let CharRange { ch, next } = buf.char_range_at(*pos); - *pos = next; - (Some(FromSet(ch)), next >= buf.len()) + let c = buf.pop_front_char().expect("empty buffer in queue"); + (Some(FromSet(c)), buf.is_empty()) } } - _ => (None, false), }; // Unborrow self for this part. @@ -131,11 +116,11 @@ impl BufferQueue { // If they do not match, return Some(false). // If not enough characters are available to know, return None. pub fn eat(&mut self, pat: &str) -> Option { - let mut buffers_exhausted = 0usize; - let mut consumed_from_last = match self.buffers.front() { - None => return None, - Some(ref buf) => buf.pos, - }; + let mut buffers_exhausted = 0; + let mut consumed_from_last = 0; + if self.buffers.front().is_none() { + return None; + } for c in pat.chars() { if buffers_exhausted >= self.buffers.len() { @@ -143,7 +128,7 @@ impl BufferQueue { } let ref buf = self.buffers[buffers_exhausted]; - let d = buf.buf.char_at(consumed_from_last); + let d = buf.char_at(consumed_from_last); match (c.to_ascii_opt(), d.to_ascii_opt()) { (Some(c), Some(d)) => if c.eq_ignore_case(d) { () } else { return Some(false) }, _ => return Some(false), @@ -151,7 +136,7 @@ impl BufferQueue { // d was an ASCII character; size must be 1 byte consumed_from_last += 1; - if consumed_from_last >= buf.buf.len() { + if consumed_from_last >= buf.len() { buffers_exhausted += 1; consumed_from_last = 0; } @@ -164,7 +149,7 @@ impl BufferQueue { match self.buffers.front_mut() { None => assert_eq!(consumed_from_last, 0), - Some(ref mut buf) => buf.pos = consumed_from_last, + Some(ref mut buf) => buf.pop_front(consumed_from_last as u32), } Some(true) @@ -174,6 +159,7 @@ impl BufferQueue { #[cfg(test)] #[allow(non_snake_case)] mod test { + use tendril::{StrTendril, SliceExt}; use super::{BufferQueue, FromSet, NotFromSet}; #[test] @@ -182,7 +168,7 @@ mod test { assert_eq!(bq.peek(), None); assert_eq!(bq.next(), None); - bq.push_back(String::from_str("abc"), 0); + bq.push_back("abc".to_tendril()); assert_eq!(bq.peek(), Some('a')); assert_eq!(bq.next(), Some('a')); assert_eq!(bq.peek(), Some('b')); @@ -197,10 +183,10 @@ mod test { #[test] fn can_unconsume() { let mut bq = BufferQueue::new(); - bq.push_back(String::from_str("abc"), 0); + bq.push_back("abc".to_tendril()); assert_eq!(bq.next(), Some('a')); - bq.push_front(String::from_str("xy")); + bq.push_front("xy".to_tendril()); assert_eq!(bq.next(), Some('x')); assert_eq!(bq.next(), Some('y')); assert_eq!(bq.next(), Some('b')); @@ -211,31 +197,22 @@ mod test { #[test] fn can_pop_except_set() { let mut bq = BufferQueue::new(); - bq.push_back(String::from_str("abc&def"), 0); + bq.push_back("abc&def".to_tendril()); let mut pop = || bq.pop_except_from(small_char_set!('&')); - assert_eq!(pop(), Some(NotFromSet(String::from_str("abc")))); + assert_eq!(pop(), Some(NotFromSet("abc".to_tendril()))); assert_eq!(pop(), Some(FromSet('&'))); - assert_eq!(pop(), Some(NotFromSet(String::from_str("def")))); + assert_eq!(pop(), Some(NotFromSet("def".to_tendril()))); assert_eq!(pop(), None); } - #[test] - fn can_push_truncated() { - let mut bq = BufferQueue::new(); - bq.push_back(String::from_str("abc"), 1); - assert_eq!(bq.next(), Some('b')); - assert_eq!(bq.next(), Some('c')); - assert_eq!(bq.next(), None); - } - #[test] fn can_eat() { // This is not very comprehensive. We rely on the tokenizer // integration tests for more thorough testing with many // different input buffer splits. let mut bq = BufferQueue::new(); - bq.push_back(String::from_str("a"), 0); - bq.push_back(String::from_str("bc"), 0); + bq.push_back("a".to_tendril()); + bq.push_back("bc".to_tendril()); assert_eq!(bq.eat("abcd"), None); assert_eq!(bq.eat("ax"), Some(false)); assert_eq!(bq.eat("ab"), Some(true)); diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs index 5141de4f..db1fc5f3 100644 --- a/src/tokenizer/char_ref/mod.rs +++ b/src/tokenizer/char_ref/mod.rs @@ -9,7 +9,9 @@ use super::{Tokenizer, TokenSink}; -use util::str::{is_ascii_alnum, empty_str}; +use util::str::{is_ascii_alnum}; + +use tendril::StrTendril; use std::char::from_u32; use std::borrow::Cow::Borrowed; @@ -54,7 +56,7 @@ pub struct CharRefTokenizer { seen_digit: bool, hex_marker: Option, - name_buf_opt: Option, + name_buf_opt: Option, name_match: Option<&'static [u32; 2]>, name_len: usize, } @@ -83,12 +85,12 @@ impl CharRefTokenizer { self.result.expect("get_result called before done") } - fn name_buf<'t>(&'t self) -> &'t String { + fn name_buf<'t>(&'t self) -> &'t StrTendril { self.name_buf_opt.as_ref() .expect("name_buf missing in named character reference") } - fn name_buf_mut<'t>(&'t mut self) -> &'t mut String { + fn name_buf_mut<'t>(&'t mut self) -> &'t mut StrTendril { self.name_buf_opt.as_mut() .expect("name_buf missing in named character reference") } @@ -142,7 +144,7 @@ impl CharRefTokenizer { _ => { self.state = Named; - self.name_buf_opt = Some(empty_str()); + self.name_buf_opt = Some(StrTendril::new()); Progress } } @@ -199,9 +201,9 @@ impl CharRefTokenizer { } fn unconsume_numeric(&mut self, tokenizer: &mut Tokenizer) -> Status { - let mut unconsume = String::from_str("#"); + let mut unconsume = StrTendril::from_char('#'); match self.hex_marker { - Some(c) => unconsume.push(c), + Some(c) => unconsume.push_char(c), None => (), } @@ -245,7 +247,7 @@ impl CharRefTokenizer { fn do_named(&mut self, tokenizer: &mut Tokenizer) -> Status { let c = unwrap_or_return!(tokenizer.get_char(), Stuck); - self.name_buf_mut().push(c); + self.name_buf_mut().push_char(c); match data::NAMED_ENTITIES.get(&self.name_buf()[..]) { // We have either a full match or a prefix of one. Some(m) => { @@ -345,7 +347,7 @@ impl CharRefTokenizer { self.unconsume_name(tokenizer); self.finish_none() } else { - tokenizer.unconsume(String::from_str(&self.name_buf()[name_len..])); + tokenizer.unconsume(StrTendril::from_slice(&self.name_buf()[name_len..])); self.result = Some(CharRef { chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()], num_chars: if c2 == 0 { 1 } else { 2 }, @@ -358,7 +360,7 @@ impl CharRefTokenizer { fn do_bogus_name(&mut self, tokenizer: &mut Tokenizer) -> Status { let c = unwrap_or_return!(tokenizer.get_char(), Stuck); - self.name_buf_mut().push(c); + self.name_buf_mut().push_char(c); match c { _ if is_ascii_alnum(c) => return Progress, ';' => self.emit_name_error(tokenizer), @@ -389,7 +391,7 @@ impl CharRefTokenizer { } Octothorpe => { - tokenizer.unconsume(String::from_str("#")); + tokenizer.unconsume(StrTendril::from_slice("#")); tokenizer.emit_error(Borrowed("EOF after '#' in character reference")); self.finish_none(); } diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs index 28123a67..8264ab4e 100644 --- a/src/tokenizer/interface.rs +++ b/src/tokenizer/interface.rs @@ -12,6 +12,7 @@ use tokenizer::states; use std::borrow::Cow; use string_cache::{Atom, QualName}; +use tendril::StrTendril; pub use self::TagKind::{StartTag, EndTag}; pub use self::Token::{DoctypeToken, TagToken, CommentToken, CharacterTokens}; @@ -21,9 +22,9 @@ pub use self::Token::{NullCharacterToken, EOFToken, ParseError}; // FIXME: already exists in Servo DOM #[derive(PartialEq, Eq, Clone, Debug)] pub struct Doctype { - pub name: Option, - pub public_id: Option, - pub system_id: Option, + pub name: Option, + pub public_id: Option, + pub system_id: Option, pub force_quirks: bool, } @@ -47,7 +48,7 @@ impl Doctype { #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug)] pub struct Attribute { pub name: QualName, - pub value: String, + pub value: StrTendril, } #[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)] @@ -86,8 +87,8 @@ impl Tag { pub enum Token { DoctypeToken(Doctype), TagToken(Tag), - CommentToken(String), - CharacterTokens(String), + CommentToken(StrTendril), + CharacterTokens(StrTendril), NullCharacterToken, EOFToken, ParseError(Cow<'static, str>), diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 4e70369b..5ec0526e 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -26,7 +26,7 @@ use self::char_ref::{CharRef, CharRefTokenizer}; use self::buffer_queue::{BufferQueue, SetResult, FromSet, NotFromSet}; -use util::str::{lower_ascii, lower_ascii_letter, empty_str}; +use util::str::{lower_ascii, lower_ascii_letter}; use util::smallcharset::SmallCharSet; use std::mem::replace; @@ -35,24 +35,17 @@ use std::borrow::Cow::{self, Borrowed}; use std::collections::BTreeMap; use string_cache::{Atom, QualName}; +use tendril::StrTendril; pub mod states; mod interface; mod char_ref; mod buffer_queue; -fn option_push(opt_str: &mut Option, c: char) { +fn option_push(opt_str: &mut Option, c: char) { match *opt_str { - Some(ref mut s) => s.push(c), - None => *opt_str = Some(c.to_string()), - } -} - -fn append_strings(lhs: &mut String, rhs: String) { - if lhs.is_empty() { - *lhs = rhs; - } else { - lhs.push_str(&rhs); + Some(ref mut s) => s.push_char(c), + None => *opt_str = Some(StrTendril::from_char(c)), } } @@ -77,6 +70,9 @@ pub struct TokenizerOpts { /// Last start tag. Only the test runner should use a /// non-`None` value! + /// + /// FIXME: Can't use Tendril because we want TokenizerOpts + /// to be Send. pub last_start_tag_name: Option, } @@ -132,7 +128,7 @@ pub struct Tokenizer { current_tag_kind: TagKind, /// Current tag name. - current_tag_name: String, + current_tag_name: StrTendril, /// Current tag is self-closing? current_tag_self_closing: bool, @@ -141,13 +137,13 @@ pub struct Tokenizer { current_tag_attrs: Vec, /// Current attribute name. - current_attr_name: String, + current_attr_name: StrTendril, /// Current attribute value. - current_attr_value: String, + current_attr_value: StrTendril, /// Current comment. - current_comment: String, + current_comment: StrTendril, /// Current doctype token. current_doctype: Doctype, @@ -156,7 +152,7 @@ pub struct Tokenizer { last_start_tag_name: Option, /// The "temporary buffer" mentioned in the spec. - temp_buf: String, + temp_buf: StrTendril, /// Record of how many ns we spent in each state, if profiling is enabled. state_profile: BTreeMap, @@ -184,15 +180,15 @@ impl Tokenizer { ignore_lf: false, discard_bom: discard_bom, current_tag_kind: StartTag, - current_tag_name: empty_str(), + current_tag_name: StrTendril::new(), current_tag_self_closing: false, current_tag_attrs: vec!(), - current_attr_name: empty_str(), - current_attr_value: empty_str(), - current_comment: empty_str(), + current_attr_name: StrTendril::new(), + current_attr_value: StrTendril::new(), + current_comment: StrTendril::new(), current_doctype: Doctype::new(), last_start_tag_name: start_tag_name, - temp_buf: empty_str(), + temp_buf: StrTendril::new(), state_profile: BTreeMap::new(), time_in_sink: 0, } @@ -211,19 +207,16 @@ impl Tokenizer { } /// Feed an input string into the tokenizer. - pub fn feed(&mut self, input: String) { - if input.len() == 0 { + pub fn feed(&mut self, mut input: StrTendril) { + if input.is_empty() { return; } - let pos = if self.discard_bom && input.char_at(0) == '\u{feff}' { - self.discard_bom = false; - 3 // length of BOM in UTF-8 - } else { - 0 + if self.discard_bom && input.char_at(0) == '\u{feff}' { + input.pop_front(3); }; - self.input_buffers.push_back(input, pos); + self.input_buffers.push_back(input); self.run(); } @@ -357,20 +350,20 @@ impl Tokenizer { fn emit_char(&mut self, c: char) { self.process_token(match c { '\0' => NullCharacterToken, - _ => CharacterTokens(c.to_string()), + _ => CharacterTokens(StrTendril::from_char(c)), }); } // The string must not contain '\0'! - fn emit_chars(&mut self, b: String) { + fn emit_chars(&mut self, b: StrTendril) { self.process_token(CharacterTokens(b)); } fn emit_current_tag(&mut self) { self.finish_attribute(); - let name = replace(&mut self.current_tag_name, String::new()); - let name = Atom::from_slice(&name); + let name = Atom::from_slice(&self.current_tag_name); + self.current_tag_name.clear(); match self.current_tag_kind { StartTag => { @@ -401,29 +394,29 @@ impl Tokenizer { fn emit_temp_buf(&mut self) { // FIXME: Make sure that clearing on emit is spec-compatible. - let buf = replace(&mut self.temp_buf, empty_str()); + let buf = replace(&mut self.temp_buf, StrTendril::new()); self.emit_chars(buf); } fn clear_temp_buf(&mut self) { // Do this without a new allocation. - self.temp_buf.truncate(0); + self.temp_buf.clear(); } fn emit_current_comment(&mut self) { - let comment = replace(&mut self.current_comment, empty_str()); + let comment = replace(&mut self.current_comment, StrTendril::new()); self.process_token(CommentToken(comment)); } fn discard_tag(&mut self) { - self.current_tag_name = String::new(); + self.current_tag_name.clear(); self.current_tag_self_closing = false; self.current_tag_attrs = vec!(); } fn create_tag(&mut self, kind: TagKind, c: char) { self.discard_tag(); - self.current_tag_name.push(c); + self.current_tag_name.push_char(c); self.current_tag_kind = kind; } @@ -439,7 +432,7 @@ impl Tokenizer { fn create_attribute(&mut self, c: char) { self.finish_attribute(); - self.current_attr_name.push(c); + self.current_attr_name.push_char(c); } fn finish_attribute(&mut self) { @@ -451,21 +444,22 @@ impl Tokenizer { // FIXME: the spec says we should error as soon as the name is finished. // FIXME: linear time search, do we care? let dup = { - let name = &self.current_attr_name[..]; + let name = &*self.current_attr_name; self.current_tag_attrs.iter().any(|a| &*a.name.local == name) }; if dup { self.emit_error(Borrowed("Duplicate attribute")); - self.current_attr_name.truncate(0); - self.current_attr_value.truncate(0); + self.current_attr_name.clear(); + self.current_attr_value.clear(); } else { - let name = replace(&mut self.current_attr_name, String::new()); + let name = Atom::from_slice(&self.current_attr_name); + self.current_attr_name.clear(); self.current_tag_attrs.push(Attribute { // The tree builder will adjust the namespace if necessary. // This only happens in foreign elements. - name: QualName::new(ns!(""), Atom::from_slice(&name)), - value: replace(&mut self.current_attr_value, empty_str()), + name: QualName::new(ns!(""), name), + value: replace(&mut self.current_attr_value, StrTendril::new()), }); } } @@ -475,7 +469,7 @@ impl Tokenizer { self.process_token(DoctypeToken(doctype)); } - fn doctype_id<'a>(&'a mut self, kind: DoctypeIdKind) -> &'a mut Option { + fn doctype_id<'a>(&'a mut self, kind: DoctypeIdKind) -> &'a mut Option { match kind { Public => &mut self.current_doctype.public_id, System => &mut self.current_doctype.system_id, @@ -485,8 +479,8 @@ impl Tokenizer { fn clear_doctype_id(&mut self, kind: DoctypeIdKind) { let id = self.doctype_id(kind); match *id { - Some(ref mut s) => s.truncate(0), - None => *id = Some(empty_str()), + Some(ref mut s) => s.clear(), + None => *id = Some(StrTendril::new()), } } @@ -513,7 +507,7 @@ impl Tokenizer { assert!(c.is_some()); } - fn unconsume(&mut self, buf: String) { + fn unconsume(&mut self, buf: StrTendril) { self.input_buffers.push_front(buf); } @@ -527,20 +521,20 @@ impl Tokenizer { macro_rules! shorthand ( ( $me:ident : emit $c:expr ) => ( $me.emit_char($c); ); ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c); ); - ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.push($c); ); + ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.push_char($c); ); ( $me:ident : discard_tag ) => ( $me.discard_tag(); ); ( $me:ident : discard_char ) => ( $me.discard_char(); ); - ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.push($c); ); + ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.push_char($c); ); ( $me:ident : emit_temp ) => ( $me.emit_temp_buf(); ); ( $me:ident : clear_temp ) => ( $me.clear_temp_buf(); ); ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c); ); - ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.push($c); ); - ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.push($c); ); - ( $me:ident : append_value $c:expr ) => ( append_strings(&mut $me.current_attr_value, $c); ); - ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.push($c); ); - ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.push_str($c); ); + ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.push_char($c); ); + ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.push_char($c); ); + ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.push_tendril($c); ); + ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.push_char($c); ); + ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.push_slice($c); ); ( $me:ident : emit_comment ) => ( $me.emit_current_comment(); ); - ( $me:ident : clear_comment ) => ( $me.current_comment.truncate(0); ); + ( $me:ident : clear_comment ) => ( $me.current_comment.clear(); ); ( $me:ident : create_doctype ) => ( $me.current_doctype = Doctype::new(); ); ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.name, $c); ); ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push($me.doctype_id($k), $c); ); @@ -805,7 +799,7 @@ impl Tokenizer { let c = get_char!(self); match c { '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => { - let esc = if self.temp_buf == "script" { DoubleEscaped } else { Escaped }; + let esc = if &*self.temp_buf == "script" { DoubleEscaped } else { Escaped }; go!(self: emit c; to RawData ScriptDataEscaped esc); } _ => match lower_ascii_letter(c) { @@ -855,7 +849,7 @@ impl Tokenizer { let c = get_char!(self); match c { '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => { - let esc = if self.temp_buf == "script" { Escaped } else { DoubleEscaped }; + let esc = if &*self.temp_buf == "script" { Escaped } else { DoubleEscaped }; go!(self: emit c; to RawData ScriptDataEscaped esc); } _ => match lower_ascii_letter(c) { @@ -936,7 +930,7 @@ impl Tokenizer { FromSet('&') => go!(self: consume_char_ref '"'), FromSet('\0') => go!(self: error; push_value '\u{fffd}'), FromSet(c) => go!(self: push_value c), - NotFromSet(b) => go!(self: append_value b), + NotFromSet(ref b) => go!(self: append_value b), } }, @@ -947,7 +941,7 @@ impl Tokenizer { FromSet('&') => go!(self: consume_char_ref '\''), FromSet('\0') => go!(self: error; push_value '\u{fffd}'), FromSet(c) => go!(self: push_value c), - NotFromSet(b) => go!(self: append_value b), + NotFromSet(ref b) => go!(self: append_value b), } }, @@ -964,7 +958,7 @@ impl Tokenizer { '"' , '\'' , '<' , '=' , '`' => error); go!(self: push_value c); } - NotFromSet(b) => go!(self: append_value b), + NotFromSet(ref b) => go!(self: append_value b), } }, @@ -1324,46 +1318,27 @@ impl Tokenizer { #[cfg(test)] #[allow(non_snake_case)] mod test { - use super::{option_push, append_strings}; // private items + use super::option_push; // private items + use tendril::{StrTendril, SliceExt}; #[test] fn push_to_None_gives_singleton() { - let mut s: Option = None; + let mut s: Option = None; option_push(&mut s, 'x'); - assert_eq!(s, Some(String::from_str("x"))); + assert_eq!(s, Some("x".to_tendril())); } #[test] fn push_to_empty_appends() { - let mut s: Option = Some(String::new()); + let mut s: Option = Some(StrTendril::new()); option_push(&mut s, 'x'); - assert_eq!(s, Some(String::from_str("x"))); + assert_eq!(s, Some("x".to_tendril())); } #[test] fn push_to_nonempty_appends() { - let mut s: Option = Some(String::from_str("y")); + let mut s: Option = Some(StrTendril::from_slice("y")); option_push(&mut s, 'x'); - assert_eq!(s, Some(String::from_str("yx"))); - } - - #[test] - fn append_appends() { - let mut s = String::from_str("foo"); - append_strings(&mut s, String::from_str("bar")); - assert_eq!(s, String::from_str("foobar")); - } - - #[test] - fn append_to_empty_does_not_copy() { - let mut lhs: String = String::from_str(""); - let rhs: Vec = vec![b'f', b'o', b'o']; - let ptr_old = rhs[0] as *const u8; - - append_strings(&mut lhs, String::from_utf8(rhs).unwrap()); - assert_eq!(lhs, String::from_str("foo")); - - let ptr_new = lhs.into_bytes()[0] as *const u8; - assert_eq!(ptr_old, ptr_new); + assert_eq!(s, Some("yx".to_tendril())); } } diff --git a/src/tree_builder/actions.rs b/src/tree_builder/actions.rs index 25bc99ca..8189b898 100644 --- a/src/tree_builder/actions.rs +++ b/src/tree_builder/actions.rs @@ -28,6 +28,7 @@ use std::iter::{Rev, Enumerate}; use std::borrow::Cow::Borrowed; use string_cache::{Atom, Namespace, QualName}; +use tendril::StrTendril; pub use self::PushFlag::*; @@ -61,10 +62,10 @@ pub trait TreeBuilderActions { fn assert_named(&mut self, node: Handle, name: Atom); fn clear_active_formatting_to_marker(&mut self); fn create_formatting_element_for(&mut self, tag: Tag) -> Handle; - fn append_text(&mut self, text: String) -> ProcessResult; - fn append_comment(&mut self, text: String) -> ProcessResult; - fn append_comment_to_doc(&mut self, text: String) -> ProcessResult; - fn append_comment_to_html(&mut self, text: String) -> ProcessResult; + fn append_text(&mut self, text: StrTendril) -> ProcessResult; + fn append_comment(&mut self, text: StrTendril) -> ProcessResult; + fn append_comment_to_doc(&mut self, text: StrTendril) -> ProcessResult; + fn append_comment_to_html(&mut self, text: StrTendril) -> ProcessResult; fn insert_appropriately(&mut self, child: NodeOrText, override_target: Option); fn insert_phantom(&mut self, name: Atom) -> Handle; fn insert_and_pop_element_for(&mut self, tag: Tag) -> Handle; @@ -722,25 +723,25 @@ impl TreeBuilderActions self.clear_active_formatting_to_marker(); } - fn append_text(&mut self, text: String) -> ProcessResult { + fn append_text(&mut self, text: StrTendril) -> ProcessResult { self.insert_appropriately(AppendText(text), None); Done } - fn append_comment(&mut self, text: String) -> ProcessResult { + fn append_comment(&mut self, text: StrTendril) -> ProcessResult { let comment = self.sink.create_comment(text); self.insert_appropriately(AppendNode(comment), None); Done } - fn append_comment_to_doc(&mut self, text: String) -> ProcessResult { + fn append_comment_to_doc(&mut self, text: StrTendril) -> ProcessResult { let target = self.doc_handle.clone(); let comment = self.sink.create_comment(text); self.sink.append(target, AppendNode(comment)); Done } - fn append_comment_to_html(&mut self, text: String) -> ProcessResult { + fn append_comment_to_html(&mut self, text: StrTendril) -> ProcessResult { let target = self.html_elem(); let comment = self.sink.create_comment(text); self.sink.append(target, AppendNode(comment)); diff --git a/src/tree_builder/data.rs b/src/tree_builder/data.rs index a38f2f9a..389e6454 100644 --- a/src/tree_builder/data.rs +++ b/src/tree_builder/data.rs @@ -11,6 +11,8 @@ use tokenizer::Doctype; use tree_builder::interface::{QuirksMode, Quirks, LimitedQuirks, NoQuirks}; use util::str::AsciiExt; +use tendril::StrTendril; + // These should all be lowercase, for ASCII-case-insensitive matching. static QUIRKY_PUBLIC_PREFIXES: &'static [&'static str] = &[ "-//advasoft ltd//dtd html 3.0 aswedit + extensions//", @@ -90,17 +92,24 @@ static HTML4_PUBLIC_PREFIXES: &'static [&'static str] = &[ ]; pub fn doctype_error_and_quirks(doctype: &Doctype, iframe_srcdoc: bool) -> (bool, QuirksMode) { - fn opt_as_slice<'t>(x: &'t Option) -> Option<&'t str> { + fn opt_string_as_slice<'t>(x: &'t Option) -> Option<&'t str> { x.as_ref().map(|y| &y[..]) } + fn opt_tendril_as_slice<'t>(x: &'t Option) -> Option<&'t str> { + match *x { + Some(ref t) => Some(t), + None => None, + } + } + fn opt_to_ascii_lower(x: Option<&str>) -> Option { x.map(|y| y.to_ascii_lower()) } - let name = opt_as_slice(&doctype.name); - let public = opt_as_slice(&doctype.public_id); - let system = opt_as_slice(&doctype.system_id); + let name = opt_tendril_as_slice(&doctype.name); + let public = opt_tendril_as_slice(&doctype.public_id); + let system = opt_tendril_as_slice(&doctype.system_id); let err = match (name, public, system) { (Some("html"), None, None) @@ -126,7 +135,7 @@ pub fn doctype_error_and_quirks(doctype: &Doctype, iframe_srcdoc: bool) -> (bool let public = opt_to_ascii_lower(public); let system = opt_to_ascii_lower(system); - let quirk = match (opt_as_slice(&public), opt_as_slice(&system)) { + let quirk = match (opt_string_as_slice(&public), opt_string_as_slice(&system)) { _ if doctype.force_quirks => Quirks, _ if name != Some("html") => Quirks, diff --git a/src/tree_builder/interface.rs b/src/tree_builder/interface.rs index a6086af5..5aee28f8 100644 --- a/src/tree_builder/interface.rs +++ b/src/tree_builder/interface.rs @@ -15,6 +15,7 @@ use tokenizer::Attribute; use std::borrow::Cow; use string_cache::QualName; +use tendril::StrTendril; pub use self::QuirksMode::{Quirks, LimitedQuirks, NoQuirks}; pub use self::NodeOrText::{AppendNode, AppendText}; @@ -33,7 +34,7 @@ pub enum QuirksMode { /// the sink may not want to allocate a `Handle` for each. pub enum NodeOrText { AppendNode(Handle), - AppendText(String), + AppendText(StrTendril), } /// Whether to interrupt further parsing of the current input until @@ -74,7 +75,7 @@ pub trait TreeSink { fn create_element(&mut self, name: QualName, attrs: Vec) -> Self::Handle; /// Create a comment node. - fn create_comment(&mut self, text: String) -> Self::Handle; + fn create_comment(&mut self, text: StrTendril) -> Self::Handle; /// Append a node as the last child of the given node. If this would /// produce adjacent sibling text nodes, it should concatenate the text @@ -97,7 +98,10 @@ pub trait TreeSink { new_node: NodeOrText) -> Result<(), NodeOrText>; /// Append a `DOCTYPE` element to the `Document` node. - fn append_doctype_to_document(&mut self, name: String, public_id: String, system_id: String); + fn append_doctype_to_document(&mut self, + name: StrTendril, + public_id: StrTendril, + system_id: StrTendril); /// Add each attribute to the given element, if no attribute /// with that name already exists. diff --git a/src/tree_builder/mod.rs b/src/tree_builder/mod.rs index 3d9e88b5..f0b75f29 100644 --- a/src/tree_builder/mod.rs +++ b/src/tree_builder/mod.rs @@ -7,6 +7,8 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. +#![allow(warnings)] + //! The HTML5 tree builder. pub use self::interface::{QuirksMode, Quirks, LimitedQuirks, NoQuirks}; @@ -18,13 +20,14 @@ use self::actions::TreeBuilderActions; use self::rules::TreeBuilderStep; use string_cache::QualName; +use tendril::StrTendril; use tokenizer; use tokenizer::{Doctype, Tag}; use tokenizer::TokenSink; use tokenizer::states as tok_state; -use util::str::{is_ascii_whitespace, char_run}; +use util::str::is_ascii_whitespace; use std::default::Default; use std::mem::replace; @@ -100,7 +103,7 @@ pub struct TreeBuilder { template_modes: Vec, /// Pending table character tokens. - pending_table_text: Vec<(SplitStatus, String)>, + pending_table_text: Vec<(SplitStatus, StrTendril)>, /// Quirks mode as set by the parser. /// FIXME: can scripts etc. change this? @@ -343,17 +346,14 @@ impl TreeBuilder ReprocessForeign(t) => { token = t; } - SplitWhitespace(buf) => { - let (len, is_ws) = unwrap_or_return!( - char_run(is_ascii_whitespace, &buf), ()); - - token = CharacterTokens( - if is_ws { Whitespace } else { NotWhitespace }, - String::from_str(&buf[..len])); - - if len < buf.len() { - more_tokens.push_back( - CharacterTokens(NotSplit, String::from_str(&buf[len..]))); + SplitWhitespace(mut buf) => { + let p = buf.pop_front_char_run(is_ascii_whitespace); + let (first, is_ws) = unwrap_or_return!(p, ()); + let status = if is_ws { Whitespace } else { NotWhitespace }; + token = CharacterTokens(status, first); + + if buf.len32() > 0 { + more_tokens.push_back(CharacterTokens(NotSplit, buf)); } } } @@ -392,9 +392,9 @@ impl TokenSink let Doctype { name, public_id, system_id, force_quirks: _ } = dt; if !self.opts.drop_doctype { self.sink.append_doctype_to_document( - name.unwrap_or(String::new()), - public_id.unwrap_or(String::new()), - system_id.unwrap_or(String::new()) + name.unwrap_or(StrTendril::new()), + public_id.unwrap_or(StrTendril::new()), + system_id.unwrap_or(StrTendril::new()) ); } self.set_quirks_mode(quirk); @@ -415,8 +415,8 @@ impl TokenSink tokenizer::EOFToken => EOFToken, tokenizer::CharacterTokens(mut x) => { - if ignore_lf && x.len() >= 1 && x.char_at(0) == '\n' { - x.remove(0); + if !x.is_empty() && ignore_lf && x.char_at(0) == '\n' { + x.pop_front(1); } if x.is_empty() { return; diff --git a/src/tree_builder/rules.rs b/src/tree_builder/rules.rs index ce01bfab..bc16c338 100644 --- a/src/tree_builder/rules.rs +++ b/src/tree_builder/rules.rs @@ -23,7 +23,9 @@ use std::mem::replace; use std::borrow::Cow::Borrowed; use std::borrow::ToOwned; -fn any_not_whitespace(x: &String) -> bool { +use tendril::{StrTendril, SliceExt}; + +fn any_not_whitespace(x: &StrTendril) -> bool { // FIXME: this might be much faster as a byte scan x.chars().any(|c| !is_ascii_whitespace(c)) } @@ -1314,7 +1316,7 @@ impl TreeBuilderStep match_token!(token { NullCharacterToken => { self.unexpected(&token); - self.append_text("\u{fffd}".to_owned()) + self.append_text("\u{fffd}".to_tendril()) } CharacterTokens(_, text) => { diff --git a/src/tree_builder/types.rs b/src/tree_builder/types.rs index 5ed8ab12..86135043 100644 --- a/src/tree_builder/types.rs +++ b/src/tree_builder/types.rs @@ -11,6 +11,8 @@ use tokenizer::Tag; +use tendril::StrTendril; + pub use self::InsertionMode::*; pub use self::SplitStatus::*; pub use self::Token::*; @@ -56,8 +58,8 @@ pub enum SplitStatus { #[derive(PartialEq, Eq, Clone, Debug)] pub enum Token { TagToken(Tag), - CommentToken(String), - CharacterTokens(SplitStatus, String), + CommentToken(StrTendril), + CharacterTokens(SplitStatus, StrTendril), NullCharacterToken, EOFToken, } @@ -65,7 +67,7 @@ pub enum Token { pub enum ProcessResult { Done, DoneAckSelfClosing, - SplitWhitespace(String), + SplitWhitespace(StrTendril), Reprocess(InsertionMode, Token), ReprocessForeign(Token), } diff --git a/src/util/smallcharset.rs b/src/util/smallcharset.rs index 673fd2b3..11369a1b 100644 --- a/src/util/smallcharset.rs +++ b/src/util/smallcharset.rs @@ -22,7 +22,7 @@ impl SmallCharSet { /// Count the number of bytes of characters at the beginning /// of `buf` which are not in the set. /// See `tokenizer::buffer_queue::pop_except_from`. - pub fn nonmember_prefix_len(&self, buf: &str) -> usize { + pub fn nonmember_prefix_len(&self, buf: &str) -> u32 { let mut n = 0; for b in buf.bytes() { if b >= 64 || !self.contains(b) { @@ -48,11 +48,11 @@ mod test { #[test] fn nonmember_prefix() { for &c in ['&', '\0'].iter() { - for x in 0 .. 48usize { - for y in 0 .. 48usize { - let mut s = repeat("x").take(x).collect::(); + for x in 0 .. 48u32 { + for y in 0 .. 48u32 { + let mut s = repeat("x").take(x as usize).collect::(); s.push(c); - s.push_str(&repeat("x").take(y).collect::()); + s.push_str(&repeat("x").take(y as usize).collect::()); let set = small_char_set!('&' '\0'); assert_eq!(x, set.nonmember_prefix_len(&s)); diff --git a/src/util/str.rs b/src/util/str.rs index 70a5c19b..02e2b965 100644 --- a/src/util/str.rs +++ b/src/util/str.rs @@ -61,7 +61,7 @@ pub static ASCII_LOWER_MAP: [u8; 256] = [ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, ]; -#[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)] +#[derive(Copy, Clone, PartialEq, PartialOrd, Ord, Eq, Hash)] pub struct Ascii { chr: u8, } @@ -102,6 +102,7 @@ pub trait AsciiCast { } impl AsciiCast for char { + #[inline] fn to_ascii_opt(&self) -> Option { let n = *self as u32; if n < 0x80 { @@ -166,11 +167,6 @@ pub fn is_ascii_alnum(c: char) -> bool { c.to_ascii_opt().map_or(false, |a| a.is_alphanumeric()) } -/// Allocate an empty string with a small non-zero capacity. -pub fn empty_str() -> String { - String::with_capacity(4) -} - /// ASCII whitespace characters, as defined by /// tree construction modes that treat them specially. pub fn is_ascii_whitespace(c: char) -> bool { @@ -180,29 +176,10 @@ pub fn is_ascii_whitespace(c: char) -> bool { } } -/// Count how many bytes at the beginning of the string -/// either all match or all don't match the predicate, -/// and also return whether they match. -/// -/// Returns `None` on an empty string. -pub fn char_run(mut pred: Pred, buf: &str) -> Option<(usize, bool)> - where Pred: FnMut(char) -> bool, -{ - let (first, rest) = unwrap_or_return!(buf.slice_shift_char(), None); - let matches = pred(first); - - for (idx, ch) in rest.char_indices() { - if matches != pred(ch) { - return Some((idx + first.len_utf8(), matches)); - } - } - Some((buf.len(), matches)) -} - #[cfg(test)] #[allow(non_snake_case)] mod test { - use super::{char_run, is_ascii_whitespace, is_ascii_alnum, lower_ascii, lower_ascii_letter}; + use super::{is_ascii_alnum, lower_ascii, lower_ascii_letter}; test_eq!(lower_letter_a_is_a, lower_ascii_letter('a'), Some('a')); test_eq!(lower_letter_A_is_a, lower_ascii_letter('A'), Some('a')); @@ -219,22 +196,4 @@ mod test { test_eq!(is_alnum_1, is_ascii_alnum('1'), true); test_eq!(is_not_alnum_symbol, is_ascii_alnum('!'), false); test_eq!(is_not_alnum_nonascii, is_ascii_alnum('\u{a66e}'), false); - - macro_rules! test_char_run ( ($name:ident, $input:expr, $expect:expr) => ( - test_eq!($name, char_run(is_ascii_whitespace, $input), $expect); - )); - - test_char_run!(run_empty, "", None); - test_char_run!(run_one_t, " ", Some((1, true))); - test_char_run!(run_one_f, "x", Some((1, false))); - test_char_run!(run_t, " \t \n", Some((6, true))); - test_char_run!(run_f, "xyzzy", Some((5, false))); - test_char_run!(run_tf, " xyzzy", Some((3, true))); - test_char_run!(run_ft, "xyzzy ", Some((5, false))); - test_char_run!(run_tft, " xyzzy ", Some((3, true))); - test_char_run!(run_ftf, "xyzzy hi", Some((5, false))); - test_char_run!(run_multibyte_0, "中 ", Some((3, false))); - test_char_run!(run_multibyte_1, " 中 ", Some((1, true))); - test_char_run!(run_multibyte_2, " 中 ", Some((2, true))); - test_char_run!(run_multibyte_3, " 中 ", Some((3, true))); } diff --git a/tests/serializer.rs b/tests/serializer.rs index 3e7b9ea9..06e36b4e 100644 --- a/tests/serializer.rs +++ b/tests/serializer.rs @@ -11,30 +11,32 @@ #![plugin(string_cache_plugin)] extern crate string_cache; +extern crate tendril; extern crate html5ever; extern crate html5ever_dom_sink; use std::default::Default; -use std::borrow::ToOwned; + +use tendril::{StrTendril, SliceExt}; use html5ever::driver::ParseOpts; use html5ever::{parse_fragment, one_input, serialize}; use html5ever_dom_sink::rcdom::RcDom; -fn parse_and_serialize(input: String) -> String { +fn parse_and_serialize(input: StrTendril) -> StrTendril { let dom: RcDom = parse_fragment(one_input(input), atom!(body), ParseOpts::default()); let inner = &dom.document.borrow().children[0]; let mut result = vec![]; serialize(&mut result, inner, Default::default()).unwrap(); - String::from_utf8(result).unwrap() + StrTendril::try_from_byte_slice(&result).unwrap() } macro_rules! test { ($name:ident, $input:expr, $output:expr) => { #[test] fn $name() { - assert_eq!($output, parse_and_serialize($input.to_owned())); + assert_eq!($output, &*parse_and_serialize($input.to_tendril())); } }; diff --git a/tests/tokenizer.rs b/tests/tokenizer.rs index 8e188f30..6a7c9ef5 100644 --- a/tests/tokenizer.rs +++ b/tests/tokenizer.rs @@ -13,6 +13,7 @@ extern crate test; extern crate rustc_serialize; extern crate string_cache; +extern crate tendril; extern crate html5ever; extern crate test_util; @@ -37,12 +38,13 @@ use html5ever::tokenizer::{TokenSink, Tokenizer, TokenizerOpts}; use html5ever::tokenizer::states::{Plaintext, RawData, Rcdata, Rawtext}; use string_cache::{Atom, QualName}; +use tendril::{StrTendril, SliceExt}; // Return all ways of splitting the string into at most n // possibly-empty pieces. -fn splits(s: &str, n: usize) -> Vec> { +fn splits(s: &str, n: usize) -> Vec> { if n == 1 { - return vec!(vec!(s.to_string())); + return vec!(vec!(s.to_tendril())); } let mut points: Vec = s.char_indices().map(|(n,_)| n).collect(); @@ -53,7 +55,7 @@ fn splits(s: &str, n: usize) -> Vec> { for p in points.into_iter() { let y = &s[p..]; for mut x in splits(&s[..p], n-1).into_iter() { - x.push(y.to_string()); + x.push(y.to_tendril()); out.push(x); } } @@ -64,7 +66,7 @@ fn splits(s: &str, n: usize) -> Vec> { struct TokenLogger { tokens: Vec, - current_str: String, + current_str: StrTendril, exact_errors: bool, } @@ -72,7 +74,7 @@ impl TokenLogger { fn new(exact_errors: bool) -> TokenLogger { TokenLogger { tokens: vec!(), - current_str: String::new(), + current_str: StrTendril::new(), exact_errors: exact_errors, } } @@ -85,7 +87,7 @@ impl TokenLogger { fn finish_str(&mut self) { if self.current_str.len() > 0 { - let s = replace(&mut self.current_str, String::new()); + let s = replace(&mut self.current_str, StrTendril::new()); self.tokens.push(CharacterTokens(s)); } } @@ -100,11 +102,11 @@ impl TokenSink for TokenLogger { fn process_token(&mut self, token: Token) { match token { CharacterTokens(b) => { - self.current_str.push_str(&b); + self.current_str.push_slice(&b); } NullCharacterToken => { - self.current_str.push('\0'); + self.current_str.push_char('\0'); } ParseError(_) => if self.exact_errors { @@ -132,7 +134,7 @@ impl TokenSink for TokenLogger { } } -fn tokenize(input: Vec, opts: TokenizerOpts) -> Vec { +fn tokenize(input: Vec, opts: TokenizerOpts) -> Vec { let sink = TokenLogger::new(opts.exact_errors); let mut tok = Tokenizer::new(sink, opts); for chunk in input.into_iter() { @@ -144,7 +146,8 @@ fn tokenize(input: Vec, opts: TokenizerOpts) -> Vec { trait JsonExt { fn get_str(&self) -> String; - fn get_nullable_str(&self) -> Option; + fn get_tendril(&self) -> StrTendril; + fn get_nullable_tendril(&self) -> Option; fn get_bool(&self) -> bool; fn get_obj<'t>(&'t self) -> &'t BTreeMap; fn get_list<'t>(&'t self) -> &'t Vec; @@ -159,11 +162,18 @@ impl JsonExt for Json { } } - fn get_nullable_str(&self) -> Option { + fn get_tendril(&self) -> StrTendril { + match *self { + Json::String(ref s) => s.to_tendril(), + _ => panic!("Json::get_tendril: not a String"), + } + } + + fn get_nullable_tendril(&self) -> Option { match *self { Json::Null => None, - Json::String(ref s) => Some(s.to_string()), - _ => panic!("Json::get_nullable_str: not a String"), + Json::String(ref s) => Some(s.to_tendril()), + _ => panic!("Json::get_nullable_tendril: not a String"), } } @@ -200,9 +210,9 @@ fn json_to_token(js: &Json) -> Token { let args: Vec<&Json> = parts[1..].iter().collect(); match (&parts[0].get_str()[..], &args[..]) { ("DOCTYPE", [name, public_id, system_id, correct]) => DoctypeToken(Doctype { - name: name.get_nullable_str(), - public_id: public_id.get_nullable_str(), - system_id: system_id.get_nullable_str(), + name: name.get_nullable_tendril(), + public_id: public_id.get_nullable_tendril(), + system_id: system_id.get_nullable_tendril(), force_quirks: !correct.get_bool(), }), @@ -212,7 +222,7 @@ fn json_to_token(js: &Json) -> Token { attrs: attrs.get_obj().iter().map(|(k,v)| { Attribute { name: QualName::new(ns!(""), Atom::from_slice(&k)), - value: v.get_str() + value: v.get_tendril() } }).collect(), self_closing: match rest { @@ -228,9 +238,9 @@ fn json_to_token(js: &Json) -> Token { self_closing: false }), - ("Comment", [txt]) => CommentToken(txt.get_str()), + ("Comment", [txt]) => CommentToken(txt.get_tendril()), - ("Character", [txt]) => CharacterTokens(txt.get_str()), + ("Character", [txt]) => CharacterTokens(txt.get_tendril()), // We don't need to produce NullCharacterToken because // the TokenLogger will convert them to CharacterTokens. diff --git a/tests/tree_builder.rs b/tests/tree_builder.rs index ddff71f9..4609baf5 100644 --- a/tests/tree_builder.rs +++ b/tests/tree_builder.rs @@ -13,6 +13,7 @@ extern crate test; extern crate string_cache; +extern crate tendril; extern crate html5ever; extern crate html5ever_dom_sink; @@ -36,6 +37,7 @@ use html5ever_dom_sink::common::{Document, Doctype, Text, Comment, Element}; use html5ever_dom_sink::rcdom::{RcDom, Handle}; use string_cache::Atom; +use tendril::StrTendril; fn parse_tests>(mut lines: It) -> Vec> { let mut tests = vec!(); @@ -175,6 +177,8 @@ fn make_test( should_panic: No, }, testfn: DynTestFn(Box::new(move || { + // Do this here because Tendril isn't Send. + let data = StrTendril::from_slice(&data); let mut result = String::new(); match context { None => { From 9e4783df6f930f61aca1f0f9802d7bc800fd8e37 Mon Sep 17 00:00:00 2001 From: Keegan McAllister Date: Wed, 6 May 2015 14:31:00 -0700 Subject: [PATCH 5/5] WORK IN PROGRESS -- fix up the C API --- capi/include/html5ever.h | 27 ++++------ capi/src/lib.rs | 108 +++------------------------------------ capi/src/tokenizer.rs | 100 ++++++++++++++---------------------- dom_sink/Cargo.toml | 3 ++ dom_sink/src/lib.rs | 2 + examples/html2html.rs | 3 +- 6 files changed, 59 insertions(+), 184 deletions(-) diff --git a/capi/include/html5ever.h b/capi/include/html5ever.h index e6c674d1..020abd3b 100644 --- a/capi/include/html5ever.h +++ b/capi/include/html5ever.h @@ -11,26 +11,19 @@ #define __HTML5EVER_H #include - -struct h5e_buf { - unsigned char *data; - size_t len; -}; - -struct h5e_buf h5e_buf_from_cstr(const char *str); +#include "tendril.h" +#include "string_cache.h" struct h5e_token_ops { - void (*do_doctype)(void *user, struct h5e_buf name, - struct h5e_buf pub, struct h5e_buf sys, int force_quirks); - void (*do_start_tag)(void *user, struct h5e_buf name, - int self_closing, size_t num_attrs); - void (*do_tag_attr)(void *user, struct h5e_buf name, struct h5e_buf value); - void (*do_end_tag)(void *user, struct h5e_buf name); - void (*do_comment)(void *user, struct h5e_buf text); - void (*do_chars)(void *user, struct h5e_buf text); + void (*do_doctype)(void *user, tendril name, tendril pub, tendril sys, int force_quirks); + void (*do_start_tag)(void *user, scache_atom name, int self_closing, size_t num_attrs); + void (*do_tag_attr)(void *user, scache_atom name, tendril value); + void (*do_end_tag)(void *user, scache_atom name); + void (*do_comment)(void *user, tendril text); + void (*do_chars)(void *user, tendril text); void (*do_null_char)(void *user); void (*do_eof)(void *user); - void (*do_error)(void *user, struct h5e_buf message); + void (*do_error)(void *user, tendril message); }; struct h5e_token_sink { @@ -42,7 +35,7 @@ struct h5e_tokenizer; struct h5e_tokenizer *h5e_tokenizer_new(struct h5e_token_sink *sink); void h5e_tokenizer_free(struct h5e_tokenizer *tok); -void h5e_tokenizer_feed(struct h5e_tokenizer *tok, struct h5e_buf buf); +void h5e_tokenizer_feed(struct h5e_tokenizer *tok, tendril buf); void h5e_tokenizer_end(struct h5e_tokenizer *tok); #endif diff --git a/capi/src/lib.rs b/capi/src/lib.rs index 5a84df3a..bc436866 100644 --- a/capi/src/lib.rs +++ b/capi/src/lib.rs @@ -7,115 +7,19 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. +#![feature(plugin, box_syntax)] +#![plugin(string_cache_plugin)] + extern crate libc; extern crate string_cache; extern crate tendril; extern crate html5ever; -use std::{ptr, slice, str}; -use std::marker::PhantomData; -use std::borrow::Cow; - -use libc::{size_t, c_int, c_char, strlen}; - -use string_cache::Atom; - -use tendril::StrTendril; - -#[repr(C)] -pub struct h5e_buf { - data: *const u8, - len: size_t, -} - -impl Copy for h5e_buf { } -impl Clone for h5e_buf { - fn clone(&self) -> h5e_buf { - *self - } -} - -impl h5e_buf { - pub fn null() -> h5e_buf { - h5e_buf { - data: ptr::null(), - len: 0, - } - } - - pub unsafe fn as_slice(&self) -> &str { - str::from_utf8_unchecked(slice::from_raw_parts(self.data, self.len as usize)) - } -} - -pub struct LifetimeBuf<'a> { - buf: h5e_buf, - marker: PhantomData<&'a [u8]>, -} - -impl<'a> LifetimeBuf<'a> { - pub fn from_str(x: &'a str) -> LifetimeBuf<'a> { - LifetimeBuf { - buf: h5e_buf { - data: x.as_bytes().as_ptr(), - len: x.len() as size_t, - }, - marker: PhantomData, - } - } - - pub fn null() -> LifetimeBuf<'a> { - LifetimeBuf { - buf: h5e_buf::null(), - marker: PhantomData, - } - } +use libc::c_int; - #[inline] - pub fn get(self) -> h5e_buf { - self.buf - } -} - -// Or we could just make `LifetimeBuf::from_str` generic over ; -// see rust-lang/rust#16738. -pub trait AsLifetimeBuf { - fn as_lifetime_buf<'a>(&'a self) -> LifetimeBuf<'a>; -} - -impl AsLifetimeBuf for String { - fn as_lifetime_buf<'a>(&'a self) -> LifetimeBuf<'a> { - LifetimeBuf::from_str(self) - } -} - -impl AsLifetimeBuf for StrTendril { - fn as_lifetime_buf<'a>(&'a self) -> LifetimeBuf<'a> { - LifetimeBuf::from_str(self) - } -} - -impl AsLifetimeBuf for Atom { - fn as_lifetime_buf<'a>(&'a self) -> LifetimeBuf<'a> { - LifetimeBuf::from_str(self) - } -} - -impl<'b> AsLifetimeBuf for Cow<'b, str> { - fn as_lifetime_buf<'a>(&'a self) -> LifetimeBuf<'a> { - LifetimeBuf::from_str(self) - } -} - -#[no_mangle] -pub unsafe extern "C" fn h5e_buf_from_cstr(s: *const c_char) -> h5e_buf { - h5e_buf { - data: s as *const u8, - len: strlen(s), - } -} +pub mod tokenizer; -pub fn c_bool(x: bool) -> c_int { +fn c_bool(x: bool) -> c_int { match x { false => 0, true => 1, diff --git a/capi/src/tokenizer.rs b/capi/src/tokenizer.rs index b718401b..90a6c871 100644 --- a/capi/src/tokenizer.rs +++ b/capi/src/tokenizer.rs @@ -7,122 +7,96 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -#![allow(non_camel_case_types)] +#![allow(non_camel_case_types, raw_pointer_derive)] -use {LifetimeBuf, AsLifetimeBuf, h5e_buf, c_bool}; +use c_bool; use html5ever::tokenizer::{TokenSink, Token, Doctype, Tag, ParseError, DoctypeToken}; use html5ever::tokenizer::{CommentToken, CharacterTokens, NullCharacterToken}; use html5ever::tokenizer::{TagToken, StartTag, EndTag, EOFToken, Tokenizer}; -use html5ever::Tendril; use std::mem; use std::default::Default; use libc::{c_void, c_int, size_t}; +use string_cache::Atom; +use tendril::{StrTendril, SliceExt}; #[repr(C)] +#[derive(Copy, Clone)] pub struct h5e_token_ops { - do_doctype: Option, + do_doctype: Option, - do_start_tag: Option, - do_tag_attr: Option, - do_end_tag: Option, - do_comment: Option, - do_chars: Option, + do_tag_attr: Option, + + do_end_tag: Option, + do_comment: Option, + do_chars: Option, do_null_char: Option, do_eof: Option, - do_error: Option, -} - -impl Copy for h5e_token_ops { } -impl Clone for h5e_token_ops { - fn clone(&self) -> h5e_token_ops { - *self - } + do_error: Option, } #[repr(C)] +#[derive(Copy, Clone)] pub struct h5e_token_sink { ops: *const h5e_token_ops, user: *mut c_void, } -impl Copy for h5e_token_sink { } -impl Clone for h5e_token_sink { - fn clone(&self) -> h5e_token_sink { - *self - } -} - -impl TokenSink for *mut h5e_token_sink { +impl TokenSink for h5e_token_sink { fn process_token(&mut self, token: Token) { macro_rules! call { ($name:ident, $($arg:expr),*) => ( unsafe { - match (*(**self).ops).$name { + match (*self.ops).$name { None => (), - Some(f) => f((**self).user $(, $arg)*), + Some(f) => f((*self).user $(, $arg)*), } } ); ($name:ident) => (call!($name,)); // bleh } - fn opt_str_to_buf<'a>(s: &'a Option) -> LifetimeBuf<'a> { - match *s { - None => LifetimeBuf::null(), - Some(ref s) => s.as_lifetime_buf(), - } - } - match token { DoctypeToken(Doctype { name, public_id, system_id, force_quirks }) => { - let name = opt_str_to_buf(&name); - let public_id = opt_str_to_buf(&public_id); - let system_id = opt_str_to_buf(&system_id); - call!(do_doctype, name.get(), public_id.get(), system_id.get(), + // Empty tendril doesn't allocate. + call!(do_doctype, name.unwrap_or(StrTendril::new()), + public_id.unwrap_or(StrTendril::new()), + system_id.unwrap_or(StrTendril::new()), c_bool(force_quirks)); } TagToken(Tag { kind, name, self_closing, attrs }) => { - let name = name.as_lifetime_buf(); match kind { StartTag => { - call!(do_start_tag, name.get(), c_bool(self_closing), + call!(do_start_tag, name, c_bool(self_closing), attrs.len() as size_t); for attr in attrs.into_iter() { // All attribute names from the tokenizer are local. assert!(attr.name.ns == ns!("")); - let name = attr.name.local.as_lifetime_buf(); - let value = attr.value.as_lifetime_buf(); - call!(do_tag_attr, name.get(), value.get()); + call!(do_tag_attr, attr.name.local, attr.value); } } - EndTag => call!(do_end_tag, name.get()), + EndTag => call!(do_end_tag, name), } } - CommentToken(text) => { - let text = text.as_lifetime_buf(); - call!(do_comment, text.get()); - } + CommentToken(text) => call!(do_comment, text), - CharacterTokens(text) => { - let text = text.as_lifetime_buf(); - call!(do_chars, text.get()); - } + CharacterTokens(text) => call!(do_chars, text), NullCharacterToken => call!(do_null_char), EOFToken => call!(do_eof), ParseError(msg) => { - let msg = msg.as_lifetime_buf(); - call!(do_error, msg.get()); + let msg = msg.to_tendril(); + call!(do_error, msg); } } } @@ -131,26 +105,26 @@ impl TokenSink for *mut h5e_token_sink { pub type h5e_tokenizer_ptr = *const (); #[no_mangle] -pub unsafe extern "C" fn h5e_tokenizer_new(sink: *mut h5e_token_sink) -> h5e_tokenizer_ptr { - let tok: Box> - = box Tokenizer::new(sink, Default::default()); +pub unsafe extern "C" fn h5e_tokenizer_new(sink: *const h5e_token_sink) -> h5e_tokenizer_ptr { + let tok: Box> + = box Tokenizer::new(*sink, Default::default()); mem::transmute(tok) } #[no_mangle] pub unsafe extern "C" fn h5e_tokenizer_free(tok: h5e_tokenizer_ptr) { - let _: Box> = mem::transmute(tok); + let _: Box> = mem::transmute(tok); } #[no_mangle] -pub unsafe extern "C" fn h5e_tokenizer_feed(tok: h5e_tokenizer_ptr, buf: h5e_buf) { - let tok: &mut Tokenizer<*mut h5e_token_sink> = mem::transmute(tok); - tok.feed(String::from_str(buf.as_slice())); +pub unsafe extern "C" fn h5e_tokenizer_feed(tok: h5e_tokenizer_ptr, buf: StrTendril) { + let tok: &mut Tokenizer = mem::transmute(tok); + tok.feed(buf); } #[no_mangle] pub unsafe extern "C" fn h5e_tokenizer_end(tok: h5e_tokenizer_ptr) { - let tok: &mut Tokenizer<*mut h5e_token_sink> = mem::transmute(tok); + let tok: &mut Tokenizer = mem::transmute(tok); tok.end(); } diff --git a/dom_sink/Cargo.toml b/dom_sink/Cargo.toml index 10749c51..39146d57 100644 --- a/dom_sink/Cargo.toml +++ b/dom_sink/Cargo.toml @@ -14,3 +14,6 @@ git = "https://github.com/reem/rust-mac" [dependencies.string_cache] git = "https://github.com/servo/string-cache" + +[dependencies.tendril] +git = "https://github.com/kmcallister/tendril" diff --git a/dom_sink/src/lib.rs b/dom_sink/src/lib.rs index 8194715f..c1d1db59 100644 --- a/dom_sink/src/lib.rs +++ b/dom_sink/src/lib.rs @@ -17,6 +17,8 @@ extern crate html5ever; #[macro_use] extern crate string_cache; +extern crate tendril; + #[macro_use] extern crate mac; diff --git a/examples/html2html.rs b/examples/html2html.rs index 69c18f76..51f0dda3 100644 --- a/examples/html2html.rs +++ b/examples/html2html.rs @@ -24,11 +24,10 @@ use std::default::Default; use tendril::{ByteTendril, ReadExt}; -use html5ever::sink::rcdom::RcDom; use html5ever::driver::ParseOpts; -use html5ever_dom_sink::rcdom::RcDom; use html5ever::tree_builder::TreeBuilderOpts; use html5ever::{parse, one_input, serialize}; +use html5ever_dom_sink::rcdom::RcDom; fn main() { let mut input = ByteTendril::new();