From e28e8a26f653e1093271bbb636831daf874a23cf Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Mon, 3 Oct 2016 18:38:35 +0200 Subject: [PATCH 1/2] =?UTF-8?q?Use=20a=20hand-written=20parser=20a=20strin?= =?UTF-8?q?g=20formatting=20for=20the=20`match=5Ftoken!`=20macro=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … instead of the parser and quasi-quoting from Rust’s (unstable) libsyntax. This has significantly worse diagnostics when encountering unexpected syntax (e.g. no indication of which line has the offending code) but this removes all usage of unstable compiler internals that constantly need to be fixed when updating the compiler. Fixes #216. --- Cargo.toml | 3 +- STRUCTURE.md | 2 +- build.rs | 69 +----- macros/Cargo.toml | 15 -- macros/match_token.rs | 502 ++++++++++++++++++++++++++++++++++++++ macros/src/lib.rs | 36 --- macros/src/match_token.rs | 485 ------------------------------------ macros/src/pre_expand.rs | 130 ---------- src/tree_builder/mod.rs | 6 +- src/tree_builder/rules.rs | 8 +- 10 files changed, 518 insertions(+), 738 deletions(-) delete mode 100644 macros/Cargo.toml create mode 100644 macros/match_token.rs delete mode 100644 macros/src/lib.rs delete mode 100644 macros/src/match_token.rs delete mode 100644 macros/src/pre_expand.rs diff --git a/Cargo.toml b/Cargo.toml index 27517e55..d3a0ac78 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,7 +33,7 @@ harness = false [features] unstable = ["tendril/unstable", "string_cache/unstable"] heap_size = ["heapsize", "heapsize_plugin"] -codegen = ["html5ever_macros"] +codegen = [] # Now unused, remove at the next breaking change [dependencies] log = "0" @@ -51,7 +51,6 @@ rustc-test = "0.1.3" [build-dependencies] phf_codegen = "0.7.3" rustc-serialize = "0.3.15" -html5ever_macros = { version = "0.2.6", path = "macros", optional = true } [profile.dev] debug = false diff --git a/STRUCTURE.md b/STRUCTURE.md index 00a7f33c..ab72b427 100644 --- a/STRUCTURE.md 
+++ b/STRUCTURE.md @@ -14,7 +14,7 @@ The module structure is also documented in the output produced by `cargo doc`, a `dom_sink/`: Types that html5ever can use to represent the DOM, if you do not provide your own DOM implementation. -`macros/`: Rust syntax extensions used within html5ever. Users of the library do not need this crate. +`macros/`: Code used at build-time to expand the `match_token!` "macro" in `src/tree_builder/rules.rs`. `tests/`: Integration tests. This is a single executable crate that runs html5ever on the various [html5lib-tests](https://github.com/html5lib/html5lib-tests). There are also unit tests throughout the library code. See `README.md` for information on running tests. diff --git a/build.rs b/build.rs index b0146389..1a6c1643 100644 --- a/build.rs +++ b/build.rs @@ -18,14 +18,16 @@ use std::fs::File; use std::io::Write; use std::path::Path; +#[path = "macros/match_token.rs"] +mod match_token; + fn main() { let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap(); let rules_rs = Path::new(&manifest_dir).join("src/tree_builder/rules.rs"); - expand_match_tokens( + match_token::expand_match_tokens( &rules_rs, - // Keep the expanded file in the source directory, so that `cargo publish` ships it. - &rules_rs.with_extension("expanded.rs")); + &Path::new(&env::var("OUT_DIR").unwrap()).join("rules.rs")); named_entities_to_phf( &Path::new(&manifest_dir).join("data/entities.json"), @@ -34,67 +36,6 @@ fn main() { println!("cargo:rerun-if-changed={}", rules_rs.display()); } -#[cfg(feature = "codegen")] -fn expand_match_tokens(from: &Path, to: &Path) { - extern crate html5ever_macros; - - html5ever_macros::pre_expand(from, to); -} - -#[cfg(not(feature = "codegen"))] -fn expand_match_tokens(from: &Path, to: &Path) { - use std::io::stderr; - use std::process::exit; - - if let Err(error) = check_hash(from, to) { - writeln!( - stderr(), - r" -{} is missing or not up to date with {}: -{} - -Run `cargo build --features codegen` to update it. 
- -If you’re using html5ever as a dependency, this is a bad release. -Please file an issue at https://github.com/servo/html5ever/issues/new -with the output of `cargo pkgid html5ever`. -", - to.file_name().unwrap().to_string_lossy(), - from.file_name().unwrap().to_string_lossy(), - error - ).unwrap(); - exit(1); - } -} - -#[cfg(not(feature = "codegen"))] -fn check_hash(from: &Path, to: &Path) -> Result<(), String> { - use std::hash::{Hash, Hasher, SipHasher}; - use std::io::Read; - - // Unwrap here as the source file is expected to exist. - let mut file_from = File::open(from).unwrap(); - let mut source = String::new(); - let mut hasher = SipHasher::new(); - file_from.read_to_string(&mut source).unwrap(); - source.hash(&mut hasher); - let source_hash = hasher.finish(); - - // IO errors from here indicate we need to regenerate the expanded file. - let mut file_to = try!(File::open(to).map_err(|e| e.to_string())); - let mut expanded = String::new(); - try!(file_to.read_to_string(&mut expanded).map_err(|e| e.to_string())); - let prefix = "// source SipHash: "; - let line = try!(expanded.lines().find(|line| line.starts_with(prefix)) - .ok_or("source hash not found".to_string())); - let expected_hash = try!(line[prefix.len()..].parse::().map_err(|e| e.to_string())); - if source_hash == expected_hash { - Ok(()) - } else { - Err("different hash".to_string()) - } -} - fn named_entities_to_phf(from: &Path, to: &Path) { // A struct matching the entries in entities.json. 
#[derive(RustcDecodable)] diff --git a/macros/Cargo.toml b/macros/Cargo.toml deleted file mode 100644 index 02e9f406..00000000 --- a/macros/Cargo.toml +++ /dev/null @@ -1,15 +0,0 @@ -[package] - -name = "html5ever_macros" -version = "0.2.7" -authors = [ "The html5ever Project Developers" ] -license = "MIT / Apache-2.0" -repository = "https://github.com/servo/html5ever" -description = "High-performance browser-grade HTML5 parser − compiler plugins" - -[lib] -name = "html5ever_macros" -plugin = true - -[dependencies] -mac = "0" diff --git a/macros/match_token.rs b/macros/match_token.rs new file mode 100644 index 00000000..e4962661 --- /dev/null +++ b/macros/match_token.rs @@ -0,0 +1,502 @@ +// Copyright 2014 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/*! + +Implements the `match_token!()` macro for use by the HTML tree builder +in `src/tree_builder/rules.rs`. + + +## Example + +```rust +match_token!(token { + CommentToken(text) => 1, + + tag @ => 2, + + => 3, + +
=> else, + + tag @ => 4, + + token => 5, +}) +``` + + +## Syntax + +Because of the simplistic parser, the macro invocation must +start with exactly `match_token!(token {` (with whitespace as specified) +and end with exactly `})`. + +The left-hand side of each match arm is an optional `name @` binding, followed by + + - an ordinary Rust pattern that starts with an identifier or an underscore, or + + - a sequence of HTML tag names as identifiers, each inside "<...>" or "" + to match an open or close tag respectively, or + + - a "wildcard tag" "<_>" or "" to match all open tags or all close tags + respectively. + +The right-hand side is either an expression or the keyword `else`. + +Note that this syntax does not support guards or pattern alternation like +`Foo | Bar`. This is not a fundamental limitation; it's done for implementation +simplicity. + + +## Semantics + +Ordinary Rust patterns match as usual. If present, the `name @` binding has +the usual meaning. + +A sequence of named tags matches any of those tags. A single sequence can +contain both open and close tags. If present, the `name @` binding binds (by +move) the `Tag` struct, not the outer `Token`. That is, a match arm like + +```rust +tag @ => ... +``` + +expands to something like + +```rust +TagToken(tag @ Tag { name: atom!("html"), kind: StartTag }) +| TagToken(tag @ Tag { name: atom!("head"), kind: StartTag }) => ... +``` + +A wildcard tag matches any tag of the appropriate kind, *unless* it was +previously matched with an `else` right-hand side (more on this below). + +The expansion of this macro reorders code somewhat, to satisfy various +restrictions arising from moves. However it provides the semantics of in-order +matching, by enforcing the following restrictions on its input: + + - The last pattern must be a variable or the wildcard "_". In other words + it must match everything. + + - Otherwise, ordinary Rust patterns and specific-tag patterns cannot appear + after wildcard tag patterns. 
+ + - No tag name may appear more than once. + + - A wildcard tag pattern may not occur in the same arm as any other tag. + "<_> => ..." and "<_> => ..." are both forbidden. + + - The right-hand side "else" may only appear with specific-tag patterns. + It means that these specific tags should be handled by the last, + catch-all case arm, rather than by any wildcard tag arm. This situation + is common in the HTML5 syntax. +*/ + +use std::collections::{HashMap, HashSet}; +use std::fmt::Write as FmtWrite; +use std::fs::File; +use std::io::{Read, Write}; +use std::path::Path; + +#[derive(Clone)] +struct Source<'a> { + src: &'a str, +} + +impl<'a> Source<'a> { + fn consume(&mut self, n: usize) -> &'a str { + let (before, after) = self.src.split_at(n); + self.src = after; + before + } + + fn find(&mut self, s: &str) -> Option<&'a str> { + self.src.find(s).map(|position| { + let before = self.consume(position); + self.consume(s.len()); + before + }) + } + + fn consume_if_present(&mut self, s: &str) -> bool { + let present = self.src.starts_with(s); + if present { + self.consume(s.len()); + } + present + } + + fn expect(&mut self, s: &str) { + assert!(self.consume_if_present(s), "{:?}… does not start with {:?}", &self.src[..50], s); + } + + /// Not exactly Rust whitespace, but close enough + fn consume_whitespace(&mut self) { + while self.src.starts_with(&[' ', '\t', '\n', '\r'][..]) { + self.consume(1); + } + } + + /// Not exactly the syntax of a Rust identifier, but close enough + fn consume_ident(&mut self) -> Option<&'a str> { + let end = self.src.find(|c: char| !c.is_alphanumeric() && c != '_').unwrap_or(self.src.len()); + if end > 0 { + Some(self.consume(end)) + } else { + None + } + } + + fn find_top_level(&mut self, start_at: usize, delimeter: u8) -> usize { + let mut i = start_at; + let bytes = self.src.as_bytes(); + loop { + let b = *bytes.get(i).expect("unbalanced brackets"); + i += 1; + if b == delimeter { + return i + } + match b { + b'{' => i = 
self.find_top_level(i, b'}'), + b'[' => i = self.find_top_level(i, b']'), + b'(' => i = self.find_top_level(i, b')'), + _ => {} + } + } + } +} + +pub fn expand_match_tokens(from: &Path, to: &Path) { + let mut source = String::new(); + File::open(from).unwrap().read_to_string(&mut source).unwrap(); + + let mut source = Source { src: &*source }; + + let mut file = File::create(to).unwrap(); + let mut write = |s: &str| file.write_all(s.as_bytes()).unwrap(); + while let Some(before) = source.find("match_token!") { + write(before); + source.expect("(token {"); + let mut arms = Vec::new(); + loop { + source.consume_whitespace(); + if source.consume_if_present("})") { + break + } + arms.push(parse_arm(&mut source)); + } + write_match_token(arms, &mut write); + } + write(source.src); +} + +#[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)] +enum TagKind { + StartTag, + EndTag, +} + +/// A single tag, as may appear in an LHS. +/// +/// `name` is `None` for wildcards. +#[derive(PartialEq, Eq, Hash, Clone, Debug)] +struct Tag { + kind: TagKind, + name: Option, +} + +/// Left-hand side of a pattern-match arm. +#[derive(Debug)] +enum LHS { + Pattern(String), + Tags(Vec), +} + +/// Right-hand side of a pattern-match arm. +#[derive(Debug)] +enum RHS { + Else, + Expression(String), +} + +/// A whole arm, including optional outer `name @` binding. 
+#[derive(Debug)] +struct Arm { + binding: Option, + lhs: LHS, + rhs: RHS, +} + +fn parse_arm(source: &mut Source) -> Arm { + loop { + source.consume_whitespace(); + if source.consume_if_present("//") { + source.find("\n"); + } else { + break + } + } + let start = source.clone(); + let mut binding = None; + if let Some(ident) = source.consume_ident() { + source.consume_whitespace(); + if source.consume_if_present("@") { + binding = Some(ident.to_owned()) + } else { + *source = start + } + } + + Arm { + binding: binding, + lhs: parse_lhs(source), + rhs: parse_rhs(source), + } +} + +fn parse_lhs(source: &mut Source) -> LHS { + source.consume_whitespace(); + if source.consume_if_present("<") { + let mut tags = Vec::new(); + loop { + tags.push(Tag { + kind: if source.consume_if_present("/") { + TagKind::EndTag + } else { + TagKind::StartTag + }, + name: if source.consume_if_present("_") { + None + } else { + Some(source.consume_ident().expect("expected identifier (tag name)").to_owned()) + } + }); + assert!(source.consume_if_present(">"), "expected '>' closing a tag pattern"); + source.consume_whitespace(); + if !source.consume_if_present("<") { + break + } + } + source.consume_whitespace(); + assert!(source.consume_if_present("=>")); + LHS::Tags(tags) + } else { + LHS::Pattern(source.find("=>").expect("did not find =>").to_owned()) + } +} + +fn parse_rhs(source: &mut Source) -> RHS { + source.consume_whitespace(); + let start_at; + let delimeter; + if source.consume_if_present("else,") { + return RHS::Else + } else if source.src.starts_with("{") { + start_at = 1; + delimeter = b'}'; + } else { + start_at = 0; + delimeter = b','; + } + let end = source.find_top_level(start_at, delimeter); + let expr = source.consume(end); + if delimeter == b'}' { + source.consume_whitespace(); + source.consume_if_present(","); + } + RHS::Expression(expr.to_owned()) +} + +/// Description of a wildcard match arm. 
+/// +/// We defer generating code for these until we process the last, catch-all +/// arm. This isn't part of the AST produced by `parse()`; it's created +/// while processing that AST. +struct WildcardArm { + binding: String, + kind: TagKind, + expr: String, +} + +fn write_match_token(mut arms: Vec, write: &mut F) where F: FnMut(&str) { + write("match token {\n"); + + // Handle the last arm specially at the end. + let last_arm = arms.pop().unwrap(); + + // Tags we've seen, used for detecting duplicates. + let mut seen_tags: HashSet = HashSet::new(); + + // Case arms for wildcard matching. We collect these and + // emit them later. + let mut wildcards: Vec = Vec::new(); + + // Tags excluded (by an 'else' RHS) from wildcard matching. + let mut wild_excluded: HashMap> = HashMap::new(); + + for Arm { binding, lhs, rhs } in arms { + // Build Rust syntax for the `name @` binding, if any. + let binding = match binding { + Some(ident) => format!("{} @ ", ident), + None => String::new(), + }; + + match (lhs, rhs) { + (LHS::Pattern(_), RHS::Else) => panic!("'else' may not appear with an ordinary pattern"), + + // ordinary pattern => expression + (LHS::Pattern(pat), RHS::Expression(expr)) => { + if !wildcards.is_empty() { + panic!("ordinary patterns may not appear after wildcard tags {:?} {:?}", pat, expr); + } + write(&format!(" {}{} => {}\n", binding, pat, expr)); + } + + // ... => else + (LHS::Tags(tags), RHS::Else) => { + for tag in tags { + if !seen_tags.insert(tag.clone()) { + panic!("duplicate tag"); + } + if tag.name.is_none() { + panic!("'else' may not appear with a wildcard tag"); + } + wild_excluded.entry(tag.kind).or_insert_with(Vec::new).push(tag.clone()); + } + } + + // <_> => expression + // ... => expression + (LHS::Tags(tags), RHS::Expression(expr)) => { + // Is this arm a tag wildcard? + // `None` if we haven't processed the first tag yet. 
+ let mut wildcard = None; + for tag in tags { + if !seen_tags.insert(tag.clone()) { + panic!("duplicate tag"); + } + + match tag.name { + // + Some(_) => { + if !wildcards.is_empty() { + panic!("specific tags may not appear after wildcard tags"); + } + + if wildcard == Some(true) { + panic!("wildcard tags must appear alone"); + } + + if wildcard.is_some() { + // Push the delimeter `|` if it's not the first tag. + write(" |\n "); + } else { + write(" "); + } + write(&make_tag_pattern(&binding, tag)); + + wildcard = Some(false); + } + + // <_> + None => { + if wildcard.is_some() { + panic!("wildcard tags must appear alone"); + } + wildcard = Some(true); + wildcards.push(WildcardArm { + binding: binding.clone(), + kind: tag.kind, + expr: expr.clone(), + }); + } + } + } + + match wildcard { + None => panic!("[internal macro error] tag arm with no tags"), + Some(false) => { + write(" =>\n "); + write(&expr); + write("\n"); + } + Some(true) => {} // codegen for wildcards is deferred + } + } + } + } + + // Time to process the last, catch-all arm. We will generate something like + // + // last_arm_token => { + // let enable_wildcards = match last_arm_token { + // TagToken(Tag { kind: EndTag, name: atom!("body"), .. }) => false, + // TagToken(Tag { kind: EndTag, name: atom!("html"), .. }) => false, + // // ... + // _ => true, + // }; + // + // match (enable_wildcards, last_arm_token) { + // (true, TagToken(name @ Tag { kind: StartTag, .. })) + // => ..., // wildcard action for start tags + // + // (true, TagToken(name @ Tag { kind: EndTag, .. })) + // => ..., // wildcard action for end tags + // + // (_, token) => ... 
// using the pattern from that last arm + // } + // } + + let Arm { binding, lhs, rhs } = last_arm; + + let (last_pat, last_expr) = match (binding, lhs, rhs) { + (Some(_), _, _) => panic!("the last arm cannot have an @-binding"), + (None, LHS::Tags(_), _) => panic!("the last arm cannot have tag patterns"), + (None, _, RHS::Else) => panic!("the last arm cannot use 'else'"), + (None, LHS::Pattern(p), RHS::Expression(e)) => (p, e) + }; + + write(" last_arm_token => {\n"); + write(" let enable_wildcards = match last_arm_token {\n"); + + // Code for the `false` arms inside `let enable_wildcards = ...`. + for (_, tags) in wild_excluded { + for tag in tags { + write(&format!(" {} => false,\n", make_tag_pattern("", tag))); + } + } + + write(" _ => true,\n"); + write(" };\n"); + write(" match (enable_wildcards, last_arm_token) {\n"); + + // Code for the wildcard actions. + for WildcardArm { binding, kind, expr } in wildcards { + let pat = make_tag_pattern(&binding, Tag { kind: kind, name: None }); + write(&format!(" (true, {}) =>\n", pat)); + write(&format!(" {}\n", expr)); + } + + write(&format!(" (_, {}) => {}\n", last_pat, last_expr)); + write(" }\n"); + write(" }\n"); + write("}\n"); +} + +fn make_tag_pattern(binding: &str, tag: Tag) -> String { + let mut s = format!( + "::tree_builder::types::TagToken({}::tokenizer::Tag {{ kind: {:?}, ", + binding, tag.kind); + if let Some(name) = tag.name { + write!(s, "name: atom!({:?}), ", name).unwrap(); + } + s.push_str(".. })"); + s +} diff --git a/macros/src/lib.rs b/macros/src/lib.rs deleted file mode 100644 index bd976011..00000000 --- a/macros/src/lib.rs +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2014 The html5ever Project Developers. See the -// COPYRIGHT file at the top-level directory of this distribution. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. 
- -#![feature(quote, rustc_private)] -#![deny(warnings)] - -extern crate syntax; - -#[macro_use] -extern crate mac; - -// See https://github.com/rust-lang/rust/pull/23857 -macro_rules! panictry { - ($e:expr) => ({ - use syntax::errors::FatalError; - match $e { - Ok(e) => e, - Err(mut e) => { - e.emit(); - panic!(FatalError); - } - } - }) -} - -// Make these public so that rustdoc will generate documentation for them. -pub mod match_token; -pub mod pre_expand; - -pub use pre_expand::pre_expand; diff --git a/macros/src/match_token.rs b/macros/src/match_token.rs deleted file mode 100644 index 9271dbaf..00000000 --- a/macros/src/match_token.rs +++ /dev/null @@ -1,485 +0,0 @@ -// Copyright 2014 The html5ever Project Developers. See the -// COPYRIGHT file at the top-level directory of this distribution. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -/*! - -Implements the `match_token!()` macro for use by the HTML tree builder -in `src/tree_builder/mod.rs`. - - -## Example - -```rust -match_token!(token { - CommentToken(text) => 1, - - tag @ => 2, - - => 3, - -
=> else, - - tag @ => 4, - - token => 5, -}) -``` - - -## Syntax - -The left-hand side of each match arm is an optional `name @` binding, followed by - - - an ordinary Rust pattern that starts with an identifier or an underscore, or - - - a sequence of HTML tag names as identifiers, each inside "<...>" or "" - to match an open or close tag respectively, or - - - a "wildcard tag" "<_>" or "" to match all open tags or all close tags - respectively. - -The right-hand side is either an expression or the keyword `else`. - -Note that this syntax does not support guards or pattern alternation like -`Foo | Bar`. This is not a fundamental limitation; it's done for implementation -simplicity. - - -## Semantics - -Ordinary Rust patterns match as usual. If present, the `name @` binding has -the usual meaning. - -A sequence of named tags matches any of those tags. A single sequence can -contain both open and close tags. If present, the `name @` binding binds (by -move) the `Tag` struct, not the outer `Token`. That is, a match arm like - -```rust -tag @ => ... -``` - -expands to something like - -```rust -TagToken(tag @ Tag { name: atom!("html"), kind: StartTag }) -| TagToken(tag @ Tag { name: atom!("head"), kind: StartTag }) => ... -``` - -A wildcard tag matches any tag of the appropriate kind, *unless* it was -previously matched with an `else` right-hand side (more on this below). - -The expansion of this macro reorders code somewhat, to satisfy various -restrictions arising from moves. However it provides the semantics of in-order -matching, by enforcing the following restrictions on its input: - - - The last pattern must be a variable or the wildcard "_". In other words - it must match everything. - - - Otherwise, ordinary Rust patterns and specific-tag patterns cannot appear - after wildcard tag patterns. - - - No tag name may appear more than once. - - - A wildcard tag pattern may not occur in the same arm as any other tag. - "<_> => ..." and "<_> => ..." 
are both forbidden. - - - The right-hand side "else" may only appear with specific-tag patterns. - It means that these specific tags should be handled by the last, - catch-all case arm, rather than by any wildcard tag arm. This situation - is common in the HTML5 syntax. -*/ - -#![allow(unused_imports)] // for quotes - -use std::collections::{HashSet, HashMap}; -use std::collections::hash_map::Entry::{Occupied, Vacant}; - -use syntax::ast; -use syntax::codemap::{Span, Spanned, spanned}; -use syntax::errors::DiagnosticBuilder; -use syntax::ext::base::{ExtCtxt, MacResult, MacEager}; -use syntax::parse; -use syntax::parse::{token, parser, classify}; -use syntax::parse::parser::{Parser, Restrictions}; -use syntax::ptr::P; -use syntax::tokenstream::TokenTree; - -use self::TagKind::{StartTag, EndTag}; -use self::LHS::{Pat, Tags}; -use self::RHS::{Else, Expr}; - -type Tokens = Vec; - -// FIXME: duplicated in src/tokenizer/interface.rs -#[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)] -enum TagKind { - StartTag, - EndTag, -} - -impl TagKind { - /// Turn this `TagKind` into syntax for a literal `tokenizer::TagKind`. - fn lift(self, cx: &mut ExtCtxt) -> Tokens { - match self { - StartTag => quote_tokens!(&mut *cx, ::tokenizer::StartTag), - EndTag => quote_tokens!(&mut *cx, ::tokenizer::EndTag), - } - } -} - -/// A single tag, as may appear in an LHS. -/// -/// `name` is `None` for wildcards. -#[derive(PartialEq, Eq, Hash, Clone)] -struct Tag { - kind: TagKind, - name: Option, -} - -/// Left-hand side of a pattern-match arm. -enum LHS { - Pat(P), - Tags(Vec>), -} - -/// Right-hand side of a pattern-match arm. -enum RHS { - Else, - Expr(P), -} - -/// A whole arm, including optional outer `name @` binding. -struct Arm { - binding: Option, - lhs: Spanned, - rhs: Spanned, -} - -/// A parsed `match_token!` invocation. 
-struct Match { - discriminant: P, - arms: Vec, -} - -fn push_all(lhs: &mut Vec, rhs: Vec) { - lhs.extend(rhs.into_iter()); -} - -fn parse_spanned_ident<'a>(parser: &mut Parser<'a>) -> Result> { - let lo = parser.span.lo; - let ident = try!(parser.parse_ident()); - let hi = parser.last_span.hi; - Ok(spanned(lo, hi, ident)) -} - -fn parse_tag<'a>(parser: &mut Parser<'a>) -> Result, DiagnosticBuilder<'a>> { - let lo = parser.span.lo; - try!(parser.expect(&token::Lt)); - - let kind = match parser.eat(&token::BinOp(token::Slash)) { - true => EndTag, - false => StartTag, - }; - let name = match parser.eat(&token::Underscore) { - true => None, - false => Some((*try!(parser.parse_ident()).name.as_str()).to_owned()), - }; - - try!(parser.expect(&token::Gt)); - Ok(spanned(lo, parser.last_span.hi, Tag { - kind: kind, - name: name, - })) -} - -/// Parse a `match_token!` invocation into the little AST defined above. -fn parse<'a>(cx: &'a mut ExtCtxt, toks: &[TokenTree]) -> Result> { - let mut parser = parse::new_parser_from_tts(cx.parse_sess(), cx.cfg(), toks.to_vec()); - - let discriminant = try!(parser.parse_expr_res(Restrictions::RESTRICTION_NO_STRUCT_LITERAL, None)); - try!(parser.expect(&token::OpenDelim(token::Brace))); - - let mut arms: Vec = Vec::new(); - while parser.token != token::CloseDelim(token::Brace) { - let mut binding = None; - if parser.look_ahead(1, |t| *t == token::At) { - binding = Some(try!(parse_spanned_ident(&mut parser))); - parser.bump(); // Consume the @ - } - - let lhs_lo = parser.span.lo; - let lhs = match parser.token { - token::Underscore | token::Ident(..) 
=> Pat(try!(parser.parse_pat())), - token::Lt => { - let mut tags = Vec::new(); - while parser.token != token::FatArrow { - tags.push(try!(parse_tag(&mut parser))); - } - Tags(tags) - } - _ => return Err(parser.fatal("unrecognized pattern")), - }; - let lhs_hi = parser.last_span.hi; - - try!(parser.expect(&token::FatArrow)); - - let rhs_lo = parser.span.lo; - let mut rhs_hi = parser.span.hi; - let rhs = if parser.eat_keyword(token::keywords::Else) { - try!(parser.expect(&token::Comma)); - Else - } else { - let expr = try!(parser.parse_expr_res(Restrictions::RESTRICTION_STMT_EXPR, None)); - rhs_hi = parser.last_span.hi; - - let require_comma = - !classify::expr_is_simple_block(&*expr) - && parser.token != token::CloseDelim(token::Brace); - - if require_comma { - try!(parser.expect_one_of( - &[token::Comma], &[token::CloseDelim(token::Brace)])); - } else { - parser.eat(&token::Comma); - } - - Expr(expr) - }; - - arms.push(Arm { - binding: binding, - lhs: spanned(lhs_lo, lhs_hi, lhs), - rhs: spanned(rhs_lo, rhs_hi, rhs), - }); - } - - // Consume the closing brace - parser.bump(); - - Ok(Match { - discriminant: discriminant, - arms: arms, - }) -} - -/// Description of a wildcard match arm. -/// -/// We defer generating code for these until we process the last, catch-all -/// arm. This isn't part of the AST produced by `parse()`; it's created -/// while processing that AST. -struct WildcardArm { - binding: Tokens, - kind: TagKind, - expr: P, -} - -fn make_tag_pattern(cx: &mut ExtCtxt, binding: Tokens, tag: Tag) -> Tokens { - let kind = tag.kind.lift(cx); - let mut fields = quote_tokens!(&mut *cx, kind: $kind,); - match tag.name { - None => (), - Some(name) => push_all(&mut fields, quote_tokens!(&mut *cx, name: atom!($name),)), - } - quote_tokens!(&mut *cx, - ::tree_builder::types::TagToken($binding ::tokenizer::Tag { $fields ..}) - ) -} - -macro_rules! ext_err { - ($span: expr, $message: expr) => { return Err(($span, $message)) } -} -macro_rules! 
ext_err_if { - ($condition: expr, $span: expr, $message: expr) => { - if $condition { return Err(($span, $message)) } - } -} - -/// Expand the `match_token!` macro. -pub fn expand_to_tokens(cx: &mut ExtCtxt, span: Span, toks: &[TokenTree]) - -> Result, (Span, &'static str)> { - let Match { discriminant, mut arms } = panictry!(parse(cx, toks)); - - // Handle the last arm specially at the end. - let last_arm = match arms.pop() { - Some(x) => x, - None => ext_err!(span, "need at least one match arm"), - }; - - // Code for the arms other than the last one. - let mut arm_code: Tokens = vec!(); - - // Tags we've seen, used for detecting duplicates. - let mut seen_tags: HashSet = HashSet::new(); - - // Case arms for wildcard matching. We collect these and - // emit them later. - let mut wildcards: Vec = vec!(); - - // Tags excluded (by an 'else' RHS) from wildcard matching. - let mut wild_excluded: HashMap> = HashMap::new(); - - for Arm { binding, lhs, rhs } in arms.into_iter() { - // Build Rust syntax for the `name @` binding, if any. - let binding = match binding { - Some(i) => quote_tokens!(&mut *cx, $i @), - None => vec!(), - }; - - match (lhs.node, rhs.node) { - (Pat(_), Else) - => ext_err!(rhs.span, "'else' may not appear with an ordinary pattern"), - - // ordinary pattern => expression - (Pat(pat), Expr(expr)) => { - ext_err_if!(!wildcards.is_empty(), lhs.span, - "ordinary patterns may not appear after wildcard tags"); - push_all(&mut arm_code, quote_tokens!(&mut *cx, $binding $pat => $expr,)); - } - - // ... => else - (Tags(tags), Else) => { - for Spanned { span, node: tag } in tags.into_iter() { - ext_err_if!(!seen_tags.insert(tag.clone()), span, "duplicate tag"); - ext_err_if!(tag.name.is_none(), rhs.span, - "'else' may not appear with a wildcard tag"); - match wild_excluded.entry(tag.kind) { - Occupied(e) => { e.into_mut().push(tag.clone()); } - Vacant(e) => { e.insert(vec![tag.clone()]); } - } - } - } - - // <_> => expression - // ... 
=> expression - (Tags(tags), Expr(expr)) => { - // Is this arm a tag wildcard? - // `None` if we haven't processed the first tag yet. - let mut wildcard = None; - for Spanned { span, node: tag } in tags.into_iter() { - ext_err_if!(!seen_tags.insert(tag.clone()), span, "duplicate tag"); - - match tag.name { - // - Some(_) => { - ext_err_if!(!wildcards.is_empty(), lhs.span, - "specific tags may not appear after wildcard tags"); - - ext_err_if!(wildcard == Some(true), span, - "wildcard tags must appear alone"); - - if wildcard.is_some() { - // Push the delimeter `|` if it's not the first tag. - push_all(&mut arm_code, quote_tokens!(&mut *cx, |)); - } - push_all(&mut arm_code, make_tag_pattern(cx, binding.clone(), tag)); - - wildcard = Some(false); - } - - // <_> - None => { - ext_err_if!(wildcard.is_some(), span, - "wildcard tags must appear alone"); - wildcard = Some(true); - wildcards.push(WildcardArm { - binding: binding.clone(), - kind: tag.kind, - expr: expr.clone(), - }); - } - } - } - - match wildcard { - None => ext_err!(lhs.span, "[internal macro error] tag arm with no tags"), - Some(false) => { - push_all(&mut arm_code, quote_tokens!(&mut *cx, => $expr,)); - } - Some(true) => () // codegen for wildcards is deferred - } - } - } - } - - // Time to process the last, catch-all arm. We will generate something like - // - // last_arm_token => { - // let enable_wildcards = match last_arm_token { - // TagToken(Tag { kind: EndTag, name: atom!("body"), .. }) => false, - // TagToken(Tag { kind: EndTag, name: atom!("html"), .. }) => false, - // // ... - // _ => true, - // }; - // - // match (enable_wildcards, last_arm_token) { - // (true, TagToken(name @ Tag { kind: StartTag, .. })) - // => ..., // wildcard action for start tags - // - // (true, TagToken(name @ Tag { kind: EndTag, .. })) - // => ..., // wildcard action for end tags - // - // (_, token) => ... 
// using the pattern from that last arm - // } - // } - - let Arm { binding, lhs, rhs } = last_arm; - let last_arm_token = token::gensym_ident("last_arm_token"); - let enable_wildcards = token::gensym_ident("enable_wildcards"); - - let (last_pat, last_expr) = match (binding, lhs.node, rhs.node) { - (Some(id), _, _) => ext_err!(id.span, "the last arm cannot have an @-binding"), - (None, Tags(_), _) => ext_err!(lhs.span, "the last arm cannot have tag patterns"), - (None, _, Else) => ext_err!(rhs.span, "the last arm cannot use 'else'"), - (None, Pat(p), Expr(e)) => match p.node { - ast::PatKind::Wild | ast::PatKind::Ident(..) => (p, e), - _ => ext_err!(lhs.span, "the last arm must have a wildcard or ident pattern"), - }, - }; - - // We can't actually tell if the last pattern is a variable or a nullary enum - // constructor, but in the latter case rustc will (probably?) give an error - // about non-exhaustive matching on the expanded `match` expression. - - // Code for the `false` arms inside `let enable_wildcards = ...`. - let mut enable_wildcards_code = vec!(); - for (_, tags) in wild_excluded.into_iter() { - for tag in tags.into_iter() { - push_all(&mut enable_wildcards_code, make_tag_pattern(cx, vec!(), tag)); - push_all(&mut enable_wildcards_code, quote_tokens!(&mut *cx, => false,)); - } - } - - // Code for the wildcard actions. - let mut wildcard_code = vec!(); - for WildcardArm { binding, kind, expr } in wildcards.into_iter() { - let pat = make_tag_pattern(cx, binding, Tag { kind: kind, name: None }); - push_all(&mut wildcard_code, quote_tokens!(&mut *cx, - (true, $pat) => $expr, - )); - } - - // Put it all together! 
- Ok(quote_tokens!(&mut *cx, - match $discriminant { - $arm_code - - $last_arm_token => { - let $enable_wildcards = match $last_arm_token { - $enable_wildcards_code - _ => true, - }; - - match ($enable_wildcards, $last_arm_token) { - $wildcard_code - (_, $last_pat) => $last_expr, - } - }, - } - )) -} diff --git a/macros/src/pre_expand.rs b/macros/src/pre_expand.rs deleted file mode 100644 index 873fc4ef..00000000 --- a/macros/src/pre_expand.rs +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright 2015 The html5ever Project Developers. See the -// COPYRIGHT file at the top-level directory of this distribution. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -use match_token; -use std::fs::File; -use std::hash::{Hash, Hasher}; -use std::io::{Read, Write}; -use std::path::Path; -use std::rc::Rc; -use syntax::{codemap, ext, parse, print}; -use syntax::ext::base::DummyResolver; -use syntax::parse::token; -use syntax::tokenstream::{Delimited, TokenTree}; - -pub fn pre_expand(from: &Path, to: &Path) { - let mut source = String::new(); - let mut file_from = File::open(from).unwrap(); - file_from.read_to_string(&mut source).unwrap(); - - let mut file_to = File::create(to).unwrap(); - write_header(&from, &source, &mut file_to); - - let sess = parse::ParseSess::new(); - let mut resolver = DummyResolver; - let mut cx = ext::base::ExtCtxt::new(&sess, vec![], - ext::expand::ExpansionConfig::default("".to_owned()), - &mut resolver); - - let from = from.to_string_lossy().into_owned(); - let tts = panictry!(parse::parse_tts_from_source_str(from, source, vec![], &sess)); - let tts = find_and_expand_match_token(&mut cx, tts); - let tts = pretty(&mut cx, tts); - - let expanded = print::pprust::tts_to_string(&tts); - file_to.write_all(expanded.as_bytes()).unwrap(); -} - -fn find_and_expand_match_token(cx: &mut ext::base::ExtCtxt, tts: Vec) - -> Vec { 
- let mut expanded = Vec::new(); - let mut tts = tts.into_iter().peekable(); - while let Some(tt) = tts.next() { - match tt { - TokenTree::Token(span, token::Token::Ident(ident)) - if ident.name.as_str() == "match_token" - => { - // `!` - if !matches!(tts.next(), Some(TokenTree::Token(_, token::Token::Not))) { - expanded.push(tt); - continue - } - match tts.next() { - Some(TokenTree::Delimited(_, block)) => { - cx.bt_push(expn_info(span)); - expanded.extend( - match match_token::expand_to_tokens(cx, span, &block.tts) { - Ok(tts) => tts, - Err((span, message)) => { - cx.parse_sess.span_diagnostic.span_err(span, message); - panic!("Error in match_token! expansion."); - } - }); - cx.bt_pop(); - } - _ => panic!("expected a block after {:?}", span) - } - } - TokenTree::Delimited(span, mut block) => { - Rc::make_mut(&mut block); - let block = Rc::try_unwrap(block).unwrap(); - expanded.push(TokenTree::Delimited(span, Rc::new(Delimited { - delim: block.delim, - open_span: block.open_span, - tts: find_and_expand_match_token(cx, block.tts), - close_span: block.close_span, - }))) - } - _ => expanded.push(tt) - } - } - expanded -} - -fn expn_info(span: codemap::Span) -> codemap::ExpnInfo { - codemap::ExpnInfo { - call_site: span, - callee: codemap::NameAndSpan { - format: codemap::ExpnFormat::MacroBang(token::intern("match_token")), - allow_internal_unstable: false, - span: None, - } - } -} - -/// Somehow, going through a parser and back to tokens gives nicer whitespace. 
-fn pretty(cx: &mut ext::base::ExtCtxt, tts: Vec) -> Vec { - let mut parser = parse::new_parser_from_tts(cx.parse_sess(), cx.cfg(), tts); - let start_span = parser.span; - let mut items = Vec::new(); - let attrs = parser.parse_inner_attributes().unwrap(); - while let Ok(Some(item)) = parser.parse_item() { - items.push(item) - } - cx.bt_push(expn_info(start_span)); - quote_tokens!(&mut *cx, $attrs $items) -} - -#[allow(deprecated)] -fn write_header(source_file_name: &Path, source: &str, file: &mut File) { - use std::hash::SipHasher; - - let mut hasher = SipHasher::new(); - source.hash(&mut hasher); - let source_hash = hasher.finish(); - - for header_line in source.lines().take_while(|line| line.starts_with("//")) { - writeln!(file, "{}", header_line).unwrap(); - } - writeln!(file, r" -// This file is generated from {} -// source SipHash: {} -", - source_file_name.file_name().unwrap().to_string_lossy(), source_hash).unwrap(); -} diff --git a/src/tree_builder/mod.rs b/src/tree_builder/mod.rs index 5bbc1027..8cf017ed 100644 --- a/src/tree_builder/mod.rs +++ b/src/tree_builder/mod.rs @@ -39,7 +39,11 @@ pub mod interface; mod data; mod types; mod actions; -#[path = "rules.expanded.rs"] mod rules; +mod rules { + //! The tree builder rules, as a single, enormous nested match expression. + + include!(concat!(env!("OUT_DIR"), "/rules.rs")); +} /// Tree builder options, with an impl for Default. #[derive(Copy, Clone)] diff --git a/src/tree_builder/rules.rs b/src/tree_builder/rules.rs index 2c79fda2..796d1aa0 100644 --- a/src/tree_builder/rules.rs +++ b/src/tree_builder/rules.rs @@ -7,7 +7,7 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -//! The tree builder rules, as a single, enormous nested match expression. +// The tree builder rules, as a single, enormous nested match expression. 
use tree_builder::types::*; use tree_builder::tag_sets::*; @@ -748,7 +748,7 @@ impl TreeBuilderStep Done } - // FIXME: This should be unreachable, but match_token! requires a + // FIXME: This should be unreachable, but match_token requires a // catch-all case. _ => panic!("impossible case in InBody mode"), }), @@ -786,7 +786,7 @@ impl TreeBuilderStep //§ parsing-main-intable InTable => match_token!(token { - // FIXME: hack, should implement pat | pat for match_token!() instead + // FIXME: hack, should implement pat | pat for match_token instead NullCharacterToken => self.process_chars_in_table(token), CharacterTokens(..) => self.process_chars_in_table(token), @@ -1465,7 +1465,7 @@ impl TreeBuilderStep } } - // FIXME: This should be unreachable, but match_token! requires a + // FIXME: This should be unreachable, but match_token requires a // catch-all case. _ => panic!("impossible case in foreign content"), }) From dd9f2ccc5e38c29f4ecc104d0f9e1c23e261f5a0 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Mon, 17 Oct 2016 21:59:15 +0200 Subject: [PATCH 2/2] Use an actual Rust parser for the match_token! macro. 
--- Cargo.toml | 3 +- build.rs | 2 + macros/match_token.rs | 387 ++++++++++++++++++-------------------- macros/visit.rs | 260 +++++++++++++++++++++++++ src/tree_builder/mod.rs | 6 + src/tree_builder/rules.rs | 6 +- 6 files changed, 458 insertions(+), 206 deletions(-) create mode 100644 macros/visit.rs diff --git a/Cargo.toml b/Cargo.toml index d3a0ac78..27c7a539 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,7 +33,6 @@ harness = false [features] unstable = ["tendril/unstable", "string_cache/unstable"] heap_size = ["heapsize", "heapsize_plugin"] -codegen = [] # Now unused, remove at the next breaking change [dependencies] log = "0" @@ -50,7 +49,9 @@ rustc-test = "0.1.3" [build-dependencies] phf_codegen = "0.7.3" +quote = "0.3" rustc-serialize = "0.3.15" +syn = { version = "0.9.1", features = ["full", "visit"]} [profile.dev] debug = false diff --git a/build.rs b/build.rs index 1a6c1643..42e325a6 100644 --- a/build.rs +++ b/build.rs @@ -8,7 +8,9 @@ // except according to those terms. extern crate phf_codegen; +#[macro_use] extern crate quote; extern crate rustc_serialize; +extern crate syn; use rustc_serialize::json::{Json, Decoder}; use rustc_serialize::Decodable; diff --git a/macros/match_token.rs b/macros/match_token.rs index e4962661..0d8e7e91 100644 --- a/macros/match_token.rs +++ b/macros/match_token.rs @@ -99,102 +99,81 @@ matching, by enforcing the following restrictions on its input: is common in the HTML5 syntax. 
*/ -use std::collections::{HashMap, HashSet}; -use std::fmt::Write as FmtWrite; +use quote::{ToTokens, Tokens}; +use self::visit::{Visitor, RecursiveVisitor}; +use std::collections::HashSet; use std::fs::File; use std::io::{Read, Write}; +use std::mem; use std::path::Path; +use std::slice; +use syn; -#[derive(Clone)] -struct Source<'a> { - src: &'a str, -} - -impl<'a> Source<'a> { - fn consume(&mut self, n: usize) -> &'a str { - let (before, after) = self.src.split_at(n); - self.src = after; - before - } - - fn find(&mut self, s: &str) -> Option<&'a str> { - self.src.find(s).map(|position| { - let before = self.consume(position); - self.consume(s.len()); - before - }) - } - - fn consume_if_present(&mut self, s: &str) -> bool { - let present = self.src.starts_with(s); - if present { - self.consume(s.len()); - } - present - } - - fn expect(&mut self, s: &str) { - assert!(self.consume_if_present(s), "{:?}… does not start with {:?}", &self.src[..50], s); - } +mod visit; - /// Not exactly Rust whitespace, but close enough - fn consume_whitespace(&mut self) { - while self.src.starts_with(&[' ', '\t', '\n', '\r'][..]) { - self.consume(1); - } - } +pub fn expand_match_tokens(from: &Path, to: &Path) { + let mut source = String::new(); + File::open(from).unwrap().read_to_string(&mut source).unwrap(); + let mut crate_ = syn::parse_crate(&source).expect("Parsing rules.rs module"); + RecursiveVisitor { node_visitor: ExpanderVisitor }.visit_crate(&mut crate_); + let mut tokens = Tokens::new(); + crate_.to_tokens(&mut tokens); + let code = tokens.to_string().replace("{ ", "{\n").replace(" }", "\n}"); + File::create(to).unwrap().write_all(code.as_bytes()).unwrap(); +} - /// Not exactly the syntax of a Rust identifier, but close enough - fn consume_ident(&mut self) -> Option<&'a str> { - let end = self.src.find(|c: char| !c.is_alphanumeric() && c != '_').unwrap_or(self.src.len()); - if end > 0 { - Some(self.consume(end)) - } else { - None - } - } +struct ExpanderVisitor; - fn 
find_top_level(&mut self, start_at: usize, delimeter: u8) -> usize { - let mut i = start_at; - let bytes = self.src.as_bytes(); - loop { - let b = *bytes.get(i).expect("unbalanced brackets"); - i += 1; - if b == delimeter { - return i - } - match b { - b'{' => i = self.find_top_level(i, b'}'), - b'[' => i = self.find_top_level(i, b']'), - b'(' => i = self.find_top_level(i, b')'), - _ => {} +impl Visitor for ExpanderVisitor { + fn visit_expression(&mut self, expr: &mut syn::Expr) { + let tts; + if let syn::Expr::Mac(ref mut macro_) = *expr { + if macro_.path == syn::Path::from("match_token") { + tts = mem::replace(&mut macro_.tts, Vec::new()); + } else { + return } + } else { + return } + let (to_be_matched, arms) = parse_match_token_macro(tts); + let tokens = expand_match_token_macro(to_be_matched, arms); + *expr = syn::parse_expr(&tokens.to_string()).expect("Parsing a match expression"); } } -pub fn expand_match_tokens(from: &Path, to: &Path) { - let mut source = String::new(); - File::open(from).unwrap().read_to_string(&mut source).unwrap(); +fn parse_match_token_macro(tts: Vec) -> (syn::Ident, Vec) { + use syn::TokenTree::Delimited; + use syn::DelimToken::{Brace, Paren}; - let mut source = Source { src: &*source }; + let mut tts = tts.into_iter(); + let inner_tts = if let Some(Delimited(syn::Delimited { delim: Paren, tts })) = tts.next() { + tts + } else { + panic!("expected one top-level () block") + }; + assert_eq!(tts.len(), 0); - let mut file = File::create(to).unwrap(); - let mut write = |s: &str| file.write_all(s.as_bytes()).unwrap(); - while let Some(before) = source.find("match_token!") { - write(before); - source.expect("(token {"); - let mut arms = Vec::new(); - loop { - source.consume_whitespace(); - if source.consume_if_present("})") { - break - } - arms.push(parse_arm(&mut source)); - } - write_match_token(arms, &mut write); + let mut tts = inner_tts.into_iter(); + let ident = if let Some(syn::TokenTree::Token(syn::Token::Ident(ident))) = tts.next() 
{ + ident + } else { + panic!("expected ident") + }; + + let block = if let Some(Delimited(syn::Delimited { delim: Brace, tts })) = tts.next() { + tts + } else { + panic!("expected one {} block") + }; + assert_eq!(tts.len(), 0); + + let mut tts = block.iter(); + let mut arms = Vec::new(); + while tts.len() > 0 { + arms.push(parse_arm(&mut tts)) } - write(source.src); + (ident, arms) } #[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)] @@ -209,13 +188,13 @@ enum TagKind { #[derive(PartialEq, Eq, Hash, Clone, Debug)] struct Tag { kind: TagKind, - name: Option, + name: Option, } /// Left-hand side of a pattern-match arm. #[derive(Debug)] enum LHS { - Pattern(String), + Pattern(Tokens), Tags(Vec), } @@ -223,111 +202,115 @@ enum LHS { #[derive(Debug)] enum RHS { Else, - Expression(String), + Expression(Tokens), } /// A whole arm, including optional outer `name @` binding. #[derive(Debug)] struct Arm { - binding: Option, + binding: Option, lhs: LHS, rhs: RHS, } -fn parse_arm(source: &mut Source) -> Arm { - loop { - source.consume_whitespace(); - if source.consume_if_present("//") { - source.find("\n"); - } else { - break - } +fn parse_arm(tts: &mut slice::Iter) -> Arm { + Arm { + binding: parse_binding(tts), + lhs: parse_lhs(tts), + rhs: parse_rhs(tts), } - let start = source.clone(); - let mut binding = None; - if let Some(ident) = source.consume_ident() { - source.consume_whitespace(); - if source.consume_if_present("@") { - binding = Some(ident.to_owned()) - } else { - *source = start - } +} + +fn parse_binding(tts: &mut slice::Iter) -> Option { + let start = tts.clone(); + if let (Some(&syn::TokenTree::Token(syn::Token::Ident(ref ident))), + Some(&syn::TokenTree::Token(syn::Token::At))) = (tts.next(), tts.next()) { + Some(ident.clone()) + } else { + *tts = start; + None } +} - Arm { - binding: binding, - lhs: parse_lhs(source), - rhs: parse_rhs(source), +fn consume_if_present(tts: &mut slice::Iter, expected: syn::Token) -> bool { + if let 
Some(&syn::TokenTree::Token(ref first)) = tts.as_slice().first() { + if *first == expected { + tts.next(); + return true + } } + false } -fn parse_lhs(source: &mut Source) -> LHS { - source.consume_whitespace(); - if source.consume_if_present("<") { +fn parse_lhs(tts: &mut slice::Iter) -> LHS { + if consume_if_present(tts, syn::Token::Lt) { let mut tags = Vec::new(); loop { tags.push(Tag { - kind: if source.consume_if_present("/") { + kind: if consume_if_present(tts, syn::Token::BinOp(syn::BinOpToken::Slash)) { TagKind::EndTag } else { TagKind::StartTag }, - name: if source.consume_if_present("_") { + name: if consume_if_present(tts, syn::Token::Underscore) { None } else { - Some(source.consume_ident().expect("expected identifier (tag name)").to_owned()) + if let Some(&syn::TokenTree::Token(syn::Token::Ident(ref ident))) = tts.next() { + Some(ident.clone()) + } else { + panic!("expected identifier (tag name)") + } } }); - assert!(source.consume_if_present(">"), "expected '>' closing a tag pattern"); - source.consume_whitespace(); - if !source.consume_if_present("<") { + assert!(consume_if_present(tts, syn::Token::Gt), "expected '>' closing a tag pattern"); + if !consume_if_present(tts, syn::Token::Lt) { break } } - source.consume_whitespace(); - assert!(source.consume_if_present("=>")); + assert!(consume_if_present(tts, syn::Token::FatArrow)); LHS::Tags(tags) } else { - LHS::Pattern(source.find("=>").expect("did not find =>").to_owned()) + let mut pattern = Tokens::new(); + for tt in tts { + if let &syn::TokenTree::Token(syn::Token::FatArrow) = tt { + return LHS::Pattern(pattern) + } + tt.to_tokens(&mut pattern) + } + panic!("did not find =>") } } -fn parse_rhs(source: &mut Source) -> RHS { - source.consume_whitespace(); - let start_at; - let delimeter; - if source.consume_if_present("else,") { - return RHS::Else - } else if source.src.starts_with("{") { - start_at = 1; - delimeter = b'}'; - } else { - start_at = 0; - delimeter = b','; +fn parse_rhs(tts: &mut 
slice::Iter) -> RHS { + use syn::DelimToken::Brace; + let start = tts.clone(); + let first = tts.next(); + let after_first = tts.clone(); + let second = tts.next(); + if let (Some(&syn::TokenTree::Token(syn::Token::Ident(ref ident))), + Some(&syn::TokenTree::Token(syn::Token::Comma))) = (first, second) { + if ident == "else" { + return RHS::Else + } } - let end = source.find_top_level(start_at, delimeter); - let expr = source.consume(end); - if delimeter == b'}' { - source.consume_whitespace(); - source.consume_if_present(","); + let mut expression = Tokens::new(); + if let Some(&syn::TokenTree::Delimited(syn::Delimited { delim: Brace, .. })) = first { + first.to_tokens(&mut expression); + *tts = after_first; + consume_if_present(tts, syn::Token::Comma); + } else { + *tts = start; + for tt in tts { + tt.to_tokens(&mut expression); + if let &syn::TokenTree::Token(syn::Token::Comma) = tt { + break + } + } } - RHS::Expression(expr.to_owned()) + RHS::Expression(expression) } -/// Description of a wildcard match arm. -/// -/// We defer generating code for these until we process the last, catch-all -/// arm. This isn't part of the AST produced by `parse()`; it's created -/// while processing that AST. -struct WildcardArm { - binding: String, - kind: TagKind, - expr: String, -} - -fn write_match_token(mut arms: Vec, write: &mut F) where F: FnMut(&str) { - write("match token {\n"); - +fn expand_match_token_macro(to_be_matched: syn::Ident, mut arms: Vec) -> Tokens { // Handle the last arm specially at the end. let last_arm = arms.pop().unwrap(); @@ -336,16 +319,19 @@ fn write_match_token(mut arms: Vec, write: &mut F) where F: FnMut(&str) // Case arms for wildcard matching. We collect these and // emit them later. - let mut wildcards: Vec = Vec::new(); + let mut wildcards_patterns: Vec = Vec::new(); + let mut wildcards_expressions: Vec = Vec::new(); // Tags excluded (by an 'else' RHS) from wildcard matching. 
- let mut wild_excluded: HashMap> = HashMap::new(); + let mut wild_excluded_patterns: Vec = Vec::new(); + + let mut arms_code = Vec::new(); for Arm { binding, lhs, rhs } in arms { // Build Rust syntax for the `name @` binding, if any. let binding = match binding { - Some(ident) => format!("{} @ ", ident), - None => String::new(), + Some(ident) => quote!(#ident @), + None => quote!(), }; match (lhs, rhs) { @@ -353,10 +339,10 @@ fn write_match_token(mut arms: Vec, write: &mut F) where F: FnMut(&str) // ordinary pattern => expression (LHS::Pattern(pat), RHS::Expression(expr)) => { - if !wildcards.is_empty() { + if !wildcards_patterns.is_empty() { panic!("ordinary patterns may not appear after wildcard tags {:?} {:?}", pat, expr); } - write(&format!(" {}{} => {}\n", binding, pat, expr)); + arms_code.push(quote!(#binding #pat => #expr)) } // ... => else @@ -368,7 +354,7 @@ fn write_match_token(mut arms: Vec, write: &mut F) where F: FnMut(&str) if tag.name.is_none() { panic!("'else' may not appear with a wildcard tag"); } - wild_excluded.entry(tag.kind).or_insert_with(Vec::new).push(tag.clone()); + wild_excluded_patterns.push(make_tag_pattern(&Tokens::new(), tag)); } } @@ -386,7 +372,7 @@ fn write_match_token(mut arms: Vec, write: &mut F) where F: FnMut(&str) match tag.name { // Some(_) => { - if !wildcards.is_empty() { + if !wildcards_patterns.is_empty() { panic!("specific tags may not appear after wildcard tags"); } @@ -396,11 +382,9 @@ fn write_match_token(mut arms: Vec, write: &mut F) where F: FnMut(&str) if wildcard.is_some() { // Push the delimeter `|` if it's not the first tag. 
- write(" |\n "); - } else { - write(" "); + arms_code.push(quote!( | )) } - write(&make_tag_pattern(&binding, tag)); + arms_code.push(make_tag_pattern(&binding, tag)); wildcard = Some(false); } @@ -411,22 +395,15 @@ fn write_match_token(mut arms: Vec, write: &mut F) where F: FnMut(&str) panic!("wildcard tags must appear alone"); } wildcard = Some(true); - wildcards.push(WildcardArm { - binding: binding.clone(), - kind: tag.kind, - expr: expr.clone(), - }); + wildcards_patterns.push(make_tag_pattern(&binding, tag)); + wildcards_expressions.push(expr.clone()); } } } match wildcard { None => panic!("[internal macro error] tag arm with no tags"), - Some(false) => { - write(" =>\n "); - write(&expr); - write("\n"); - } + Some(false) => arms_code.push(quote!( => #expr)), Some(true) => {} // codegen for wildcards is deferred } } @@ -463,40 +440,44 @@ fn write_match_token(mut arms: Vec, write: &mut F) where F: FnMut(&str) (None, LHS::Pattern(p), RHS::Expression(e)) => (p, e) }; - write(" last_arm_token => {\n"); - write(" let enable_wildcards = match last_arm_token {\n"); - - // Code for the `false` arms inside `let enable_wildcards = ...`. - for (_, tags) in wild_excluded { - for tag in tags { - write(&format!(" {} => false,\n", make_tag_pattern("", tag))); + quote! { + // Use a no-op macro to work around a bug(?) in syn::parse_expr. + as_expr! { + match #to_be_matched { + #( + #arms_code + )* + last_arm_token => { + let enable_wildcards = match last_arm_token { + #( + #wild_excluded_patterns => false, + )* + _ => true, + }; + match (enable_wildcards, last_arm_token) { + #( + (true, #wildcards_patterns) => #wildcards_expressions + )* + (_, #last_pat) => #last_expr + } + } + } } } - - write(" _ => true,\n"); - write(" };\n"); - write(" match (enable_wildcards, last_arm_token) {\n"); - - // Code for the wildcard actions. 
- for WildcardArm { binding, kind, expr } in wildcards { - let pat = make_tag_pattern(&binding, Tag { kind: kind, name: None }); - write(&format!(" (true, {}) =>\n", pat)); - write(&format!(" {}\n", expr)); - } - - write(&format!(" (_, {}) => {}\n", last_pat, last_expr)); - write(" }\n"); - write(" }\n"); - write("}\n"); } -fn make_tag_pattern(binding: &str, tag: Tag) -> String { - let mut s = format!( - "::tree_builder::types::TagToken({}::tokenizer::Tag {{ kind: {:?}, ", - binding, tag.kind); - if let Some(name) = tag.name { - write!(s, "name: atom!({:?}), ", name).unwrap(); +fn make_tag_pattern(binding: &Tokens, tag: Tag) -> Tokens { + let kind = match tag.kind { + TagKind::StartTag => quote!(::tokenizer::StartTag), + TagKind::EndTag => quote!(::tokenizer::EndTag), + }; + let name_field = if let Some(name) = tag.name { + let name = name.to_string(); + quote!(name: atom!(#name),) + } else { + quote!() + }; + quote! { + ::tree_builder::types::TagToken(#binding ::tokenizer::Tag { kind: #kind, #name_field .. }) } - s.push_str(".. })"); - s } diff --git a/macros/visit.rs b/macros/visit.rs new file mode 100644 index 00000000..2b77f4dd --- /dev/null +++ b/macros/visit.rs @@ -0,0 +1,260 @@ +// Copyright 2016 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/// Just enough of an AST visitor to reach every expression. 
+ +use syn; + +pub trait Visitor { + fn visit_crate(&mut self, _crate: &mut syn::Crate) {} + fn visit_item(&mut self, _item: &mut syn::Item) {} + fn visit_trait_item(&mut self, _item: &mut syn::TraitItem) {} + fn visit_impl_item(&mut self, _item: &mut syn::ImplItem) {} + fn visit_block(&mut self, _expr: &mut syn::Block) {} + fn visit_statement(&mut self, _expr: &mut syn::Stmt) {} + fn visit_expression(&mut self, _expr: &mut syn::Expr) {} +} + +pub struct RecursiveVisitor { + pub node_visitor: V +} + +impl Visitor for RecursiveVisitor { + fn visit_crate(&mut self, crate_: &mut syn::Crate) { + self.node_visitor.visit_crate(crate_); + for item in &mut crate_.items { + self.visit_item(item) + } + } + + fn visit_item(&mut self, item: &mut syn::Item) { + use syn::ItemKind::*; + self.node_visitor.visit_item(item); + match item.node { + ExternCrate(_) => {} + Use(_) => {} + Static(_, _, ref mut expr) => self.visit_expression(expr), + Const(_, ref mut expr) => self.visit_expression(expr), + Fn(_, _, _, _, _, ref mut block) => self.visit_block(block), + Mod(ref mut items) => { + for item in items { + self.visit_item(item) + } + } + ForeignMod(_) => {} + Ty(_, _) => {} + Enum(_, _) => {} + Struct(_, _) => {} + Union(_, _) => {} + Trait(_, _, _, ref mut trait_items) => { + for trait_item in trait_items { + self.visit_trait_item(trait_item) + } + } + DefaultImpl(_, _) => {} + Impl(_, _, _, _, _, ref mut impl_items) => { + for impl_item in impl_items { + self.visit_impl_item(impl_item) + } + } + Mac(_) => {} + } + } + + fn visit_trait_item(&mut self, trait_item: &mut syn::TraitItem) { + use syn::TraitItemKind::*; + self.node_visitor.visit_trait_item(trait_item); + match trait_item.node { + Const(_, Some(ref mut expr)) => self.visit_expression(expr), + Const(_, None) => {} + Method(_, Some(ref mut block)) => self.visit_block(block), + Method(_, None) => {} + Type(_, _) => {} + Macro(_) => {} + } + } + + fn visit_impl_item(&mut self, impl_item: &mut syn::ImplItem) { + use 
syn::ImplItemKind::*; + self.node_visitor.visit_impl_item(impl_item); + match impl_item.node { + Const(_, ref mut expr) => self.visit_expression(expr), + Method(_, ref mut block) => self.visit_block(block), + Type(_) => {} + Macro(_) => {} + } + } + + fn visit_block(&mut self, block: &mut syn::Block) { + self.node_visitor.visit_block(block); + for statement in &mut block.stmts { + self.visit_statement(statement) + } + } + + fn visit_statement(&mut self, statement: &mut syn::Stmt) { + use syn::Stmt::*; + self.node_visitor.visit_statement(statement); + match *statement { + Local(ref mut local) => { + if let Some(ref mut expr) = local.init { + self.visit_expression(expr) + } + } + Item(ref mut item) => self.visit_item(item), + Expr(ref mut expr) => self.visit_expression(expr), + Semi(ref mut expr) => self.visit_expression(expr), + Mac(_) => {} + } + } + + fn visit_expression(&mut self, expr: &mut syn::Expr) { + use syn::Expr::*; + self.node_visitor.visit_expression(expr); + match *expr { + Box(ref mut boxed) => { + self.visit_expression(boxed) + } + Vec(ref mut elements) => { + for element in elements { + self.visit_expression(element) + } + } + Call(ref mut called, ref mut args) => { + self.visit_expression(called); + for arg in args { + self.visit_expression(arg) + } + } + MethodCall(_, _, ref mut args) => { + for arg in args { + self.visit_expression(arg) + } + } + Tup(ref mut elements) => { + for element in elements { + self.visit_expression(element) + } + } + Binary(_, ref mut left, ref mut right) => { + self.visit_expression(left); + self.visit_expression(right); + } + Unary(_, ref mut operand) => { + self.visit_expression(operand) + } + Lit(_) => {} + Cast(ref mut expr, _) => { + self.visit_expression(expr) + } + Type(ref mut expr, _) => { + self.visit_expression(expr) + } + If(ref mut test, ref mut then, ref mut else_) => { + self.visit_expression(test); + self.visit_block(then); + if let Some(ref mut else_) = *else_ { + self.visit_expression(else_); + } + } + 
IfLet(_, ref mut test, ref mut then, ref mut else_) => { + self.visit_expression(test); + self.visit_block(then); + if let Some(ref mut else_) = *else_ { + self.visit_expression(else_); + } + } + While(ref mut test, ref mut block, _) => { + self.visit_expression(test); + self.visit_block(block); + } + WhileLet(_, ref mut test, ref mut block, _) => { + self.visit_expression(test); + self.visit_block(block); + } + ForLoop(_, ref mut iterable, ref mut block, _) => { + self.visit_expression(iterable); + self.visit_block(block); + } + Loop(ref mut block, _) => { + self.visit_block(block); + } + Match(ref mut matched, ref mut arms) => { + self.visit_expression(matched); + for arm in arms { + if let Some(ref mut guard) = arm.guard { + self.visit_expression(guard) + } + self.visit_expression(&mut arm.body) + } + } + Closure(_, _, ref mut block) => { + self.visit_block(block) + } + Block(_, ref mut block) => { + self.visit_block(block) + } + Assign(ref mut left, ref mut right) => { + self.visit_expression(left); + self.visit_expression(right); + } + AssignOp(_, ref mut left, ref mut right) => { + self.visit_expression(left); + self.visit_expression(right); + } + Field(ref mut base, _) => { + self.visit_expression(base) + } + TupField(ref mut base, _) => { + self.visit_expression(base) + } + Index(ref mut base, ref mut index) => { + self.visit_expression(base); + self.visit_expression(index); + } + Range(ref mut start, ref mut end, _) => { + if let Some(ref mut start) = *start { + self.visit_expression(start) + } + if let Some(ref mut end) = *end { + self.visit_expression(end) + } + } + Path(_, _) => {} + AddrOf(_, ref mut base) => { + self.visit_expression(base) + } + Break(_) => {} + Continue(_) => {} + Ret(Some(ref mut expr)) => { + self.visit_expression(expr) + } + Ret(None) => {} + Mac(_) => {} + Struct(_, ref mut fields, ref mut base) => { + for field in fields { + self.visit_expression(&mut field.expr) + } + if let Some(ref mut base) = *base { + 
self.visit_expression(base) + } + } + Repeat(ref mut element, ref mut number) => { + self.visit_expression(element); + self.visit_expression(number); + } + Paren(ref mut expr) => { + self.visit_expression(expr) + } + Try(ref mut expr) => { + self.visit_expression(expr) + } + } + } +} diff --git a/src/tree_builder/mod.rs b/src/tree_builder/mod.rs index 8cf017ed..2df4ce90 100644 --- a/src/tree_builder/mod.rs +++ b/src/tree_builder/mod.rs @@ -39,6 +39,12 @@ pub mod interface; mod data; mod types; mod actions; + +/// This macro is used in macros/match_token.rs to work around a bug(?) in syn::parse_expr. +macro_rules! as_expr { + ($e: expr) => ($e) +} + mod rules { //! The tree builder rules, as a single, enormous nested match expression. diff --git a/src/tree_builder/rules.rs b/src/tree_builder/rules.rs index 796d1aa0..5826ad51 100644 --- a/src/tree_builder/rules.rs +++ b/src/tree_builder/rules.rs @@ -1400,7 +1400,9 @@ impl TreeBuilderStep } fn step_foreign(&mut self, token: Token) -> ProcessResult { - match_token!(token { + // Use parens to make this an expression rather than a statement + // and work around a limitation of the build.rs script. + (match_token!(token { NullCharacterToken => { self.unexpected(&token); self.append_text("\u{fffd}".to_tendril()) @@ -1468,6 +1470,6 @@ impl TreeBuilderStep // FIXME: This should be unreachable, but match_token requires a // catch-all case. _ => panic!("impossible case in foreign content"), - }) + })) } }